From 9ae765a197d411a9016134cda0217a4a512aaabf Mon Sep 17 00:00:00 2001 From: sghoshcvc Date: Thu, 22 Jun 2017 18:31:12 +0200 Subject: [PATCH] Text detector class and Custom Image processor Class --- modules/text/CMakeLists.txt | 85 +- modules/text/FindCaffe.cmake | 14 + modules/text/FindGlog.cmake | 10 + modules/text/FindProtobuf.cmake | 10 + modules/text/FindTesseract.cmake | 24 + modules/text/README.md | 72 ++ modules/text/include/opencv2/text.hpp | 3 +- modules/text/include/opencv2/text/ocr.hpp | 849 +++++++++++++---- .../include/opencv2/text/textDetector.hpp | 235 +++++ modules/text/src/ocr_holistic.cpp | 879 ++++++++++++++++++ modules/text/src/text_detector.cpp | 643 +++++++++++++ modules/text/text_config.hpp.in | 10 +- 12 files changed, 2632 insertions(+), 202 deletions(-) create mode 100644 modules/text/FindCaffe.cmake create mode 100755 modules/text/FindGlog.cmake create mode 100644 modules/text/FindProtobuf.cmake create mode 100644 modules/text/FindTesseract.cmake create mode 100644 modules/text/include/opencv2/text/textDetector.hpp create mode 100644 modules/text/src/ocr_holistic.cpp create mode 100644 modules/text/src/text_detector.cpp diff --git a/modules/text/CMakeLists.txt b/modules/text/CMakeLists.txt index 7ec4d2464..52bd828d9 100644 --- a/modules/text/CMakeLists.txt +++ b/modules/text/CMakeLists.txt @@ -1,24 +1,71 @@ set(the_description "Text Detection and Recognition") -ocv_define_module(text opencv_ml opencv_imgproc opencv_core opencv_features2d OPTIONAL opencv_highgui WRAP python) - -if(NOT CMAKE_CROSSCOMPILING OR OPENCV_FIND_TESSERACT) - set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/cmake) - find_package(Tesseract QUIET) - if(Tesseract_FOUND) - message(STATUS "Tesseract: YES") - set(HAVE_TESSERACT 1) - ocv_include_directories(${Tesseract_INCLUDE_DIR}) - ocv_target_link_libraries(${the_module} ${Tesseract_LIBRARIES}) - else() - message(STATUS "Tesseract: NO") - endif() +# Using cmake scripts and modules +list(APPEND 
CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}) + +set(TEXT_DEPS opencv_ml opencv_highgui opencv_imgproc opencv_core opencv_features2d opencv_calib3d) + +find_package(Caffe) +if(Caffe_FOUND) + message(STATUS "Caffe: YES") + set(HAVE_CAFFE 1) +else() + message(STATUS "Caffe: NO") +# list(APPEND TEXT_DEPS opencv_dnn) +endif() + +#internal dependencies +find_package(Protobuf) +if(Protobuf_FOUND) + message(STATUS "Protobuf: YES") + set(HAVE_PROTOBUF 1) +else() + message(STATUS "Protobuf: NO") +endif() + +find_package(Glog) +if(Glog_FOUND) + message(STATUS "Glog: YES") + set(HAVE_GLOG 1) +else() + message(STATUS "Glog: NO") +endif() + +ocv_define_module(text opencv_ml opencv_imgproc opencv_core opencv_features2d opencv_calib3d WRAP python) +#ocv_define_module(text ${TEXT_DEPS} WRAP python) + +#set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}) + +find_package(Tesseract) +if(${Tesseract_FOUND}) + message(STATUS "Tesseract: YES") + include_directories(${Tesseract_INCLUDE_DIR}) + target_link_libraries(opencv_text ${Tesseract_LIBS}) + add_definitions(-DHAVE_TESSERACT) +else() + message(STATUS "Tesseract: NO") endif() -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/text_config.hpp.in - ${CMAKE_BINARY_DIR}/text_config.hpp @ONLY) -ocv_include_directories(${CMAKE_CURRENT_BINARY_DIR}) -ocv_add_testdata(samples/ contrib/text - FILES_MATCHING PATTERN "*.xml" PATTERN "*.xml.gz" REGEX "scenetext[0-9]+.jpg" -) + +if(HAVE_CAFFE AND HAVE_GLOG AND HAVE_PROTOBUF) + include_directories(${Caffe_INCLUDE_DIR}) + find_package(HDF5 COMPONENTS HL REQUIRED) + include_directories(SYSTEM ${HDF5_INCLUDE_DIRS} ${HDF5_HL_INCLUDE_DIR}) + list(APPEND Caffe_LINKER_LIBS ${HDF5_LIBRARIES}) + find_package(Boost 1.46 REQUIRED COMPONENTS system thread filesystem) + include_directories(SYSTEM ${Boost_INCLUDE_DIR}) + include_directories(SYSTEM /usr/local/cuda-8.0/targets/x86_64-linux/include/ usr/local/cuda-8.0/include/ /usr/local/cuda-7.5/targets/x86_64-linux/include/ ) + 
link_directories(SYSTEM /usr/local/cuda-8.0/targets/x86_64-linux/lib/ usr/local/cuda-8.0/lib/ /usr/local/cuda-7.5/targets/x86_64-linux/lib/ /usr/lib/openblas-base/lib /usr/local/cuda-8.0/lib64) + list(APPEND Caffe_LINKER_LIBS ${Boost_LIBRARIES}) + target_link_libraries(opencv_text atlas blas ${Caffe_LIBS} ${Glog_LIBS} ${Protobuf_LIBS} ${HDF5_LIBRARIES} ${Boost_LIBRARIES}) + add_definitions(-DHAVE_CAFFE) +endif() #HAVE_CAFFE + +message(STATUS "TEXT CAFFE SEARCH") +if() + message(STATUS "TEXT NO CAFFE CONFLICT") +else() + message(STATUS "TEXT CAFFE CONFLICT") +endif() + diff --git a/modules/text/FindCaffe.cmake b/modules/text/FindCaffe.cmake new file mode 100644 index 000000000..12948f629 --- /dev/null +++ b/modules/text/FindCaffe.cmake @@ -0,0 +1,14 @@ +# Caffe package for CNN Triplet training +unset(Caffe_FOUND) + +find_path(Caffe_INCLUDE_DIR NAMES caffe/caffe.hpp caffe/common.hpp caffe/net.hpp caffe/proto/caffe.pb.h caffe/util/io.hpp caffe/vision_layers.hpp + HINTS + /usr/local/include) + +find_library(Caffe_LIBS NAMES caffe + HINTS + /usr/local/lib) + +if(Caffe_LIBS AND Caffe_INCLUDE_DIR) + set(Caffe_FOUND 1) +endif() diff --git a/modules/text/FindGlog.cmake b/modules/text/FindGlog.cmake new file mode 100755 index 000000000..c30e9f4a6 --- /dev/null +++ b/modules/text/FindGlog.cmake @@ -0,0 +1,10 @@ +#Required for Caffe +unset(Glog_FOUND) + +find_library(Glog_LIBS NAMES glog + HINTS + /usr/local/lib) + +if(Glog_LIBS) + set(Glog_FOUND 1) +endif() diff --git a/modules/text/FindProtobuf.cmake b/modules/text/FindProtobuf.cmake new file mode 100644 index 000000000..6d0ad56a1 --- /dev/null +++ b/modules/text/FindProtobuf.cmake @@ -0,0 +1,10 @@ +#Protobuf package required for Caffe +unset(Protobuf_FOUND) + +find_library(Protobuf_LIBS NAMES protobuf + HINTS + /usr/local/lib) + +if(Protobuf_LIBS) + set(Protobuf_FOUND 1) +endif() diff --git a/modules/text/FindTesseract.cmake b/modules/text/FindTesseract.cmake new file mode 100644 index 000000000..54c4a4929 --- /dev/null +++ 
b/modules/text/FindTesseract.cmake @@ -0,0 +1,24 @@ +# Tesseract OCR +unset(Tesseract_FOUND) + +find_path(Tesseract_INCLUDE_DIR tesseract/baseapi.h + HINTS + /usr/include + /usr/local/include) + +find_library(Tesseract_LIBRARY NAMES tesseract + HINTS + /usr/lib + /usr/local/lib) + +find_library(Lept_LIBRARY NAMES lept + HINTS + /usr/lib + /usr/local/lib) + +set(Tesseract_LIBS ${Tesseract_LIBRARY} ${Lept_LIBRARY}) +if(Tesseract_LIBS AND Tesseract_INCLUDE_DIR) + set(Tesseract_FOUND 1) +endif() + + diff --git a/modules/text/README.md b/modules/text/README.md index bbbad11a1..3a3a897f7 100644 --- a/modules/text/README.md +++ b/modules/text/README.md @@ -47,3 +47,75 @@ Notes 2. Tesseract configure script may fail to detect leptonica, so you may have to edit the configure script - comment off some if's around this message and retain only "then" branch. 3. You are encouraged to search the Net for some better pre-trained classifiers, as well as classifiers for other languages. + + +Word spotting CNN +================= + +Intro +----- + +A word spotting CNN is a CNN that takes an image assumed to contain a single word and provides a probability over a given vocabulary. +Although other backends will be supported, for the moment only the Caffe backend is supported. + + + + +Installation of Caffe backend +---------------------------- +The Caffe wrapping backend has the same requirements as Caffe itself. +* Caffe can be built against OpenCV; if the Caffe backend is enabled, a circular dependency arises. +The simplest solution is to build Caffe without support for OpenCV. +* Only the operating systems supported by Caffe are supported by the backend. +The scripts describing the module have been developed on Ubuntu 16.04 and assume such a system. +Adapting them to other UNIX systems, including OSX, should be easy. 
+ +Sample script for building Caffe + +```bash +#!/bin/bash +SRCROOT="${HOME}/caffe_inst/" +mkdir -p "$SRCROOT" +cd "$SRCROOT" +git clone https://github.com/BVLC/caffe.git +cd caffe +git checkout 91b09280f5233cafc62954c98ce8bc4c204e7475 +git branch 91b09280f5233cafc62954c98ce8bc4c204e7475 +cat Makefile.config.example > Makefile.config +echo 'USE_OPENCV := 0' >> Makefile.config +echo 'INCLUDE_DIRS += /usr/include/hdf5/serial/' >> Makefile.config +echo 'LIBRARY_DIRS += /usr/lib/x86_64-linux-gnu/hdf5/serial/' >> Makefile.config + + +echo "--- /tmp/caffe/include/caffe/net.hpp 2017-05-28 04:55:47.929623902 +0200 ++++ caffe/distribute/include/caffe/net.hpp 2017-05-28 04:51:33.437090768 +0200 +@@ -234,6 +234,7 @@ + + template + friend class Net; ++ virtual ~Callback(){} + }; + const vector& before_forward() const { return before_forward_; } + void add_before_forward(Callback* value) { +">/tmp/cleanup_caffe.diff + +patch < /tmp/cleanup_caffe.diff + + +make -j 6 + +make pycaffe + +make distribute +``` + + +```bash +#!/bin/bash +cd $OPENCV_BUILD_DIR #You must set this +CAFFEROOT="${HOME}/caffe_inst/" #If you used the previous code to compile Caffe in ubuntu 16.04 + +cmake -DCaffe_LIBS:FILEPATH="$CAFFEROOT/caffe/distribute/lib/libcaffe.so" -DBUILD_opencv_ts:BOOL="0" -DBUILD_opencv_dnn:BOOL="0" -DBUILD_opencv_dnn_modern:BOOL="0" -DCaffe_INCLUDE_DIR:PATH="$CAFFEROOT/caffe/distribute/include" -DWITH_MATLAB:BOOL="0" -DBUILD_opencv_cudabgsegm:BOOL="0" -DWITH_QT:BOOL="1" -DBUILD_opencv_cudaoptflow:BOOL="0" -DBUILD_opencv_cudastereo:BOOL="0" -DBUILD_opencv_cudafilters:BOOL="0" -DBUILD_opencv_cudev:BOOL="1" -DOPENCV_EXTRA_MODULES_PATH:PATH="/home/anguelos/work/projects/opencv_gsoc/opencv_contrib/modules" ./ + + +``` diff --git a/modules/text/include/opencv2/text.hpp b/modules/text/include/opencv2/text.hpp index 945194a16..c4c2975b8 100644 --- a/modules/text/include/opencv2/text.hpp +++ b/modules/text/include/opencv2/text.hpp @@ -41,6 +41,7 @@ the use of this software, even if advised 
of the possibility of such damage. #include "opencv2/text/erfilter.hpp" #include "opencv2/text/ocr.hpp" +#include "opencv2/text/textDetector.hpp" /** @defgroup text Scene Text Detection and Recognition @@ -92,7 +93,7 @@ grouping horizontally aligned text, and the method proposed by Lluis Gomez and D in [Gomez13][Gomez14] for grouping arbitrary oriented text (see erGrouping). To see the text detector at work, have a look at the textdetection demo: - + @defgroup text_recognize Scene Text Recognition @} diff --git a/modules/text/include/opencv2/text/ocr.hpp b/modules/text/include/opencv2/text/ocr.hpp index 1261046cd..9fc5403fd 100644 --- a/modules/text/include/opencv2/text/ocr.hpp +++ b/modules/text/include/opencv2/text/ocr.hpp @@ -46,6 +46,10 @@ #include #include +#include +#include + + namespace cv { @@ -61,82 +65,126 @@ enum OCR_LEVEL_TEXTLINE }; -//base class BaseOCR declares a common API that would be used in a typical text recognition scenario +//base class BaseOCR declares a common API that would be used in a typical text +//recognition scenario class CV_EXPORTS_W BaseOCR { -public: + public: virtual ~BaseOCR() {}; - virtual void run(Mat& image, std::string& output_text, std::vector* component_rects=NULL, - std::vector* component_texts=NULL, std::vector* component_confidences=NULL, + + virtual void run(Mat& image, std::string& output_text, + std::vector* component_rects=NULL, + std::vector* component_texts=NULL, + std::vector* component_confidences=NULL, int component_level=0) = 0; - virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector* component_rects=NULL, - std::vector* component_texts=NULL, std::vector* component_confidences=NULL, + + virtual void run(Mat& image, Mat& mask, std::string& output_text, + std::vector* component_rects=NULL, + std::vector* component_texts=NULL, + std::vector* component_confidences=NULL, int component_level=0) = 0; + + /** @brief Main functionality of the OCR Hierarchy. 
Subclasses provide + * default parameters for all parameters other than the input image. + */ + virtual String run(InputArray image){ + std::string res; + std::vector component_rects; + std::vector component_confidences; + std::vector component_texts; + Mat inputImage=image.getMat(); + this->run(inputImage,res,&component_rects,&component_texts, + &component_confidences,OCR_LEVEL_WORD); + return res; + } + }; -/** @brief OCRTesseract class provides an interface with the tesseract-ocr API (v3.02.02) in C++. +/** @brief OCRTesseract class provides an interface with the tesseract-ocr API + * (v3.02.02) in C++. Notice that it is compiled only when tesseract-ocr is correctly installed. @note - - (C++) An example of OCRTesseract recognition combined with scene text detection can be found - at the end_to_end_recognition demo: - - - (C++) Another example of OCRTesseract recognition combined with scene text detection can be - found at the webcam_demo: - + - (C++) An example of OCRTesseract recognition combined with scene text + detection can be found at the end_to_end_recognition demo: + + - (C++) Another example of OCRTesseract recognition combined with scene + text detection can be found at the webcam_demo: + */ class CV_EXPORTS_W OCRTesseract : public BaseOCR { public: /** @brief Recognize text using the tesseract-ocr API. - Takes image on input and returns recognized text in the output_text parameter. Optionally - provides also the Rects for individual text elements found (e.g. words), and the list of those - text elements with their confidence values. + Takes image on input and returns recognized text in the output_text + parameter. Optionally provides also the Rects for individual text elements + found (e.g. words), and the list of those text elements with their + confidence values. @param image Input image CV_8UC1 or CV_8UC3 + @param output_text Output text of the tesseract-ocr. 
- @param component_rects If provided the method will output a list of Rects for the individual - text elements found (e.g. words or text lines). - @param component_texts If provided the method will output a list of text strings for the - recognition of individual text elements found (e.g. words or text lines). - @param component_confidences If provided the method will output a list of confidence values - for the recognition of individual text elements found (e.g. words or text lines). + + @param component_rects If provided the method will output a list of Rects + for the individual text elements found (e.g. words or text lines). + + @param component_texts If provided the method will output a list of text + strings for the recognition of individual text elements found (e.g. words or + text lines). + + @param component_confidences If provided the method will output a list of + confidence values for the recognition of individual text elements found + (e.g. words or text lines). + @param component_level OCR_LEVEL_WORD (by default), or OCR_LEVEL_TEXT_LINE. 
*/ - virtual void run(Mat& image, std::string& output_text, std::vector* component_rects=NULL, - std::vector* component_texts=NULL, std::vector* component_confidences=NULL, + virtual void run (Mat& image, std::string& output_text, + std::vector* component_rects=NULL, + std::vector* component_texts=NULL, + std::vector* component_confidences=NULL, int component_level=0); - virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector* component_rects=NULL, - std::vector* component_texts=NULL, std::vector* component_confidences=NULL, - int component_level=0); + virtual void run (Mat& image, Mat& mask, std::string& output_text, + std::vector* component_rects=NULL, + std::vector* component_texts=NULL, + std::vector* component_confidences=NULL, + int component_level=0); // aliases for scripting - CV_WRAP String run(InputArray image, int min_confidence, int component_level=0); + CV_WRAP String run (InputArray image, int min_confidence, + int component_level=0); - CV_WRAP String run(InputArray image, InputArray mask, int min_confidence, int component_level=0); + CV_WRAP String run(InputArray image, InputArray mask, + int min_confidence, int component_level=0); CV_WRAP virtual void setWhiteList(const String& char_whitelist) = 0; - /** @brief Creates an instance of the OCRTesseract class. Initializes Tesseract. + /** @brief Creates an instance of the OCRTesseract class. Initializes + * Tesseract. + + * @param datapath the name of the parent directory of tessdata ended with + * "/", or NULL to use the system's default directory. + + * @param language an ISO 639-3 code or NULL will default to "eng". + + * @param char_whitelist specifies the list of characters used for + * recognition. NULL defaults to "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ". - @param datapath the name of the parent directory of tessdata ended with "/", or NULL to use the - system's default directory. - @param language an ISO 639-3 code or NULL will default to "eng". 
- @param char_whitelist specifies the list of characters used for recognition. NULL defaults to - "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ". - @param oem tesseract-ocr offers different OCR Engine Modes (OEM), by deffault - tesseract::OEM_DEFAULT is used. See the tesseract-ocr API documentation for other possible - values. - @param psmode tesseract-ocr offers different Page Segmentation Modes (PSM) tesseract::PSM_AUTO - (fully automatic layout analysis) is used. See the tesseract-ocr API documentation for other - possible values. + * @param oem tesseract-ocr offers different OCR Engine Modes (OEM), by + * default tesseract::OEM_DEFAULT is used. See the tesseract-ocr API + * documentation for other possible values. + + * @param psmode tesseract-ocr offers different Page Segmentation Modes + * (PSM) tesseract::PSM_AUTO (fully automatic layout analysis) is used. See + * the tesseract-ocr API documentation for other possible values. */ - CV_WRAP static Ptr create(const char* datapath=NULL, const char* language=NULL, - const char* char_whitelist=NULL, int oem=3, int psmode=3); + CV_WRAP static Ptr create (const char* datapath=NULL, + const char* language=NULL, + const char* char_whitelist=NULL, + int oem=3, int psmode=3); }; @@ -147,134 +195,156 @@ enum decoder_mode OCR_DECODER_VITERBI = 0 // Other algorithms may be added }; -/** @brief OCRHMMDecoder class provides an interface for OCR using Hidden Markov Models. +/** @brief OCRHMMDecoder class provides an interface for OCR using Hidden Markov + * Models. 
-@note - (C++) An example on using OCRHMMDecoder recognition combined with scene text detection can - be found at the webcam_demo sample: - + * @note + * - (C++) An example on using OCRHMMDecoder recognition combined with scene + * text detection can be found at the webcam_demo sample: + * */ -class CV_EXPORTS_W OCRHMMDecoder : public BaseOCR -{ -public: +class CV_EXPORTS_W OCRHMMDecoder : public BaseOCR { + public: /** @brief Callback with the character classifier is made a class. - This way it hides the feature extractor and the classifier itself, so developers can write - their own OCR code. + * This way it hides the feature extractor and the classifier itself, so + * developers can write their own OCR code. - The default character classifier and feature extractor can be loaded using the utility funtion - loadOCRHMMClassifierNM and KNN model provided in - . - */ - class CV_EXPORTS_W ClassifierCallback - { - public: + * The default character classifier and feature extractor can be loaded using + * the utility function loadOCRHMMClassifierNM and KNN model provided in + * . + */ + class CV_EXPORTS_W ClassifierCallback{ + public: virtual ~ClassifierCallback() { } - /** @brief The character classifier must return a (ranked list of) class(es) id('s) + /** @brief The character classifier must return a (ranked list of) + * class(es) id('s) - @param image Input image CV_8UC1 or CV_8UC3 with a single letter. - @param out_class The classifier returns the character class categorical label, or list of - class labels, to which the input image corresponds. - @param out_confidence The classifier returns the probability of the input image - corresponding to each classes in out_class. + * @param image Input image CV_8UC1 or CV_8UC3 with a single letter. + * @param out_class The classifier returns the character class + * categorical label, or list of class labels, to which the input image + * corresponds. 
+ + * @param out_confidence The classifier returns the probability of the + * input image corresponding to each classes in out_class. */ - virtual void eval( InputArray image, std::vector& out_class, std::vector& out_confidence); + virtual void eval (InputArray image, std::vector& out_class, + std::vector& out_confidence); }; -public: /** @brief Recognize text using HMM. - Takes binary image on input and returns recognized text in the output_text parameter. Optionally - provides also the Rects for individual text elements found (e.g. words), and the list of those - text elements with their confidence values. + * Takes binary image on input and returns recognized text in the output_text + * parameter. Optionally provides also the Rects for individual text elements + * found (e.g. words), and the list of those text elements with their + * confidence values. - @param image Input binary image CV_8UC1 with a single text line (or word). + * @param image Input binary image CV_8UC1 with a single text line (or word). - @param output_text Output text. Most likely character sequence found by the HMM decoder. + * @param output_text Output text. Most likely character sequence found by + * the HMM decoder. - @param component_rects If provided the method will output a list of Rects for the individual - text elements found (e.g. words). + * @param component_rects If provided the method will output a list of Rects + * for the individual text elements found (e.g. words). - @param component_texts If provided the method will output a list of text strings for the - recognition of individual text elements found (e.g. words). + * @param component_texts If provided the method will output a list of text + * strings for the recognition of individual text elements found (e.g. words) + * . - @param component_confidences If provided the method will output a list of confidence values - for the recognition of individual text elements found (e.g. words). 
+ * @param component_confidences If provided the method will output a list of + * confidence values for the recognition of individual text elements found + * (e.g. words). - @param component_level Only OCR_LEVEL_WORD is supported. - */ - virtual void run(Mat& image, std::string& output_text, std::vector* component_rects=NULL, - std::vector* component_texts=NULL, std::vector* component_confidences=NULL, - int component_level=0); + * @param component_level Only OCR_LEVEL_WORD is supported. + */ + virtual void run (Mat& image, std::string& output_text, + std::vector* component_rects=NULL, + std::vector* component_texts=NULL, + std::vector* component_confidences=NULL, + int component_level=0); /** @brief Recognize text using HMM. - Takes an image and a mask (where each connected component corresponds to a segmented character) - on input and returns recognized text in the output_text parameter. Optionally - provides also the Rects for individual text elements found (e.g. words), and the list of those - text elements with their confidence values. + * Takes an image and a mask (where each connected component corresponds to a + * segmented character) on input and returns recognized text in the + * output_text parameter. Optionally provides also the Rects for individual + * text elements found (e.g. words), and the list of those text elements with + * their confidence values. - @param image Input image CV_8UC1 or CV_8UC3 with a single text line (or word). - @param mask Input binary image CV_8UC1 same size as input image. Each connected component in mask corresponds to a segmented character in the input image. + * @param image Input image CV_8UC1 or CV_8UC3 with a single text line + * (or word). - @param output_text Output text. Most likely character sequence found by the HMM decoder. + * @param mask Input binary image CV_8UC1 same size as input image. Each + * connected component in mask corresponds to a segmented character in the + * input image. 
- @param component_rects If provided the method will output a list of Rects for the individual - text elements found (e.g. words). + * @param output_text Output text. Most likely character sequence found by + * the HMM decoder. - @param component_texts If provided the method will output a list of text strings for the - recognition of individual text elements found (e.g. words). + * @param component_rects If provided the method will output a list of Rects + * for the individual text elements found (e.g. words). - @param component_confidences If provided the method will output a list of confidence values - for the recognition of individual text elements found (e.g. words). + * @param component_texts If provided the method will output a list of text + * strings for the recognition of individual text elements found (e.g. words) + * . - @param component_level Only OCR_LEVEL_WORD is supported. - */ - virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector* component_rects=NULL, - std::vector* component_texts=NULL, std::vector* component_confidences=NULL, + * @param component_confidences If provided the method will output a list of + * confidence values for the recognition of individual text elements found + * (e.g. words). + + * @param component_level Only OCR_LEVEL_WORD is supported. + */ + virtual void run(Mat& image, Mat& mask, std::string& output_text, + std::vector* component_rects=NULL, + std::vector* component_texts=NULL, + std::vector* component_confidences=NULL, int component_level=0); // aliases for scripting - CV_WRAP String run(InputArray image, int min_confidence, int component_level=0); + CV_WRAP String run(InputArray image, + int min_confidence, + int component_level=0); - CV_WRAP String run(InputArray image, InputArray mask, int min_confidence, int component_level=0); + CV_WRAP String run(InputArray image, + InputArray mask, + int min_confidence, + int component_level=0); - /** @brief Creates an instance of the OCRHMMDecoder class. 
Initializes HMMDecoder. + /** @brief Creates an instance of the OCRHMMDecoder class. Initializes + * HMMDecoder. - @param classifier The character classifier with built in feature extractor. + * @param classifier The character classifier with built in feature + * extractor. - @param vocabulary The language vocabulary (chars when ascii english text). vocabulary.size() - must be equal to the number of classes of the classifier. + * @param vocabulary The language vocabulary (chars when ascii english text) + * . vocabulary.size() must be equal to the number of classes of the + * classifier. - @param transition_probabilities_table Table with transition probabilities between character - pairs. cols == rows == vocabulary.size(). + * @param transition_probabilities_table Table with transition probabilities + * between character pairs. cols == rows == vocabulary.size(). - @param emission_probabilities_table Table with observation emission probabilities. cols == - rows == vocabulary.size(). + * @param emission_probabilities_table Table with observation emission + * probabilities. cols == rows == vocabulary.size(). - @param mode HMM Decoding algorithm. Only OCR_DECODER_VITERBI is available for the moment - (). + * @param mode HMM Decoding algorithm. Only OCR_DECODER_VITERBI is available + * for the moment (). 
*/ - static Ptr create(const Ptr classifier,// The character classifier with built in feature extractor - const std::string& vocabulary, // The language vocabulary (chars when ascii english text) - // size() must be equal to the number of classes - InputArray transition_probabilities_table, // Table with transition probabilities between character pairs - // cols == rows == vocabulari.size() - InputArray emission_probabilities_table, // Table with observation emission probabilities - // cols == rows == vocabulari.size() - decoder_mode mode = OCR_DECODER_VITERBI); // HMM Decoding algorithm (only Viterbi for the moment) - - CV_WRAP static Ptr create(const Ptr classifier,// The character classifier with built in feature extractor - const String& vocabulary, // The language vocabulary (chars when ascii english text) - // size() must be equal to the number of classes - InputArray transition_probabilities_table, // Table with transition probabilities between character pairs - // cols == rows == vocabulari.size() - InputArray emission_probabilities_table, // Table with observation emission probabilities - // cols == rows == vocabulari.size() - int mode = OCR_DECODER_VITERBI); // HMM Decoding algorithm (only Viterbi for the moment) - -protected: + static Ptr create( + const Ptr classifier, // The character classifier with built in feature extractor + const std::string& vocabulary, // The language vocabulary (chars when ascii english text) size() must be equal to the number of classes + InputArray transition_probabilities_table, // Table with transition probabilities between character pairs cols == rows == vocabulari.size() + InputArray emission_probabilities_table, // Table with observation emission probabilities cols == rows == vocabulari.size() + decoder_mode mode = OCR_DECODER_VITERBI); // HMM Decoding algorithm (only Viterbi for the moment) + + CV_WRAP static Ptr create( + const Ptr classifier, // The character classifier with built in feature extractor + const String& 
vocabulary, // The language vocabulary (chars when ascii english text) size() must be equal to the number of classes + InputArray transition_probabilities_table, // Table with transition probabilities between character pairs cols == rows == vocabulari.size() + InputArray emission_probabilities_table, // Table with observation emission probabilities cols == rows == vocabulari.size() + int mode = OCR_DECODER_VITERBI); // HMM Decoding algorithm (only Viterbi for the moment) + + protected: Ptr classifier; std::string vocabulary; @@ -283,76 +353,98 @@ protected: decoder_mode mode; }; -/** @brief Allow to implicitly load the default character classifier when creating an OCRHMMDecoder object. +/** @brief Allow to implicitly load the default character classifier when + * creating an OCRHMMDecoder object. -@param filename The XML or YAML file with the classifier model (e.g. OCRHMM_knn_model_data.xml) + * @param filename The XML or YAML file with the classifier model (e.g. + * OCRHMM_knn_model_data.xml) -The KNN default classifier is based in the scene text recognition method proposed by Lukás Neumann & -Jiri Matas in [Neumann11b]. Basically, the region (contour) in the input image is normalized to a -fixed size, while retaining the centroid and aspect ratio, in order to extract a feature vector -based on gradient orientations along the chain-code of its perimeter. Then, the region is classified -using a KNN model trained with synthetic data of rendered characters with different standard font -types. + * The KNN default classifier is based in the scene text recognition method + * proposed by Lukás Neumann & Jiri Matas in [Neumann11b]. Basically, the region + * (contour) in the input image is normalized to a fixed size, while retaining + * the centroid and aspect ratio, in order to extract a feature vector based on + * gradient orientations along the chain-code of its perimeter. 
Then, the region + * is classified using a KNN model trained with synthetic data of rendered + * characters with different standard font types. */ +CV_EXPORTS_W Ptr loadOCRHMMClassifierNM ( + const String& filename); -CV_EXPORTS_W Ptr loadOCRHMMClassifierNM(const String& filename); +/** @brief Allow to implicitly load the default character classifier when + * creating an OCRHMMDecoder object. -/** @brief Allow to implicitly load the default character classifier when creating an OCRHMMDecoder object. + * @param filename The XML or YAML file with the classifier model (e.g. + * OCRBeamSearch_CNN_model_data.xml.gz) -@param filename The XML or YAML file with the classifier model (e.g. OCRBeamSearch_CNN_model_data.xml.gz) - -The CNN default classifier is based in the scene text recognition method proposed by Adam Coates & -Andrew NG in [Coates11a]. The character classifier consists in a Single Layer Convolutional Neural Network and -a linear classifier. It is applied to the input image in a sliding window fashion, providing a set of recognitions -at each window location. + * The CNN default classifier is based in the scene text recognition method + * proposed by Adam Coates & Andrew NG in [Coates11a]. The character classifier + * consists in a Single Layer Convolutional Neural Network and a linear + * classifier. It is applied to the input image in a sliding window fashion, + * providing a set of recognitions at each window location. */ -CV_EXPORTS_W Ptr loadOCRHMMClassifierCNN(const String& filename); +CV_EXPORTS_W Ptr loadOCRHMMClassifierCNN ( + const String& filename); //! @} -/** @brief Utility function to create a tailored language model transitions table from a given list of words (lexicon). - * +/** @brief Utility function to create a tailored language model transitions + * table from a given list of words (lexicon). + * @param vocabulary The language vocabulary (chars when ascii english text). 
- * + * @param lexicon The list of words that are expected to be found in a particular image. - * - * @param transition_probabilities_table Output table with transition probabilities between character pairs. cols == rows == vocabulary.size(). - * - * The function calculate frequency statistics of character pairs from the given lexicon and fills the output transition_probabilities_table with them. The transition_probabilities_table can be used as input in the OCRHMMDecoder::create() and OCRBeamSearchDecoder::create() methods. + + * @param transition_probabilities_table Output table with transition + * probabilities between character pairs. cols == rows == vocabulary.size(). + + * The function calculates frequency statistics of character pairs from the given + * lexicon and fills the output transition_probabilities_table with them. The + * transition_probabilities_table can be used as input in the + * OCRHMMDecoder::create() and OCRBeamSearchDecoder::create() methods. * @note - * - (C++) An alternative would be to load the default generic language transition table provided in the text module samples folder (created from ispell 42869 english words list) : - * + * - (C++) An alternative would be to load the default generic language + * transition table provided in the text module samples folder (created + * from ispell 42869 english words list) : + * **/ -CV_EXPORTS void createOCRHMMTransitionsTable(std::string& vocabulary, std::vector& lexicon, OutputArray transition_probabilities_table); - -CV_EXPORTS_W Mat createOCRHMMTransitionsTable(const String& vocabulary, std::vector& lexicon); +CV_EXPORTS void createOCRHMMTransitionsTable ( + std::string& vocabulary, std::vector& lexicon, + OutputArray transition_probabilities_table); +CV_EXPORTS_W Mat createOCRHMMTransitionsTable ( + const String& vocabulary, std::vector& lexicon); /* OCR BeamSearch Decoder */ -/** @brief OCRBeamSearchDecoder class provides an interface for OCR using Beam Search algorithm.
+/** @brief OCRBeamSearchDecoder class provides an interface for OCR using Beam + * Search algorithm. @note - - (C++) An example on using OCRBeamSearchDecoder recognition combined with scene text detection can - be found at the demo sample: - + - (C++) An example on using OCRBeamSearchDecoder recognition combined with + scene text detection can be found at the demo sample: + */ -class CV_EXPORTS_W OCRBeamSearchDecoder : public BaseOCR -{ -public: + + +/* Forward declaration of class that can be used to generate an OCRBeamSearchDecoder::ClassifierCallbac */ +class TextImageClassifier; + +class CV_EXPORTS_W OCRBeamSearchDecoder : public BaseOCR{ + + public: /** @brief Callback with the character classifier is made a class. - This way it hides the feature extractor and the classifier itself, so developers can write - their own OCR code. + * This way it hides the feature extractor and the classifier itself, so + * developers can write their own OCR code. - The default character classifier and feature extractor can be loaded using the utility funtion - loadOCRBeamSearchClassifierCNN with all its parameters provided in - . + * The default character classifier and feature extractor can be loaded + * using the utility funtion loadOCRBeamSearchClassifierCNN with all its + * parameters provided in + * . */ - class CV_EXPORTS_W ClassifierCallback - { - public: + class CV_EXPORTS_W ClassifierCallback{ + public: virtual ~ClassifierCallback() { } /** @brief The character classifier must return a (ranked list of) class(es) id('s) @@ -364,8 +456,8 @@ public: */ virtual void eval( InputArray image, std::vector< std::vector >& recognition_probabilities, std::vector& oversegmentation ); - int getWindowSize() {return 0;} - int getStepSize() {return 0;} + virtual int getWindowSize() {return 0;} + virtual int getStepSize() {return 0;} }; public: @@ -421,6 +513,7 @@ public: @param beam_size Size of the beam in Beam Search algorithm. 
*/ + static Ptr create(const Ptr classifier,// The character classifier with built in feature extractor const std::string& vocabulary, // The language vocabulary (chars when ascii english text) // size() must be equal to the number of classes @@ -441,6 +534,44 @@ public: int mode = OCR_DECODER_VITERBI, // HMM Decoding algorithm (only Viterbi for the moment) int beam_size = 500); // Size of the beam in Beam Search algorithm + /** @brief This method allows to plug a classifier that is derived from TextImageClassifier into + * OCRBeamSearchDecoder as a ClassifierCallback. + + @param classifier A pointer to a TextImageClassifier descendant + + @param alphabet The language alphabet one char per symbol. alphabet.size() must be equal to the number of classes + of the classifier. In future editions it should be replaced with a vector of strings. + + @param transition_probabilities_table Table with transition probabilities between character + pairs. cols == rows == alphabet.size(). + + @param emission_probabilities_table Table with observation emission probabilities. cols == + rows == alphabet.size(). + + @param windowWidth The width of the windows to which the sliding window will be iterated. The height will + be the height of the image. The windows might be resized to fit the classifier's input by the classifier's + preprocessor.
+ + @param windowStep The step for the sliding window + + @param mode HMM Decoding algorithm (only Viterbi for the moment) + + @param beam_size Size of the beam in Beam Search algorithm + */ +// CV_WRAP static Ptr create(const Ptr classifier, // The character classifier with built in feature extractor +// String alphabet, // The language alphabet one char per symbol +// // size() must be equal to the number of classes +// InputArray transition_probabilities_table, // Table with transition probabilities between character pairs +// // cols == rows == alphabet.size() +// InputArray emission_probabilities_table, // Table with observation emission probabilities +// // cols == rows == alphabet.size() +// int windowWidth, // The width of the windows to which the sliding window will be iterated. +// // The height will be the height of the image. The windows might be resized to +// // fit the classifiers input by the classifiers preprocessor +// int windowStep = 1 , // The step for the sliding window +// int mode = OCR_DECODER_VITERBI, // HMM Decoding algorithm (only Viterbi for the moment) +// int beam_size = 500); // Size of the beam in Beam Search algorithm + protected: Ptr classifier; @@ -465,6 +596,364 @@ CV_EXPORTS_W Ptr loadOCRBeamSearchClas //! @} -} -} + +//Classifiers should provide diferent backends +//For the moment only caffe is implemeted +enum{ + OCR_HOLISTIC_BACKEND_NONE, + OCR_HOLISTIC_BACKEND_CAFFE +}; + +class TextImageClassifier; + +/** + * @brief The ImagePreprocessor class + */ +class CV_EXPORTS_W ImagePreprocessor{ +protected: + virtual void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels)=0; + virtual void set_mean_(Mat){} + +public: + virtual ~ImagePreprocessor(){} + + /** @brief this method in provides public acces to the preprocessing with respect to a specific + * classifier + * + * This method's main use would be to use the preprocessor without feeding it to a classifier. 
+ * Determining the exact behavior of a preprocessor is the main motivation for this. + * + * @param input an image without any constraints + * + * @param output in most cases an image of fixed depth size and whitened + * + * @param sz the size to which the image would be resize if the preprocessor resizes inputs + * + * @param outputChannels the number of channels for the output image + */ + CV_WRAP void preprocess(InputArray input,OutputArray output,Size sz,int outputChannels); + + CV_WRAP void set_mean(Mat mean); + + /** @brief Creates a functor that only resizes and changes the channels of the input + * without further processing. + * + * @return shared pointer to the generated preprocessor + */ + CV_WRAP static Ptr createResizer(); + + /** @brief + * + * @param sigma + * + * @return shared pointer to generated preprocessor + */ + CV_WRAP static Ptr createImageStandarizer(double sigma); + + /** @brief + * + * @return shared pointer to generated preprocessor + */ + CV_WRAP static Ptr createImageMeanSubtractor(InputArray meanImg); + + CV_WRAP static PtrcreateImageCustomPreprocessor(double rawval=1.0,String channel_order="BGR"); + + friend class TextImageClassifier; + +}; + +/** @brief Abstract class that implements the classifcation of text images. + * + * The interface is generic enough to describe any image classifier. And allows + * to take advantage of compouting in batches. While word classifiers are the default + * networks, any image classifers should work. + * + */ +class CV_EXPORTS_W TextImageClassifier +{ +protected: + Size inputGeometry_; + Size outputGeometry_; + int channelCount_; + Ptr preprocessor_; + /** @brief all image preprocessing is handled here including whitening etc. + * + * @param input the image to be preprocessed for the classifier. 
If the depth + * is CV_8U values should be in [0,255] otherwise values are assumed to be in [0,1] + * + * @param output reference to the image to be fed to the classifier, the preprocessor will + * resize the image to the appropriate size and convert it to the appropriate depth + * + * The method preprocess should never be used externally, it is up to classify and classifyBatch + * methods to employ it. + */ + virtual void preprocess(const Mat& input,Mat& output); +public: + virtual ~TextImageClassifier() {} + + /** @brief + */ + CV_WRAP virtual void setPreprocessor(Ptr ptr); + + /** @brief + */ + CV_WRAP Ptr getPreprocessor(); + + /** @brief produces a class confidence row-vector given an image + */ + CV_WRAP virtual void classify(InputArray image, OutputArray classProbabilities) = 0; + /** @brief produces a list of bounding boxes given an image + */ + + CV_WRAP virtual void detect(InputArray image, OutputArray classProbabilities) = 0; + + /** @brief produces a matrix containing class confidence row-vectors given a collection of images + */ + CV_WRAP virtual void classifyBatch(InputArrayOfArrays image, OutputArray classProbabilities) = 0; + + /** @brief simple getter method returning the number of channels each input sample has + */ + CV_WRAP virtual int getInputChannelCount(){return this->channelCount_;} + + /** @brief simple getter method returning the size of the input sample + */ + CV_WRAP virtual Size getInputSize(){return this->inputGeometry_;} + + /** @brief simple getter method returning the size of the output row-vector + */ + CV_WRAP virtual int getOutputSize()=0; + CV_WRAP virtual Size getOutputGeometry()=0; + + /** @brief simple getter method returning the size of the minibatches for this classifier.
+ * If not applicable this method should return 1 + */ + CV_WRAP virtual int getMinibatchSize()=0; + + friend class ImagePreprocessor; +}; + + + +class CV_EXPORTS_W DeepCNN:public TextImageClassifier +{ + /** @brief Class that uses a pretrained caffe model for word classification. + * + * This network is described in detail in: + * Max Jaderberg et al.: Reading Text in the Wild with Convolutional Neural Networks, IJCV 2015 + * http://arxiv.org/abs/1412.1842 + */ +public: + virtual ~DeepCNN() {}; + + /** @brief Constructs a DeepCNN object from a caffe pretrained model + * + * @param archFilename is the path to the prototxt file containing the deployment model architecture description. + * + * @param weightsFilename is the path to the pretrained weights of the model in binary form. This file can be + * very large, up to 2GB. + * + * @param preprocessor is a pointer to the instance of an ImagePreprocessor implementing the preprocess_ protected method; + * + * @param minibatchSz the maximum number of samples that can be processed in parallel. In practice this parameter + * has an effect only when computing in the GPU and should be set with respect to the memory available in the GPU. + * + * @param backEnd integer parameter selecting the computation framework. For now OCR_HOLISTIC_BACKEND_CAFFE is + * the only option + */ + CV_WRAP static Ptr create(String archFilename,String weightsFilename,Ptr preprocessor,int minibatchSz=100,int backEnd=OCR_HOLISTIC_BACKEND_CAFFE); + + /** @brief Constructs a DeepCNN intended to be used for word spotting. + * + * This method loads a pretrained classifier and couples it with a preprocessor that standardises pixels with a + * deviation of 113. The architecture file can be downloaded from: + * + * While the weights can be downloaded from: + * + * The words assigned to the network outputs are available at: + * + * + * @param archFilename is the path to the prototxt file containing the deployment model architecture description.
+ * When employing OCR_HOLISTIC_BACKEND_CAFFE this is the path to the deploy ".prototxt". + * + * @param weightsFilename is the path to the pretrained weights of the model. When employing + * OCR_HOLISTIC_BACKEND_CAFFE this is the path to the ".caffemodel" file. This file can be very large, the + * pretrained DictNet uses 2GB. + * + * @param backEnd integer parameter selecting the computation framework. For now OCR_HOLISTIC_BACKEND_CAFFE is + * the only option + */ + CV_WRAP static Ptr createDictNet(String archFilename,String weightsFilename,int backEnd=OCR_HOLISTIC_BACKEND_CAFFE); + +}; + +namespace cnn_config{ +namespace caffe_backend{ + +/** @brief Queries Caffe for the computation device being used + * + * Caffe can only be controlled globally on whether the GPU or the CPU is used; this is a + * global behavior. This function queries the current state of caffe. + * If the module is built without caffe, this method throws an exception. + * + * @return true if caffe is computing on the GPU, false if caffe is computing on the CPU + */ +CV_EXPORTS_W bool getCaffeGpuMode(); + +/** @brief Sets the computation device being used by Caffe + * + * Caffe can only be controlled globally on whether the GPU or the CPU is used; this is a + * global behavior. This function sets the current state of caffe. + * If the module is built without caffe, this method throws an exception. + * + * @param useGpu set to true for caffe to be computing on the GPU, false if caffe is + * computing on the CPU + */ +CV_EXPORTS_W void setCaffeGpuMode(bool useGpu); + +/** @brief Provides runtime information on whether Caffe support was compiled in. + * + * The text module API is the same regardless of whether Caffe was available or not + * during compilation. When methods that require Caffe are invoked while Caffe support + * is not compiled in, exceptions are thrown. This method allows to test whether the + * text module was built with caffe during runtime.
+ * + * @return true if Caffe support for the text module was provided during compilation, + * false if Caffe was unavailable. + */ +CV_EXPORTS_W bool getCaffeAvailable(); + +}//caffe +}//cnn_config + +/** @brief OCRHolisticWordRecognizer class provides the functionality of segmented wordspotting. + * Given a predefined vocabulary, a TextImageClassifier is employed to select the most probable + * word given an input image. + * + * This class implements the logic of providing transcriptions given a vocabulary and an image + * classifier. The classifier has to be any TextImageClassifier but the classifier for which this + * class was built is the DictNet. In order to load it the following files should be downloaded: + + * + * + * + */ +class CV_EXPORTS_W OCRHolisticWordRecognizer : public BaseOCR +{ +public: + virtual void run(Mat& image, std::string& output_text, std::vector* component_rects=NULL, + std::vector* component_texts=NULL, std::vector* component_confidences=NULL, + int component_level=OCR_LEVEL_WORD)=0; + + /** @brief Recognize text using a segmentation based word-spotting/classifier cnn. + + Takes image on input and returns recognized text in the output_text parameter. Optionally + provides also the Rects for individual text elements found (e.g. words), and the list of those + text elements with their confidence values. + + @param image Input image CV_8UC1 or CV_8UC3 + + @param mask is totally ignored and is only available for compatibility reasons + + @param output_text Output text of the word spotting, always one that exists in the dictionary. + + @param component_rects Not applicable for word spotting can be NULL; if not, a single element will + be put in the vector. + + @param component_texts Not applicable for word spotting can be NULL; if not, a single element will + be put in the vector. + + @param component_confidences Not applicable for word spotting can be NULL; if not, a single element will + be put in the vector.
+ + @param component_level must be OCR_LEVEL_WORD. + */ + + virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector* component_rects=NULL, + std::vector* component_texts=NULL, std::vector* component_confidences=NULL, + int component_level=OCR_LEVEL_WORD)=0; + + + /** + @brief Method that provides a quick and simple interface to a single word image classification + + @param inputImage an image expected to be a CV_8UC1 or CV_8UC3 of any size assumed to contain a single word + + @param transcription an opencv string that will store the detected word transcription + + @param confidence a double that will be updated with the confidence the classifier has for the selected word + */ + CV_WRAP virtual void recogniseImage(InputArray inputImage,CV_OUT String& transcription,CV_OUT double& confidence)=0; + + /** + @brief Method that provides a quick and simple interface to a multiple word image classification taking advantage of + the classifier's parallel capabilities. + + @param inputImageList a list of images expected to be a CV_8UC1 or CV_8UC3 each image can be of any size and is assumed + to contain a single word. + + @param transcriptions a vector of opencv strings that will store the detected word transcriptions, one for each + input image + + @param confidences a vector of double that will be updated with the confidence the classifier has for each of the + selected words. + */ + CV_WRAP virtual void recogniseImageBatch(InputArrayOfArrays inputImageList,CV_OUT std::vector& transcriptions,CV_OUT std::vector& confidences)=0; + + + /** + @brief simple getter for the vocabulary employed + */ + CV_WRAP virtual const std::vector& getVocabulary()=0; + + /** @brief simple getter for the classifier + */ + CV_WRAP virtual Ptr getClassifier()=0; + + /** @brief Creates an instance of the OCRHolisticWordRecognizer class.
+ + @param classifierPtr an instance of TextImageClassifier, normally a DeepCNN instance + + @param vocabularyFilename the relative or absolute path to the file containing all words in the vocabulary. Each text line + in the file is assumed to be a single word. The number of words in the vocabulary must be exactly the same as the outputSize + of the classifier. + */ + CV_WRAP static Ptr create(Ptr classifierPtr,String vocabularyFilename); + + + /** @brief Creates an instance of the OCRHolisticWordRecognizer class and implicitly also a DeepCNN classifier. + + @param modelArchFilename the relative or absolute path to the prototxt file describing the classifier's architecture. + + @param modelWeightsFilename the relative or absolute path to the file containing the pretrained weights of the model in caffe-binary form. + + @param vocabularyFilename the relative or absolute path to the file containing all words in the vocabulary. Each text line + in the file is assumed to be a single word. The number of words in the vocabulary must be exactly the same as the outputSize + of the classifier.
+ */ + CV_WRAP static Ptr create(String modelArchFilename, String modelWeightsFilename, String vocabularyFilename); + + /** @brief + * + * @param classifierPtr + * + * @param vocabulary + */ + CV_WRAP static Ptr create(Ptr classifierPtr,const std::vector& vocabulary); + + /** @brief + * + * @param modelArchFilename + * + * @param modelWeightsFilename + * + * @param vocabulary + */ + CV_WRAP static Ptr create (String modelArchFilename, String modelWeightsFilename, const std::vector& vocabulary); +}; + + +}//namespace text +}//namespace cv + + #endif // _OPENCV_TEXT_OCR_HPP_ diff --git a/modules/text/include/opencv2/text/textDetector.hpp b/modules/text/include/opencv2/text/textDetector.hpp new file mode 100644 index 000000000..262795733 --- /dev/null +++ b/modules/text/include/opencv2/text/textDetector.hpp @@ -0,0 +1,235 @@ +/*M////////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2013, OpenCV Foundation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef __OPENCV_TEXT_TEXTDETECTOR_HPP__ +#define __OPENCV_TEXT_TEXTDETECTOR_HPP__ + +#include +#include +#include +#include +#include"ocr.hpp" + + +namespace cv +{ +namespace text +{ + +//! @addtogroup text_recognize +//! @{ + + + +//base class BaseDetector declares a common API that would be used in a typical text +//recognition scenario +class CV_EXPORTS_W BaseDetector +{ + public: + virtual ~BaseDetector() {}; + + virtual void run(Mat& image, + std::vector* component_rects=NULL, + std::vector* component_confidences=NULL, + int component_level=0) = 0; + + virtual void run(Mat& image, Mat& mask, + std::vector* component_rects=NULL, + std::vector* component_confidences=NULL, + int component_level=0) = 0; + + /** @brief Main functionality of the OCR Hierarchy. 
Subclasses provide + * default parameters for all parameters other than the input image. + */ +// virtual std::vector* run(InputArray image){ +// //std::string res; +// std::vector component_rects; +// std::vector component_confidences; +// //std::vector component_texts; +// Mat inputImage=image.getMat(); +// this->run(inputImage,&component_rects, +// &component_confidences,OCR_LEVEL_WORD); +// return *component_rects; +// } + +}; + + +//Classifiers should provide diferent backends +//For the moment only caffe is implemeted +//enum{ +// OCR_HOLISTIC_BACKEND_NONE, +// OCR_HOLISTIC_BACKEND_CAFFE +//}; + + + + + +/** @brief OCRHolisticWordRecognizer class provides the functionallity of segmented wordspotting. + * Given a predefined vocabulary , a TextImageClassifier is employed to select the most probable + * word given an input image. + * + * This class implements the logic of providing transcriptions given a vocabulary and and an image + * classifer. The classifier has to be any TextImageClassifier but the classifier for which this + * class was built is the DictNet. In order to load it the following files should be downloaded: + + * + * + * + */ +class CV_EXPORTS_W textDetector : public BaseDetector +{ +public: + virtual void run(Mat& image, std::vector* component_rects=NULL, + std::vector* component_confidences=NULL, + int component_level=OCR_LEVEL_WORD)=0; + + /** @brief Recognize text using a segmentation based word-spotting/classifier cnn. + + Takes image on input and returns recognized text in the output_text parameter. Optionally + provides also the Rects for individual text elements found (e.g. words), and the list of those + text elements with their confidence values. + + @param image Input image CV_8UC1 or CV_8UC3 + + @param mask is totally ignored and is only available for compatibillity reasons + + @param output_text Output text of the the word spoting, always one that exists in the dictionary. 
+ + @param component_rects Not applicable for word spotting can be be NULL if not, a single elemnt will + be put in the vector. + + @param component_texts Not applicable for word spotting can be be NULL if not, a single elemnt will + be put in the vector. + + @param component_confidences Not applicable for word spotting can be be NULL if not, a single elemnt will + be put in the vector. + + @param component_level must be OCR_LEVEL_WORD. + */ + + virtual void run(Mat& image, Mat& mask, std::vector* component_rects=NULL, + std::vector* component_confidences=NULL, + int component_level=OCR_LEVEL_WORD)=0; + + + /** + @brief Method that provides a quick and simple interface to a single word image classifcation + + @param inputImage an image expected to be a CV_U8C1 or CV_U8C3 of any size + + @param transcription an opencv string that will store the detected word transcription + + @param confidence a double that will be updated with the confidence the classifier has for the selected word + */ + CV_WRAP virtual void textDetectInImage(InputArray inputImage,CV_OUT std::vector& Bbox,CV_OUT std::vector& confidence)=0; + + /** + @brief Method that provides a quick and simple interface to a multiple word image classifcation taking advantage + the classifiers parallel capabilities. + + @param inputImageList an list of images expected to be a CV_U8C1 or CV_U8C3 each image can be of any size and is assumed + to contain a single word. + + @param transcriptions a vector of opencv strings that will store the detected word transcriptions, one for each + input image + + @param confidences a vector of double that will be updated with the confidence the classifier has for each of the + selected words. 
+ */ + //CV_WRAP virtual void recogniseImageBatch(InputArrayOfArrays inputImageList,CV_OUT std::vector& transcriptions,CV_OUT std::vector& confidences)=0; + + + /** @brief simple getter for the preprocessing functor + */ + CV_WRAP virtual Ptr getClassifier()=0; + + /** @brief Creates an instance of the OCRHolisticWordRecognizer class. + + @param classifierPtr an instance of TextImageClassifier, normaly a DeepCNN instance + + @param vocabularyFilename the relative or absolute path to the file containing all words in the vocabulary. Each text line + in the file is assumed to be a single word. The number of words in the vocabulary must be exactly the same as the outputSize + of the classifier. + */ + CV_WRAP static Ptr create(Ptr classifierPtr); + + + /** @brief Creates an instance of the OCRHolisticWordRecognizer class and implicitly also a DeepCNN classifier. + + @param modelArchFilename the relative or absolute path to the prototxt file describing the classifiers architecture. + + @param modelWeightsFilename the relative or absolute path to the file containing the pretrained weights of the model in caffe-binary form. + + @param vocabularyFilename the relative or absolute path to the file containing all words in the vocabulary. Each text line + in the file is assumed to be a single word. The number of words in the vocabulary must be exactly the same as the outputSize + of the classifier. 
+ */ + CV_WRAP static Ptr create(String modelArchFilename, String modelWeightsFilename); + + /** @brief + * + * @param classifierPtr + * + * @param vocabulary + */ + // CV_WRAP static Ptr create(Ptr classifierPtr,const std::vector& vocabulary); + + /** @brief + * + * @param modelArchFilename + * + * @param modelWeightsFilename + * + * @param vocabulary + */ + // CV_WRAP static Ptr create (String modelArchFilename, String modelWeightsFilename, const std::vector& vocabulary); +}; + + +}//namespace text +}//namespace cv + + +#endif // _OPENCV_TEXT_OCR_HPP_ diff --git a/modules/text/src/ocr_holistic.cpp b/modules/text/src/ocr_holistic.cpp new file mode 100644 index 000000000..9791e62bb --- /dev/null +++ b/modules/text/src/ocr_holistic.cpp @@ -0,0 +1,879 @@ +#include "precomp.hpp" +#include "opencv2/imgproc.hpp" +#include "opencv2/highgui.hpp" +#include "opencv2/core.hpp" + + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#ifdef HAVE_CAFFE +#include "caffe/caffe.hpp" +#endif + +namespace cv { namespace text { + +//Maybe OpenCV has a routine better suited +inline bool fileExists (String filename) { + std::ifstream f(filename.c_str()); + return f.good(); +} + +//************************************************************************************ +//****************** ImagePreprocessor ******************************************* +//************************************************************************************ + +void ImagePreprocessor::preprocess(InputArray input,OutputArray output,Size sz,int outputChannels){ + Mat inpImg=input.getMat(); + Mat outImg; + this->preprocess_(inpImg,outImg,sz,outputChannels); + outImg.copyTo(output); +} +void ImagePreprocessor::set_mean(Mat mean){ + + + this->set_mean_(mean); + +} + + +class ResizerPreprocessor: public ImagePreprocessor{ +protected: + void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){ + //TODO put all the logic of channel and 
depth conversions in ImageProcessor class + CV_Assert(outputChannels==1 || outputChannels==3); + CV_Assert(input.channels()==1 || input.channels()==3); + if(input.channels()!=outputChannels) + { + Mat tmpInput; + if(outputChannels==1){ + cvtColor(input,tmpInput,COLOR_BGR2GRAY); + if(input.depth()==CV_8U) + { + tmpInput.convertTo(output,CV_32FC1,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + tmpInput.convertTo(output, CV_32FC1); + } + }else + { + cvtColor(input,tmpInput,COLOR_GRAY2BGR); + if(input.depth()==CV_8U) + { + tmpInput.convertTo(output,CV_32FC3,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + tmpInput.convertTo(output, CV_32FC3); + } + } + }else + { + if(input.channels()==1) + { + if(input.depth()==CV_8U) + { + input.convertTo(output, CV_32FC1,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + input.convertTo(output, CV_32FC1); + } + }else + { + if(input.depth()==CV_8U){ + input.convertTo(output, CV_32FC3,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + input.convertTo(output, CV_32FC3); + } + } + } + if(outputSize.width!=0 && outputSize.height!=0) + { + resize(output,output,outputSize); + } + } + //void set_mean_(Mat m){} +public: + ResizerPreprocessor(){} + ~ResizerPreprocessor(){} +}; + +class StandarizerPreprocessor: public ImagePreprocessor{ +protected: + double sigma_; + //void set_mean_(Mat M){} + + void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){ + //TODO put all the logic of channel and depth conversions in ImageProcessor class + CV_Assert(outputChannels==1 || outputChannels==3); + CV_Assert(input.channels()==1 || input.channels()==3); + if(input.channels()!=outputChannels) + { + Mat tmpInput; + if(outputChannels==1) + { + cvtColor(input,tmpInput,COLOR_BGR2GRAY); + if(input.depth()==CV_8U) + { + tmpInput.convertTo(output,CV_32FC1,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + 
tmpInput.convertTo(output, CV_32FC1); + } + }else + { + cvtColor(input,tmpInput,COLOR_GRAY2BGR); + if(input.depth()==CV_8U) + { + tmpInput.convertTo(output,CV_32FC3,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + tmpInput.convertTo(output, CV_32FC3); + } + } + }else + { + if(input.channels()==1) + { + if(input.depth()==CV_8U) + { + input.convertTo(output, CV_32FC1,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + input.convertTo(output, CV_32FC1); + } + }else + { + if(input.depth()==CV_8U) + { + input.convertTo(output, CV_32FC3,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + input.convertTo(output, CV_32FC3); + } + } + } + if(outputSize.width!=0 && outputSize.height!=0) + { + resize(output,output,outputSize); + } + + Scalar mean,dev; + meanStdDev(output,mean,dev); + subtract(output,mean[0],output); + divide(output,(dev[0]/sigma_),output); + } +public: + StandarizerPreprocessor(double sigma):sigma_(sigma){} + ~StandarizerPreprocessor(){} + +}; + +class customPreprocessor:public ImagePreprocessor{ +protected: + + double rawval_; + Mat mean_; + String channel_order_; + + void set_mean_(Mat imMean_){ + + imMean_.copyTo(this->mean_); + + + } + + void set_raw_scale(int rawval){ + rawval_ = rawval; + + } + void set_channels(String channel_order){ + channel_order_=channel_order; + } + + + void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){ + //TODO put all the logic of channel and depth conversions in ImageProcessor class + + CV_Assert(outputChannels==1 || outputChannels==3); + CV_Assert(input.channels()==1 || input.channels()==3); + if(input.channels()!=outputChannels) + { + Mat tmpInput; + if(outputChannels==1) + { + cvtColor(input,tmpInput,COLOR_BGR2GRAY); + if(input.depth()==CV_8U) + { + if (rawval_ == 1) + tmpInput.convertTo(output,CV_32FC3,1/255.0); + else + tmpInput.convertTo(output,CV_32FC1); + }else + {//Assuming values are at the desired [0,1] range + if (rawval_ 
==1) + tmpInput.convertTo(output, CV_32FC1); + else + tmpInput.convertTo(output, CV_32FC1,rawval_); + } + }else + { + cvtColor(input,tmpInput,COLOR_GRAY2BGR); + if(input.depth()==CV_8U) + { + if (rawval_ == 1) + tmpInput.convertTo(output,CV_32FC3,1/255.0); + else + tmpInput.convertTo(output,CV_32FC1); + }else + {//Assuming values are at the desired [0,1] range + if (rawval_ ==1) + tmpInput.convertTo(output, CV_32FC1); + else + tmpInput.convertTo(output, CV_32FC1,rawval_); + } + } + }else + { + if(input.channels()==1) + { + if(input.depth()==CV_8U) + { + if (rawval_ == 1) + input.convertTo(output,CV_32FC1,1/255.0); + else + input.convertTo(output,CV_32FC1); + }else + {//Assuming values are at the desired [0,1] range + if (rawval_ ==1) + input.convertTo(output, CV_32FC1); + else + input.convertTo(output, CV_32FC1,rawval_); + } + }else + { + if(input.depth()==CV_8U) + { + if (rawval_ == 1) + input.convertTo(output,CV_32FC3,1/255.0); + else + input.convertTo(output,CV_32FC3); + }else + {//Assuming values are at the desired [0,1] range + if (rawval_ ==1) + input.convertTo(output, CV_32FC3); + else + input.convertTo(output, CV_32FC3,rawval_); + } + } + } + if(outputSize.width!=0 && outputSize.height!=0) + { + resize(output,output,outputSize); + } + + if (!this->mean_.empty()){ + + Scalar mean_s(this->mean_.at(0,0),this->mean_.at(0,1),this->mean_.at(0,2)); + subtract(output,mean_s,output); + } + else{ + Scalar mean_s; + mean_s = mean(output); + subtract(output,mean_s,output); + } + + } + +public: + customPreprocessor( double rawval,String channel_order):rawval_(rawval),channel_order_(channel_order){} + ~customPreprocessor(){} + +}; + +class MeanSubtractorPreprocessor: public ImagePreprocessor{ +protected: + Mat mean_; + //void set_mean_(Mat m){} + void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){ + //TODO put all the logic of channel and depth conversions in ImageProcessor class + CV_Assert(this->mean_.cols==outputSize.width && 
this->mean_.rows ==outputSize.height); + CV_Assert(outputChannels==1 || outputChannels==3); + CV_Assert(input.channels()==1 || input.channels()==3); + if(input.channels()!=outputChannels) + { + Mat tmpInput; + if(outputChannels==1) + { + cvtColor(input,tmpInput,COLOR_BGR2GRAY); + if(input.depth()==CV_8U) + { + tmpInput.convertTo(output,CV_32FC1,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + tmpInput.convertTo(output, CV_32FC1); + } + }else + { + cvtColor(input,tmpInput,COLOR_GRAY2BGR); + if(input.depth()==CV_8U) + { + tmpInput.convertTo(output,CV_32FC3,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + tmpInput.convertTo(output, CV_32FC3); + } + } + }else + { + if(input.channels()==1) + { + if(input.depth()==CV_8U) + { + input.convertTo(output, CV_32FC1,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + input.convertTo(output, CV_32FC1); + } + }else + { + if(input.depth()==CV_8U) + { + input.convertTo(output, CV_32FC3,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + input.convertTo(output, CV_32FC3); + } + } + } + if(outputSize.width!=0 && outputSize.height!=0) + { + resize(output,output,outputSize); + } + subtract(output,this->mean_,output); + } +public: + MeanSubtractorPreprocessor(Mat mean) + { + mean.copyTo(this->mean_); + } + + ~MeanSubtractorPreprocessor(){} +}; + + + + + +Ptr ImagePreprocessor::createResizer() +{ + return Ptr(new ResizerPreprocessor); +} + +Ptr ImagePreprocessor::createImageStandarizer(double sigma) +{ + return Ptr(new StandarizerPreprocessor(sigma)); +} +Ptr ImagePreprocessor::createImageCustomPreprocessor(double rawval,String channel_order) +{ + + return Ptr(new customPreprocessor(rawval,channel_order)); +} + +Ptr ImagePreprocessor::createImageMeanSubtractor(InputArray meanImg) +{ + Mat tmp=meanImg.getMat(); + return Ptr(new MeanSubtractorPreprocessor(tmp)); +} + +//************************************************************************************ 
+//****************** TextImageClassifier ***************************************** +//************************************************************************************ + +void TextImageClassifier::preprocess(const Mat& input,Mat& output) +{ + this->preprocessor_->preprocess_(input,output,this->inputGeometry_,this->channelCount_); +} + +void TextImageClassifier::setPreprocessor(Ptr ptr) +{ + CV_Assert(!ptr.empty()); + preprocessor_=ptr; +} + +Ptr TextImageClassifier::getPreprocessor() +{ + return preprocessor_; +} + + +class DeepCNNCaffeImpl: public DeepCNN{ +protected: + void classifyMiniBatch(std::vector inputImageList, Mat outputMat) + { + //Classifies a list of images containing at most minibatchSz_ images + CV_Assert(int(inputImageList.size())<=this->minibatchSz_); + CV_Assert(outputMat.isContinuous()); + +#ifdef HAVE_CAFFE + net_->input_blobs()[0]->Reshape(inputImageList.size(), this->channelCount_,this->inputGeometry_.height,this->inputGeometry_.width); + net_->Reshape(); + float* inputBuffer=net_->input_blobs()[0]->mutable_cpu_data(); + float* inputData=inputBuffer; + + for(size_t imgNum=0;imgNum input_channels; + Mat preprocessed; + // if the image have multiple color channels the input layer should be populated accordingly + for (int channel=0;channel < this->channelCount_;channel++){ + + cv::Mat netInputWraped(this->inputGeometry_.height, this->inputGeometry_.width, CV_32FC1, inputData); + input_channels.push_back(netInputWraped); + //input_data += width * height; + inputData+=(this->inputGeometry_.height*this->inputGeometry_.width); + } + this->preprocess(inputImageList[imgNum],preprocessed); + split(preprocessed, input_channels); + + } + this->net_->ForwardPrefilled(); + const float* outputNetData=net_->output_blobs()[0]->cpu_data(); + this->outputGeometry_ = Size(net_->output_blobs()[0]->width(),net_->output_blobs()[0]->height()); + int outputSz = this->outputSize_ * this->outputGeometry_.height * this->outputGeometry_.width; + + 
//outputMat.resize(this->outputGeometry_.height * this->outputGeometry_.width); + float*outputMatData=(float*)(outputMat.data); + memcpy(outputMatData,outputNetData,sizeof(float)*outputSz*inputImageList.size()); + +#endif + } + + void process_(Mat inputImage, Mat &outputMat) + { + // do forward pass and stores the output in outputMat + //Process one image + CV_Assert(this->minibatchSz_==1); + //CV_Assert(outputMat.isContinuous()); + +#ifdef HAVE_CAFFE + net_->input_blobs()[0]->Reshape(1, this->channelCount_,this->inputGeometry_.height,this->inputGeometry_.width); + net_->Reshape(); + float* inputBuffer=net_->input_blobs()[0]->mutable_cpu_data(); + float* inputData=inputBuffer; + + std::vector input_channels; + Mat preprocessed; + // if the image have multiple color channels the input layer should be populated accordingly + for (int channel=0;channel < this->channelCount_;channel++){ + + cv::Mat netInputWraped(this->inputGeometry_.height, this->inputGeometry_.width, CV_32FC1, inputData); + input_channels.push_back(netInputWraped); + //input_data += width * height; + inputData+=(this->inputGeometry_.height*this->inputGeometry_.width); + } + this->preprocess(inputImage,preprocessed); + split(preprocessed, input_channels); + + //preprocessed.copyTo(netInputWraped); + + + this->net_->Forward(); + const float* outputNetData=net_->output_blobs()[0]->cpu_data(); + // const float* outputNetData1=net_->output_blobs()[1]->cpu_data(); + + + + + this->outputGeometry_ = Size(net_->output_blobs()[0]->width(),net_->output_blobs()[0]->height()); + int outputSz = this->outputSize_ * this->outputGeometry_.height * this->outputGeometry_.width; + outputMat.create(this->outputGeometry_.height , this->outputGeometry_.width,CV_32FC1); + float*outputMatData=(float*)(outputMat.data); + + memcpy(outputMatData,outputNetData,sizeof(float)*outputSz); + + + +#endif + } + + + +#ifdef HAVE_CAFFE + Ptr > net_; +#endif + //Size inputGeometry_; + int minibatchSz_;//The existence of the assignment 
operator mandates this to be nonconst + int outputSize_; +public: + DeepCNNCaffeImpl(const DeepCNNCaffeImpl& dn): + minibatchSz_(dn.minibatchSz_),outputSize_(dn.outputSize_){ + channelCount_=dn.channelCount_; + inputGeometry_=dn.inputGeometry_; + //Implemented to supress Visual Studio warning "assignment operator could not be generated" +#ifdef HAVE_CAFFE + this->net_=dn.net_; +#endif + } + DeepCNNCaffeImpl& operator=(const DeepCNNCaffeImpl &dn) + { +#ifdef HAVE_CAFFE + this->net_=dn.net_; +#endif + this->setPreprocessor(dn.preprocessor_); + this->inputGeometry_=dn.inputGeometry_; + this->channelCount_=dn.channelCount_; + this->minibatchSz_=dn.minibatchSz_; + this->outputSize_=dn.outputSize_; + this->preprocessor_=dn.preprocessor_; + this->outputGeometry_=dn.outputGeometry_; + return *this; + //Implemented to supress Visual Studio warning "assignment operator could not be generated" + } + + DeepCNNCaffeImpl(String modelArchFilename, String modelWeightsFilename,Ptr preprocessor, int maxMinibatchSz) + :minibatchSz_(maxMinibatchSz) + { + + CV_Assert(this->minibatchSz_>0); + CV_Assert(fileExists(modelArchFilename)); + CV_Assert(fileExists(modelWeightsFilename)); + CV_Assert(!preprocessor.empty()); + this->setPreprocessor(preprocessor); +#ifdef HAVE_CAFFE + this->net_.reset(new caffe::Net(modelArchFilename, caffe::TEST)); + CV_Assert(net_->num_inputs()==1); + CV_Assert(net_->num_outputs()==1); + CV_Assert(this->net_->input_blobs()[0]->channels()==1 + ||this->net_->input_blobs()[0]->channels()==3); + this->channelCount_=this->net_->input_blobs()[0]->channels(); + + + + this->net_->CopyTrainedLayersFrom(modelWeightsFilename); + + caffe::Blob* inputLayer = this->net_->input_blobs()[0]; + + this->inputGeometry_=Size(inputLayer->width(), inputLayer->height()); + this->channelCount_ = inputLayer->channels(); + + inputLayer->Reshape(this->minibatchSz_,this->channelCount_,this->inputGeometry_.height, this->inputGeometry_.width); + net_->Reshape(); + 
this->outputSize_=net_->output_blobs()[0]->channels(); + this->outputGeometry_ = Size(net_->output_blobs()[0]->width(),net_->output_blobs()[0]->height()); + + + + + +#else + CV_Error(Error::StsError,"Caffe not available during compilation!"); +#endif + } + + void classify(InputArray image, OutputArray classProbabilities) + { + std::vector inputImageList; + inputImageList.push_back(image.getMat()); + classifyBatch(inputImageList,classProbabilities); + } + void detect(InputArray image, OutputArray Bbox_prob) + { + + Bbox_prob.create(this->outputGeometry_,CV_32F); // dummy initialization is it needed + Mat outputMat = Bbox_prob.getMat(); + process_(image.getMat(),outputMat); + //copy back to outputArray + outputMat.copyTo(Bbox_prob); + } + + void classifyBatch(InputArrayOfArrays inputImageList, OutputArray classProbabilities) + { + std::vector allImageVector; + inputImageList.getMatVector(allImageVector); + size_t outputSize=size_t(this->outputSize_);//temporary variable to avoid int to size_t arithmentic + + size_t minibatchSize=size_t(this->minibatchSz_);//temporary variable to avoid int to size_t arithmentic + classProbabilities.create(Size(int(outputSize),int(allImageVector.size())),CV_32F); + Mat outputMat = classProbabilities.getMat(); + for(size_t imgNum=0;imgNum(allImageVector.size()-imgNum,minibatchSize); + std::vector::const_iterator from=std::vector::const_iterator(allImageVector.begin()+imgNum); + std::vector::const_iterator to=std::vector::const_iterator(allImageVector.begin()+rangeEnd); + std::vector minibatchInput(from,to); + classifyMiniBatch(minibatchInput,outputMat.rowRange(int(imgNum),int(rangeEnd))); + + } + + } + + int getOutputSize() + { + return this->outputSize_; + } + Size getOutputGeometry() + { + return this->outputGeometry_; + } + + int getMinibatchSize() + { + return this->minibatchSz_; + } + + int getBackend() + { + return OCR_HOLISTIC_BACKEND_CAFFE; + } +}; + + +Ptr DeepCNN::create(String archFilename,String weightsFilename,Ptr 
preprocessor,int minibatchSz,int backEnd) +{ + if(preprocessor.empty()) + { + preprocessor=ImagePreprocessor::createResizer(); + } + switch(backEnd){ + case OCR_HOLISTIC_BACKEND_CAFFE: + return Ptr(new DeepCNNCaffeImpl(archFilename, weightsFilename,preprocessor, minibatchSz)); + break; + case OCR_HOLISTIC_BACKEND_NONE: + default: + CV_Error(Error::StsError,"DeepCNN::create backend not implemented"); + return Ptr(); + break; + } +} + + +Ptr DeepCNN::createDictNet(String archFilename,String weightsFilename,int backEnd) +{ + Ptr preprocessor=ImagePreprocessor::createImageStandarizer(113); + switch(backEnd){ + case OCR_HOLISTIC_BACKEND_CAFFE: + return Ptr(new DeepCNNCaffeImpl(archFilename, weightsFilename,preprocessor, 100)); + break; + case OCR_HOLISTIC_BACKEND_NONE: + default: + CV_Error(Error::StsError,"DeepCNN::create backend not implemented"); + return Ptr(); + break; + } +} + +namespace cnn_config{ +namespace caffe_backend{ + +#ifdef HAVE_CAFFE + +bool getCaffeGpuMode() +{ + return caffe::Caffe::mode()==caffe::Caffe::GPU; +} + +void setCaffeGpuMode(bool useGpu) +{ + if(useGpu) + { + caffe::Caffe::set_mode(caffe::Caffe::GPU); + }else + { + caffe::Caffe::set_mode(caffe::Caffe::CPU); + } +} + +bool getCaffeAvailable() +{ + return true; +} + +#else + +bool getCaffeGpuMode() +{ + CV_Error(Error::StsError,"Caffe not available during compilation!"); + return 0; +} + +void setCaffeGpuMode(bool useGpu) +{ + CV_Error(Error::StsError,"Caffe not available during compilation!"); + CV_Assert(useGpu==1);//Compilation directives force +} + +bool getCaffeAvailable(){ + return 0; +} + +#endif + +}//namespace caffe +}//namespace cnn_config + +class OCRHolisticWordRecognizerImpl: public OCRHolisticWordRecognizer{ +private: + struct NetOutput{ + //Auxiliary structure that handles the logic of getting class ids and probabillities from + //the raw outputs of caffe + int wordIdx; + float probabillity; + + static bool sorter(const NetOutput& o1,const NetOutput& o2) + {//used with 
std::sort to provide the most probable class + return o1.probabillity>o2.probabillity; + } + + static void getOutputs(const float* buffer,int nbOutputs,std::vector& res) + { + res.resize(nbOutputs); + for(int k=0;k tmp; + getOutputs(buffer,nbOutputs,tmp); + classNum=tmp[0].wordIdx; + confidence=tmp[0].probabillity; + } + }; +protected: + std::vector labels_; + Ptr classifier_; +public: + OCRHolisticWordRecognizerImpl(Ptr classifierPtr,String vocabularyFilename):classifier_(classifierPtr) + { + CV_Assert(fileExists(vocabularyFilename));//this fails for some rason + std::ifstream labelsFile(vocabularyFilename.c_str()); + if(!labelsFile) + { + CV_Error(Error::StsError,"Could not read Labels from file"); + } + std::string line; + while (std::getline(labelsFile, line)) + { + labels_.push_back(std::string(line)); + } + CV_Assert(this->classifier_->getOutputSize()==int(this->labels_.size())); + } + + OCRHolisticWordRecognizerImpl(Ptr classifierPtr,const std::vector& vocabulary):classifier_(classifierPtr) + { + this->labels_=vocabulary; + CV_Assert(this->classifier_->getOutputSize()==int(this->labels_.size())); + } + + void recogniseImage(InputArray inputImage,CV_OUT String& transcription,CV_OUT double& confidence) + { + Mat netOutput; + this->classifier_->classify(inputImage,netOutput); + int classNum; + NetOutput::getClassification((float*)(netOutput.data),this->classifier_->getOutputSize(),classNum,confidence); + transcription=this->labels_[classNum]; + } + + void recogniseImageBatch(InputArrayOfArrays inputImageList,CV_OUT std::vector& transcriptionVec,CV_OUT std::vector& confidenceVec) + { + Mat netOutput; + this->classifier_->classifyBatch(inputImageList,netOutput); + for(int k=0;kclassifier_->getOutputSize(),classNum,confidence); + transcriptionVec.push_back(this->labels_[classNum]); + confidenceVec.push_back(confidence); + } + } + + + void run(Mat& image, std::string& output_text, std::vector* component_rects=NULL, + std::vector* component_texts=NULL, std::vector* 
component_confidences=NULL, + int component_level=0) + { + CV_Assert(component_level==OCR_LEVEL_WORD);//Componnents not applicable for word spotting + double confidence; + String transcription; + recogniseImage(image,transcription,confidence); + output_text=transcription.c_str(); + if(component_rects!=NULL) + { + component_rects->resize(1); + (*component_rects)[0]=Rect(0,0,image.size().width,image.size().height); + } + if(component_texts!=NULL) + { + component_texts->resize(1); + (*component_texts)[0]=transcription.c_str(); + } + if(component_confidences!=NULL) + { + component_confidences->resize(1); + (*component_confidences)[0]=float(confidence); + } + } + + void run(Mat& image, Mat& mask, std::string& output_text, std::vector* component_rects=NULL, + std::vector* component_texts=NULL, std::vector* component_confidences=NULL, + int component_level=0) + { + CV_Assert(mask.cols==image.cols && mask.rows== image.rows);//Mask is ignored because the CNN operates on a full image + this->run(image,output_text,component_rects,component_texts,component_confidences,component_level); + } + + std::vector& getVocabulary() + { + return this->labels_; + } + + Ptr getClassifier() + { + return this->classifier_; + } +}; + +Ptr OCRHolisticWordRecognizer::create(Ptr classifierPtr,String vocabularyFilename ) +{ + return Ptr(new OCRHolisticWordRecognizerImpl(classifierPtr,vocabularyFilename)); +} + +Ptr OCRHolisticWordRecognizer::create(String modelArchFilename, String modelWeightsFilename, String vocabularyFilename) +{ + Ptr preprocessor=ImagePreprocessor::createImageStandarizer(113); + Ptr classifierPtr(new DeepCNNCaffeImpl(modelArchFilename,modelWeightsFilename,preprocessor,100)); + return Ptr(new OCRHolisticWordRecognizerImpl(classifierPtr,vocabularyFilename)); +} + +Ptr OCRHolisticWordRecognizer::create(Ptr classifierPtr,const std::vector& vocabulary) +{ + return Ptr(new OCRHolisticWordRecognizerImpl(classifierPtr,vocabulary)); +} + +Ptr OCRHolisticWordRecognizer::create(String 
modelArchFilename, String modelWeightsFilename,const std::vector& vocabulary){ + Ptr preprocessor=ImagePreprocessor::createImageStandarizer(113); + Ptr classifierPtr(new DeepCNNCaffeImpl(modelArchFilename,modelWeightsFilename,preprocessor,100)); + return Ptr(new OCRHolisticWordRecognizerImpl(classifierPtr,vocabulary)); +} + + + + + +} } //namespace text namespace cv diff --git a/modules/text/src/text_detector.cpp b/modules/text/src/text_detector.cpp new file mode 100644 index 000000000..8f224a70f --- /dev/null +++ b/modules/text/src/text_detector.cpp @@ -0,0 +1,643 @@ +#include "precomp.hpp" +#include "opencv2/imgproc.hpp" +#include "opencv2/core.hpp" + + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#ifdef HAVE_CAFFE +#include "caffe/caffe.hpp" +#endif + +namespace cv { namespace text { + +//Maybe OpenCV has a routine better suited +//inline bool fileExists (String filename) { +// std::ifstream f(filename.c_str()); +// return f.good(); +//} + +//************************************************************************************ +//****************** ImagePreprocessor ******************************************* +//************************************************************************************ + +/*void ImagePreprocessor::preprocess(InputArray input,OutputArray output,Size sz,int outputChannels){ + Mat inpImg=input.getMat(); + Mat outImg; + this->preprocess_(inpImg,outImg,sz,outputChannels); + outImg.copyTo(output); +}*/ + + +/*class ResizerPreprocessor: public ImagePreprocessor{ +protected: + void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){ + //TODO put all the logic of channel and depth conversions in ImageProcessor class + CV_Assert(outputChannels==1 || outputChannels==3); + CV_Assert(input.channels()==1 || input.channels()==3); + if(input.channels()!=outputChannels) + { + Mat tmpInput; + if(outputChannels==1){ + cvtColor(input,tmpInput,COLOR_BGR2GRAY); + 
if(input.depth()==CV_8U) + { + tmpInput.convertTo(output,CV_32FC1,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + tmpInput.convertTo(output, CV_32FC1); + } + }else + { + cvtColor(input,tmpInput,COLOR_GRAY2BGR); + if(input.depth()==CV_8U) + { + tmpInput.convertTo(output,CV_32FC3,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + tmpInput.convertTo(output, CV_32FC3); + } + } + }else + { + if(input.channels()==1) + { + if(input.depth()==CV_8U) + { + input.convertTo(output, CV_32FC1,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + input.convertTo(output, CV_32FC1); + } + }else + { + if(input.depth()==CV_8U){ + input.convertTo(output, CV_32FC3,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + input.convertTo(output, CV_32FC3); + } + } + } + if(outputSize.width!=0 && outputSize.height!=0) + { + resize(output,output,outputSize); + } + } +public: + ResizerPreprocessor(){} + ~ResizerPreprocessor(){} +}; + +class StandarizerPreprocessor: public ImagePreprocessor{ +protected: + double sigma_; + void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){ + //TODO put all the logic of channel and depth conversions in ImageProcessor class + CV_Assert(outputChannels==1 || outputChannels==3); + CV_Assert(input.channels()==1 || input.channels()==3); + if(input.channels()!=outputChannels) + { + Mat tmpInput; + if(outputChannels==1) + { + cvtColor(input,tmpInput,COLOR_BGR2GRAY); + if(input.depth()==CV_8U) + { + tmpInput.convertTo(output,CV_32FC1,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + tmpInput.convertTo(output, CV_32FC1); + } + }else + { + cvtColor(input,tmpInput,COLOR_GRAY2BGR); + if(input.depth()==CV_8U) + { + tmpInput.convertTo(output,CV_32FC3,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + tmpInput.convertTo(output, CV_32FC3); + } + } + }else + { + if(input.channels()==1) + { + if(input.depth()==CV_8U) + { + 
input.convertTo(output, CV_32FC1,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + input.convertTo(output, CV_32FC1); + } + }else + { + if(input.depth()==CV_8U) + { + input.convertTo(output, CV_32FC3,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + input.convertTo(output, CV_32FC3); + } + } + } + if(outputSize.width!=0 && outputSize.height!=0) + { + resize(output,output,outputSize); + } + Scalar dev,mean; + meanStdDev(output,mean,dev); + subtract(output,mean[0],output); + divide(output,(dev[0]/sigma_),output); + } +public: + StandarizerPreprocessor(double sigma):sigma_(sigma){} + ~StandarizerPreprocessor(){} +}; + +class MeanSubtractorPreprocessor: public ImagePreprocessor{ +protected: + Mat mean_; + void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){ + //TODO put all the logic of channel and depth conversions in ImageProcessor class + CV_Assert(this->mean_.cols==outputSize.width && this->mean_.rows ==outputSize.height); + CV_Assert(outputChannels==1 || outputChannels==3); + CV_Assert(input.channels()==1 || input.channels()==3); + if(input.channels()!=outputChannels) + { + Mat tmpInput; + if(outputChannels==1) + { + cvtColor(input,tmpInput,COLOR_BGR2GRAY); + if(input.depth()==CV_8U) + { + tmpInput.convertTo(output,CV_32FC1,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + tmpInput.convertTo(output, CV_32FC1); + } + }else + { + cvtColor(input,tmpInput,COLOR_GRAY2BGR); + if(input.depth()==CV_8U) + { + tmpInput.convertTo(output,CV_32FC3,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + tmpInput.convertTo(output, CV_32FC3); + } + } + }else + { + if(input.channels()==1) + { + if(input.depth()==CV_8U) + { + input.convertTo(output, CV_32FC1,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + input.convertTo(output, CV_32FC1); + } + }else + { + if(input.depth()==CV_8U) + { + input.convertTo(output, CV_32FC3,1/255.0); + }else + 
{//Assuming values are at the desired [0,1] range + input.convertTo(output, CV_32FC3); + } + } + } + if(outputSize.width!=0 && outputSize.height!=0) + { + resize(output,output,outputSize); + } + subtract(output,this->mean_,output); + } +public: + MeanSubtractorPreprocessor(Mat mean) + { + mean.copyTo(this->mean_); + } + + ~MeanSubtractorPreprocessor(){} +}; + + +Ptr ImagePreprocessor::createResizer() +{ + return Ptr(new ResizerPreprocessor); +} + +Ptr ImagePreprocessor::createImageStandarizer(double sigma) +{ + return Ptr(new StandarizerPreprocessor(sigma)); +} + +Ptr ImagePreprocessor::createImageMeanSubtractor(InputArray meanImg) +{ + Mat tmp=meanImg.getMat(); + return Ptr(new MeanSubtractorPreprocessor(tmp)); +} + +//************************************************************************************ +//****************** TextImageClassifier ***************************************** +//************************************************************************************ + +void TextImageClassifier::preprocess(const Mat& input,Mat& output) +{ + this->preprocessor_->preprocess_(input,output,this->inputGeometry_,this->channelCount_); +} + +void TextImageClassifier::setPreprocessor(Ptr ptr) +{ + CV_Assert(!ptr.empty()); + preprocessor_=ptr; +} + +Ptr TextImageClassifier::getPreprocessor() +{ + return preprocessor_; +}*/ + +/* +class DeepCNNCaffeImpl: public DeepCNN{ +protected: + void classifyMiniBatch(std::vector inputImageList, Mat outputMat) + { + //Classifies a list of images containing at most minibatchSz_ images + CV_Assert(int(inputImageList.size())<=this->minibatchSz_); + CV_Assert(outputMat.isContinuous()); +#ifdef HAVE_CAFFE + net_->input_blobs()[0]->Reshape(inputImageList.size(), 1,this->inputGeometry_.height,this->inputGeometry_.width); + net_->Reshape(); + float* inputBuffer=net_->input_blobs()[0]->mutable_cpu_data(); + float* inputData=inputBuffer; + for(size_t imgNum=0;imgNuminputGeometry_.height, this->inputGeometry_.width, CV_32FC1, inputData); + 
this->preprocess(inputImageList[imgNum],preprocessed); + preprocessed.copyTo(netInputWraped); + inputData+=(this->inputGeometry_.height*this->inputGeometry_.width); + } + this->net_->ForwardPrefilled(); + const float* outputNetData=net_->output_blobs()[0]->cpu_data(); + float*outputMatData=(float*)(outputMat.data); + memcpy(outputMatData,outputNetData,sizeof(float)*this->outputSize_*inputImageList.size()); +#endif + } + +#ifdef HAVE_CAFFE + Ptr > net_; +#endif + //Size inputGeometry_; + int minibatchSz_;//The existence of the assignment operator mandates this to be nonconst + int outputSize_; +public: + DeepCNNCaffeImpl(const DeepCNNCaffeImpl& dn): + minibatchSz_(dn.minibatchSz_),outputSize_(dn.outputSize_){ + channelCount_=dn.channelCount_; + inputGeometry_=dn.inputGeometry_; + //Implemented to supress Visual Studio warning "assignment operator could not be generated" +#ifdef HAVE_CAFFE + this->net_=dn.net_; +#endif + } + DeepCNNCaffeImpl& operator=(const DeepCNNCaffeImpl &dn) + { +#ifdef HAVE_CAFFE + this->net_=dn.net_; +#endif + this->setPreprocessor(dn.preprocessor_); + this->inputGeometry_=dn.inputGeometry_; + this->channelCount_=dn.channelCount_; + this->minibatchSz_=dn.minibatchSz_; + this->outputSize_=dn.outputSize_; + this->preprocessor_=dn.preprocessor_; + return *this; + //Implemented to supress Visual Studio warning "assignment operator could not be generated" + } + + DeepCNNCaffeImpl(String modelArchFilename, String modelWeightsFilename,Ptr preprocessor, int maxMinibatchSz) + :minibatchSz_(maxMinibatchSz) + { + CV_Assert(this->minibatchSz_>0); + CV_Assert(fileExists(modelArchFilename)); + CV_Assert(fileExists(modelWeightsFilename)); + CV_Assert(!preprocessor.empty()); + this->setPreprocessor(preprocessor); +#ifdef HAVE_CAFFE + this->net_.reset(new caffe::Net(modelArchFilename, caffe::TEST)); + CV_Assert(net_->num_inputs()==1); + CV_Assert(net_->num_outputs()==1); + CV_Assert(this->net_->input_blobs()[0]->channels()==1 + 
||this->net_->input_blobs()[0]->channels()==3); + this->channelCount_=this->net_->input_blobs()[0]->channels(); + this->net_->CopyTrainedLayersFrom(modelWeightsFilename); + caffe::Blob* inputLayer = this->net_->input_blobs()[0]; + this->inputGeometry_=Size(inputLayer->width(), inputLayer->height()); + inputLayer->Reshape(this->minibatchSz_,1,this->inputGeometry_.height, this->inputGeometry_.width); + net_->Reshape(); + this->outputSize_=net_->output_blobs()[0]->channels(); + +#else + CV_Error(Error::StsError,"Caffe not available during compilation!"); +#endif + } + + void classify(InputArray image, OutputArray classProbabilities) + { + std::vector inputImageList; + inputImageList.push_back(image.getMat()); + classifyBatch(inputImageList,classProbabilities); + } + + void classifyBatch(InputArrayOfArrays inputImageList, OutputArray classProbabilities) + { + std::vector allImageVector; + inputImageList.getMatVector(allImageVector); + size_t outputSize=size_t(this->outputSize_);//temporary variable to avoid int to size_t arithmentic + size_t minibatchSize=size_t(this->minibatchSz_);//temporary variable to avoid int to size_t arithmentic + classProbabilities.create(Size(int(outputSize),int(allImageVector.size())),CV_32F); + Mat outputMat = classProbabilities.getMat(); + for(size_t imgNum=0;imgNum(allImageVector.size()-imgNum,minibatchSize); + std::vector::const_iterator from=std::vector::const_iterator(allImageVector.begin()+imgNum); + std::vector::const_iterator to=std::vector::const_iterator(allImageVector.begin()+rangeEnd); + std::vector minibatchInput(from,to); + classifyMiniBatch(minibatchInput,outputMat.rowRange(int(imgNum),int(rangeEnd))); + } + } + + int getOutputSize() + { + return this->outputSize_; + } + + int getMinibatchSize() + { + return this->minibatchSz_; + } + + int getBackend() + { + return OCR_HOLISTIC_BACKEND_CAFFE; + } +}; + + +Ptr DeepCNN::create(String archFilename,String weightsFilename,Ptr preprocessor,int minibatchSz,int backEnd) +{ + 
if(preprocessor.empty()) + { + preprocessor=ImagePreprocessor::createResizer(); + } + switch(backEnd){ + case OCR_HOLISTIC_BACKEND_CAFFE: + return Ptr(new DeepCNNCaffeImpl(archFilename, weightsFilename,preprocessor, minibatchSz)); + break; + case OCR_HOLISTIC_BACKEND_NONE: + default: + CV_Error(Error::StsError,"DeepCNN::create backend not implemented"); + return Ptr(); + break; + } +} + + +Ptr DeepCNN::createDictNet(String archFilename,String weightsFilename,int backEnd) +{ + Ptr preprocessor=ImagePreprocessor::createImageStandarizer(113); + switch(backEnd){ + case OCR_HOLISTIC_BACKEND_CAFFE: + return Ptr(new DeepCNNCaffeImpl(archFilename, weightsFilename,preprocessor, 100)); + break; + case OCR_HOLISTIC_BACKEND_NONE: + default: + CV_Error(Error::StsError,"DeepCNN::create backend not implemented"); + return Ptr(); + break; + } +} + +namespace cnn_config{ +namespace caffe_backend{ + +#ifdef HAVE_CAFFE + +bool getCaffeGpuMode() +{ + return caffe::Caffe::mode()==caffe::Caffe::GPU; +} + +void setCaffeGpuMode(bool useGpu) +{ + if(useGpu) + { + caffe::Caffe::set_mode(caffe::Caffe::GPU); + }else + { + caffe::Caffe::set_mode(caffe::Caffe::CPU); + } +} + +bool getCaffeAvailable() +{ + return true; +} + +#else + +bool getCaffeGpuMode() +{ + CV_Error(Error::StsError,"Caffe not available during compilation!"); + return 0; +} + +void setCaffeGpuMode(bool useGpu) +{ + CV_Error(Error::StsError,"Caffe not available during compilation!"); + CV_Assert(useGpu==1);//Compilation directives force +} + +bool getCaffeAvailable(){ + return 0; +} + +#endif + +}//namespace caffe +}//namespace cnn_config +*/ + +class textDetectImpl: public textDetector{ +private: + struct NetOutput{ + //Auxiliary structure that handles the logic of getting bounding box and confidences of textness from + //the raw outputs of caffe + Rect bbox; + float probability; + +// static bool sorter(const NetOutput& o1,const NetOutput& o2) +// {//used with std::sort to provide the most probable class +// return 
o1.probabillity>o2.probabillity; +// } + + static void getOutputs(const float* buffer,int nbrTextBoxes,int nCol,std::vector& res,Size inputShape) + { + + res.resize(nbrTextBoxes); + for(int k=0;k inputShape.width?inputShape.width-1:x_max; + y_max = y_max > inputShape.height?inputShape.height-1:y_max; + float wd = x_max-x_min+1; + float ht = y_max-y_min+1; + + res[k].bbox=Rect(int(x_min),int(y_min),int(wd),int(ht)); + // printf("%f %f %f %f\n",buffer[k*nCol+3],buffer[k*nCol+4],buffer[k*nCol+5],buffer[k*nCol+6]); + res[k].probability=buffer[k*nCol+2]; + } +// std::sort(res.begin(),res.end(),NetOutput::sorter); + } + +// static void getDetections(const float* buffer,int nbOutputs,int &classNum,double& confidence) +// { +// std::vector tmp; +// getOutputs(buffer,nbOutputs,tmp); +// classNum=tmp[0].wordIdx; +// confidence=tmp[0].probabillity; +// } + }; +protected: + //std::vector labels_; + Ptr classifier_; +public: + textDetectImpl(Ptr classifierPtr):classifier_(classifierPtr) + { + + } + + + + void textDetectInImage(InputArray inputImage,CV_OUT std::vector& Bbox,CV_OUT std::vector& confidence) + { + Mat netOutput; + //std::cout<<"started detect"<classifier_->detect(inputImage,netOutput); + //std::cout<<"After Detect"<classifier_->getOutputGeometry(); + int nbrTextBoxes = OutputGeometry_.height; + int nCol = OutputGeometry_.width; + //std::cout< tmp; + Size inputImageShape = Size(inputImage.cols(),inputImage.rows()); + NetOutput::getOutputs((float*)(netOutput.data),nbrTextBoxes,nCol,tmp,inputImageShape); + //Bbox.resize(nbrTextBoxes); + //confidence.resize(nbrTextBoxes); + for (int k=0;k* component_rects=NULL, + std::vector* component_confidences=NULL, + int component_level=0) + { + CV_Assert(component_level==OCR_LEVEL_WORD);//Componnents not applicable for word spotting + //double confidence; + //String transcription; + std::vector bbox; + std::vector score; + textDetectInImage(image,bbox,score); + //output_text=transcription.c_str(); + if(component_rects!=NULL) + { 
+ // Copy the detected boxes into the caller's vector. Assigning the address of the local vector to the pointer parameter would only rebind the local pointer, and the caller would never see the boxes. + + *component_rects = bbox; + } + + if(component_confidences!=NULL) + { + // Same for the scores: copy the values out through the pointer instead of rebinding it to a local. + + *component_confidences = score; + } + } + + void run(Mat& image, Mat& mask, std::vector* component_rects=NULL, + std::vector* component_confidences=NULL, + int component_level=0) + { + CV_Assert(mask.cols==image.cols && mask.rows== image.rows);//Mask is ignored because the CNN operates on a full image + this->run(image,component_rects,component_confidences,component_level); + } + +// std::vector& getVocabulary() +// { +// return this->labels_; +// } + + Ptr getClassifier() + { + return this->classifier_; + } +}; + +Ptr textDetector::create(Ptr classifierPtr) +{ + return Ptr(new textDetectImpl(classifierPtr)); +} + +Ptr textDetector::create(String modelArchFilename, String modelWeightsFilename) +{ + + + Ptr preprocessor=ImagePreprocessor::createImageCustomPreprocessor(255); + + Mat textbox_mean(1,3,CV_8U); + textbox_mean.at(0,0)=104; + textbox_mean.at(0,1)=117; + textbox_mean.at(0,2)=123; + preprocessor->set_mean(textbox_mean); + + Ptr classifierPtr(DeepCNN::create(modelArchFilename,modelWeightsFilename,preprocessor,1)); + return Ptr(new textDetectImpl(classifierPtr)); +} + + + + + + + +} } //namespace text namespace cv diff --git a/modules/text/text_config.hpp.in b/modules/text/text_config.hpp.in index 30089bd3c..71b32993a 100644 --- a/modules/text/text_config.hpp.in +++ b/modules/text/text_config.hpp.in @@ -1,7 +1,13 @@ #ifndef __OPENCV_TEXT_CONFIG_HPP__ #define __OPENCV_TEXT_CONFIG_HPP__ +// HAVE QT5 +//#cmakedefine HAVE_QT5GUI + +// HAVE CAFFE +//#cmakedefine HAVE_CAFFE + // HAVE OCR Tesseract -#cmakedefine HAVE_TESSERACT +//#cmakedefine HAVE_TESSERACT -#endif \ No newline at end of file +#endif