Text detector class and Custom Image processor Class

pull/1285/head
sghoshcvc 8 years ago
parent fa94c16065
commit 9ae765a197
  1. 85 modules/text/CMakeLists.txt
  2. 14 modules/text/FindCaffe.cmake
  3. 10 modules/text/FindGlog.cmake
  4. 10 modules/text/FindProtobuf.cmake
  5. 24 modules/text/FindTesseract.cmake
  6. 72 modules/text/README.md
  7. 3 modules/text/include/opencv2/text.hpp
  8. 849 modules/text/include/opencv2/text/ocr.hpp
  9. 235 modules/text/include/opencv2/text/textDetector.hpp
  10. 879 modules/text/src/ocr_holistic.cpp
  11. 643 modules/text/src/text_detector.cpp
  12. 10 modules/text/text_config.hpp.in

@@ -1,24 +1,71 @@
set(the_description "Text Detection and Recognition")
ocv_define_module(text opencv_ml opencv_imgproc opencv_core opencv_features2d OPTIONAL opencv_highgui WRAP python)
if(NOT CMAKE_CROSSCOMPILING OR OPENCV_FIND_TESSERACT)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
find_package(Tesseract QUIET)
if(Tesseract_FOUND)
message(STATUS "Tesseract: YES")
set(HAVE_TESSERACT 1)
ocv_include_directories(${Tesseract_INCLUDE_DIR})
ocv_target_link_libraries(${the_module} ${Tesseract_LIBRARIES})
else()
message(STATUS "Tesseract: NO")
endif()
# Using cmake scripts and modules
list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR})
set(TEXT_DEPS opencv_ml opencv_highgui opencv_imgproc opencv_core opencv_features2d opencv_calib3d)
find_package(Caffe)
if(Caffe_FOUND)
message(STATUS "Caffe: YES")
set(HAVE_CAFFE 1)
else()
message(STATUS "Caffe: NO")
# list(APPEND TEXT_DEPS opencv_dnn)
endif()
#internal dependencies
find_package(Protobuf)
if(Protobuf_FOUND)
message(STATUS "Protobuf: YES")
set(HAVE_PROTOBUF 1)
else()
message(STATUS "Protobuf: NO")
endif()
find_package(Glog)
if(Glog_FOUND)
message(STATUS "Glog: YES")
set(HAVE_GLOG 1)
else()
message(STATUS "Glog: NO")
endif()
ocv_define_module(text opencv_ml opencv_imgproc opencv_core opencv_features2d opencv_calib3d WRAP python)
#ocv_define_module(text ${TEXT_DEPS} WRAP python)
#set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR})
find_package(Tesseract)
if(${Tesseract_FOUND})
message(STATUS "Tesseract: YES")
include_directories(${Tesseract_INCLUDE_DIR})
target_link_libraries(opencv_text ${Tesseract_LIBS})
add_definitions(-DHAVE_TESSERACT)
else()
message(STATUS "Tesseract: NO")
endif()
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/text_config.hpp.in
${CMAKE_BINARY_DIR}/text_config.hpp @ONLY)
ocv_include_directories(${CMAKE_CURRENT_BINARY_DIR})
ocv_add_testdata(samples/ contrib/text
FILES_MATCHING PATTERN "*.xml" PATTERN "*.xml.gz" REGEX "scenetext[0-9]+.jpg"
)
if(HAVE_CAFFE AND HAVE_GLOG AND HAVE_PROTOBUF)
include_directories(${Caffe_INCLUDE_DIR})
find_package(HDF5 COMPONENTS HL REQUIRED)
include_directories(SYSTEM ${HDF5_INCLUDE_DIRS} ${HDF5_HL_INCLUDE_DIR})
list(APPEND Caffe_LINKER_LIBS ${HDF5_LIBRARIES})
find_package(Boost 1.46 REQUIRED COMPONENTS system thread filesystem)
include_directories(SYSTEM ${Boost_INCLUDE_DIR})
include_directories(SYSTEM /usr/local/cuda-8.0/targets/x86_64-linux/include/ /usr/local/cuda-8.0/include/ /usr/local/cuda-7.5/targets/x86_64-linux/include/ )
link_directories(SYSTEM /usr/local/cuda-8.0/targets/x86_64-linux/lib/ /usr/local/cuda-8.0/lib/ /usr/local/cuda-7.5/targets/x86_64-linux/lib/ /usr/lib/openblas-base/lib /usr/local/cuda-8.0/lib64)
list(APPEND Caffe_LINKER_LIBS ${Boost_LIBRARIES})
target_link_libraries(opencv_text atlas blas ${Caffe_LIBS} ${Glog_LIBS} ${Protobuf_LIBS} ${HDF5_LIBRARIES} ${Boost_LIBRARIES})
add_definitions(-DHAVE_CAFFE)
endif() #HAVE_CAFFE
message(STATUS "TEXT CAFFE SEARCH")
if()
message(STATUS "TEXT NO CAFFE CONFLICT")
else()
message(STATUS "TEXT CAFFE CONFLICT")
endif()

@@ -0,0 +1,14 @@
# Caffe package for CNN Triplet training
unset(Caffe_FOUND)
find_path(Caffe_INCLUDE_DIR NAMES caffe/caffe.hpp caffe/common.hpp caffe/net.hpp caffe/proto/caffe.pb.h caffe/util/io.hpp caffe/vision_layers.hpp
HINTS
/usr/local/include)
find_library(Caffe_LIBS NAMES caffe
HINTS
/usr/local/lib)
if(Caffe_LIBS AND Caffe_INCLUDE_DIR)
set(Caffe_FOUND 1)
endif()

@@ -0,0 +1,10 @@
#Required for Caffe
unset(Glog_FOUND)
find_library(Glog_LIBS NAMES glog
HINTS
/usr/local/lib)
if(Glog_LIBS)
set(Glog_FOUND 1)
endif()

@@ -0,0 +1,10 @@
#Protobuf package required for Caffe
unset(Protobuf_FOUND)
find_library(Protobuf_LIBS NAMES protobuf
HINTS
/usr/local/lib)
if(Protobuf_LIBS)
set(Protobuf_FOUND 1)
endif()

@@ -0,0 +1,24 @@
# Tesseract OCR
unset(Tesseract_FOUND)
find_path(Tesseract_INCLUDE_DIR tesseract/baseapi.h
HINTS
/usr/include
/usr/local/include)
find_library(Tesseract_LIBRARY NAMES tesseract
HINTS
/usr/lib
/usr/local/lib)
find_library(Lept_LIBRARY NAMES lept
HINTS
/usr/lib
/usr/local/lib)
set(Tesseract_LIBS ${Tesseract_LIBRARY} ${Lept_LIBRARY})
if(Tesseract_LIBS AND Tesseract_INCLUDE_DIR)
set(Tesseract_FOUND 1)
endif()

@@ -47,3 +47,75 @@ Notes
2. Tesseract configure script may fail to detect leptonica, so you may have to edit the configure script - comment out some if's around this message and retain only the "then" branch.
3. You are encouraged to search the Net for some better pre-trained classifiers, as well as classifiers for other languages.
Word spotting CNN
=================
Intro
-----
A word spotting CNN is a CNN that takes an image assumed to contain a single word and provides a probability distribution over a given vocabulary.
Although other backends are planned, for the moment only the Caffe backend is supported.
Installation of Caffe backend
-----------------------------
The Caffe wrapping backend has the same requirements as Caffe itself.
* Caffe can be built against OpenCV; if the Caffe backend is enabled, a circular dependency arises.
The simplest solution is to build Caffe without OpenCV support.
* Only the operating systems supported by Caffe are supported by the backend.
The scripts describing the module have been developed on Ubuntu 16.04 and assume such a system.
Other UNIX systems, including OSX, should be easy to adapt.
Sample script for building Caffe:
```bash
#!/bin/bash
SRCROOT="${HOME}/caffe_inst/"
mkdir -p "$SRCROOT"
cd "$SRCROOT"
git clone https://github.com/BVLC/caffe.git
cd caffe
git checkout 91b09280f5233cafc62954c98ce8bc4c204e7475
git branch 91b09280f5233cafc62954c98ce8bc4c204e7475
cat Makefile.config.example > Makefile.config
echo 'USE_OPENCV := 0' >> Makefile.config
echo 'INCLUDE_DIRS += /usr/include/hdf5/serial/' >> Makefile.config
echo 'LIBRARY_DIRS += /usr/lib/x86_64-linux-gnu/hdf5/serial/' >> Makefile.config
echo "--- /tmp/caffe/include/caffe/net.hpp 2017-05-28 04:55:47.929623902 +0200
+++ caffe/distribute/include/caffe/net.hpp 2017-05-28 04:51:33.437090768 +0200
@@ -234,6 +234,7 @@
template <typename T>
friend class Net;
+ virtual ~Callback(){}
};
const vector<Callback*>& before_forward() const { return before_forward_; }
void add_before_forward(Callback* value) {
">/tmp/cleanup_caffe.diff
patch < /tmp/cleanup_caffe.diff
make -j 6
make pycaffe
make distribute
```
```bash
#!/bin/bash
cd $OPENCV_BUILD_DIR #You must set this
CAFFEROOT="${HOME}/caffe_inst/" #If you used the previous code to compile Caffe in ubuntu 16.04
cmake -DCaffe_LIBS:FILEPATH="$CAFFEROOT/caffe/distribute/lib/libcaffe.so" -DBUILD_opencv_ts:BOOL="0" -DBUILD_opencv_dnn:BOOL="0" -DBUILD_opencv_dnn_modern:BOOL="0" -DCaffe_INCLUDE_DIR:PATH="$CAFFEROOT/caffe/distribute/include" -DWITH_MATLAB:BOOL="0" -DBUILD_opencv_cudabgsegm:BOOL="0" -DWITH_QT:BOOL="1" -DBUILD_opencv_cudaoptflow:BOOL="0" -DBUILD_opencv_cudastereo:BOOL="0" -DBUILD_opencv_cudafilters:BOOL="0" -DBUILD_opencv_cudev:BOOL="1" -DOPENCV_EXTRA_MODULES_PATH:PATH="/home/anguelos/work/projects/opencv_gsoc/opencv_contrib/modules" ./
```
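For completeness, a minimal word spotting program in C++ could look like the sketch below. It assumes the module was built with the Caffe backend as described above, and that the DictNet files referenced in the module documentation have been downloaded; all file names here are placeholders.
```cpp
#include <opencv2/text.hpp>
#include <opencv2/imgcodecs.hpp>
#include <iostream>

int main()
{
    // Placeholder file names: the DictNet deploy prototxt, weights and labels.
    cv::Ptr<cv::text::OCRHolisticWordRecognizer> wordSpotter =
        cv::text::OCRHolisticWordRecognizer::create(
            "dictnet_vgg_deploy.prototxt",
            "dictnet_vgg.caffemodel",
            "dictnet_vgg_labels.txt");

    cv::Mat wordImage = cv::imread("scenetext_word.jpg"); // hypothetical word crop
    cv::String transcription;
    double confidence;
    wordSpotter->recogniseImage(wordImage, transcription, confidence);
    std::cout << transcription << " (" << confidence << ")" << std::endl;
    return 0;
}
```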

@@ -41,6 +41,7 @@ the use of this software, even if advised of the possibility of such damage.
#include "opencv2/text/erfilter.hpp"
#include "opencv2/text/ocr.hpp"
#include "opencv2/text/textDetector.hpp"
/** @defgroup text Scene Text Detection and Recognition
@@ -92,7 +93,7 @@ grouping horizontally aligned text, and the method proposed by Lluis Gomez and D
in [Gomez13][Gomez14] for grouping arbitrary oriented text (see erGrouping).
To see the text detector at work, have a look at the textdetection demo:
<https://github.com/opencv/opencv_contrib/blob/master/modules/text/samples/textdetection.cpp>
<https://github.com/Itseez/opencv_contrib/blob/master/modules/text/samples/textdetection.cpp>
@defgroup text_recognize Scene Text Recognition
@}

@@ -46,6 +46,10 @@
#include <vector>
#include <string>
#include <iostream>
#include <sstream>
namespace cv
{
@@ -61,82 +65,126 @@ enum
OCR_LEVEL_TEXTLINE
};
//base class BaseOCR declares a common API that would be used in a typical text recognition scenario
//base class BaseOCR declares a common API that would be used in a typical text
//recognition scenario
class CV_EXPORTS_W BaseOCR
{
public:
public:
virtual ~BaseOCR() {};
virtual void run(Mat& image, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
virtual void run(Mat& image, std::string& output_text,
std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL,
std::vector<float>* component_confidences=NULL,
int component_level=0) = 0;
virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
virtual void run(Mat& image, Mat& mask, std::string& output_text,
std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL,
std::vector<float>* component_confidences=NULL,
int component_level=0) = 0;
/** @brief Main functionality of the OCR Hierarchy. Subclasses provide
* default parameters for all parameters other than the input image.
*/
virtual String run(InputArray image){
std::string res;
std::vector<Rect> component_rects;
std::vector<float> component_confidences;
std::vector<std::string> component_texts;
Mat inputImage=image.getMat();
this->run(inputImage,res,&component_rects,&component_texts,
&component_confidences,OCR_LEVEL_WORD);
return res;
}
};
/** @brief OCRTesseract class provides an interface with the tesseract-ocr API (v3.02.02) in C++.
/** @brief OCRTesseract class provides an interface with the tesseract-ocr API
* (v3.02.02) in C++.
Notice that it is compiled only when tesseract-ocr is correctly installed.
@note
- (C++) An example of OCRTesseract recognition combined with scene text detection can be found
at the end_to_end_recognition demo:
<https://github.com/opencv/opencv_contrib/blob/master/modules/text/samples/end_to_end_recognition.cpp>
- (C++) Another example of OCRTesseract recognition combined with scene text detection can be
found at the webcam_demo:
<https://github.com/opencv/opencv_contrib/blob/master/modules/text/samples/webcam_demo.cpp>
- (C++) An example of OCRTesseract recognition combined with scene text
detection can be found at the end_to_end_recognition demo:
<https://github.com/Itseez/opencv_contrib/blob/master/modules/text/samples/end_to_end_recognition.cpp>
- (C++) Another example of OCRTesseract recognition combined with scene
text detection can be found at the webcam_demo:
<https://github.com/Itseez/opencv_contrib/blob/master/modules/text/samples/webcam_demo.cpp>
*/
class CV_EXPORTS_W OCRTesseract : public BaseOCR
{
public:
/** @brief Recognize text using the tesseract-ocr API.
Takes image on input and returns recognized text in the output_text parameter. Optionally
provides also the Rects for individual text elements found (e.g. words), and the list of those
text elements with their confidence values.
Takes image on input and returns recognized text in the output_text
parameter. Optionally provides also the Rects for individual text elements
found (e.g. words), and the list of those text elements with their
confidence values.
@param image Input image CV_8UC1 or CV_8UC3
@param output_text Output text of the tesseract-ocr.
@param component_rects If provided the method will output a list of Rects for the individual
text elements found (e.g. words or text lines).
@param component_texts If provided the method will output a list of text strings for the
recognition of individual text elements found (e.g. words or text lines).
@param component_confidences If provided the method will output a list of confidence values
for the recognition of individual text elements found (e.g. words or text lines).
@param component_rects If provided the method will output a list of Rects
for the individual text elements found (e.g. words or text lines).
@param component_texts If provided the method will output a list of text
strings for the recognition of individual text elements found (e.g. words or
text lines).
@param component_confidences If provided the method will output a list of
confidence values for the recognition of individual text elements found
(e.g. words or text lines).
@param component_level OCR_LEVEL_WORD (by default), or OCR_LEVEL_TEXTLINE.
*/
virtual void run(Mat& image, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
virtual void run (Mat& image, std::string& output_text,
std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL,
std::vector<float>* component_confidences=NULL,
int component_level=0);
virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0);
virtual void run (Mat& image, Mat& mask, std::string& output_text,
std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL,
std::vector<float>* component_confidences=NULL,
int component_level=0);
// aliases for scripting
CV_WRAP String run(InputArray image, int min_confidence, int component_level=0);
CV_WRAP String run (InputArray image, int min_confidence,
int component_level=0);
CV_WRAP String run(InputArray image, InputArray mask, int min_confidence, int component_level=0);
CV_WRAP String run(InputArray image, InputArray mask,
int min_confidence, int component_level=0);
CV_WRAP virtual void setWhiteList(const String& char_whitelist) = 0;
/** @brief Creates an instance of the OCRTesseract class. Initializes Tesseract.
/** @brief Creates an instance of the OCRTesseract class. Initializes
* Tesseract.
* @param datapath the name of the parent directory of tessdata ended with
* "/", or NULL to use the system's default directory.
* @param language an ISO 639-3 code or NULL will default to "eng".
* @param char_whitelist specifies the list of characters used for
* recognition. NULL defaults to "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ".
@param datapath the name of the parent directory of tessdata ended with "/", or NULL to use the
system's default directory.
@param language an ISO 639-3 code or NULL will default to "eng".
@param char_whitelist specifies the list of characters used for recognition. NULL defaults to
"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ".
@param oem tesseract-ocr offers different OCR Engine Modes (OEM), by default
tesseract::OEM_DEFAULT is used. See the tesseract-ocr API documentation for other possible
values.
@param psmode tesseract-ocr offers different Page Segmentation Modes (PSM) tesseract::PSM_AUTO
(fully automatic layout analysis) is used. See the tesseract-ocr API documentation for other
possible values.
* @param oem tesseract-ocr offers different OCR Engine Modes (OEM), by
* default tesseract::OEM_DEFAULT is used. See the tesseract-ocr API
* documentation for other possible values.
* @param psmode tesseract-ocr offers different Page Segmentation Modes
* (PSM) tesseract::PSM_AUTO (fully automatic layout analysis) is used. See
* the tesseract-ocr API documentation for other possible values.
*/
CV_WRAP static Ptr<OCRTesseract> create(const char* datapath=NULL, const char* language=NULL,
const char* char_whitelist=NULL, int oem=3, int psmode=3);
CV_WRAP static Ptr<OCRTesseract> create (const char* datapath=NULL,
const char* language=NULL,
const char* char_whitelist=NULL,
int oem=3, int psmode=3);
};
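For reference, a minimal usage sketch of the class above; it assumes OpenCV was built with Tesseract support, and the image path is a placeholder:
```cpp
#include <opencv2/text.hpp>
#include <opencv2/imgcodecs.hpp>
#include <iostream>

int main()
{
    cv::Mat image = cv::imread("scenetext01.jpg"); // placeholder input image

    // Default parameters: system tessdata directory, "eng", no whitelist.
    cv::Ptr<cv::text::OCRTesseract> ocr = cv::text::OCRTesseract::create();

    std::string text;
    std::vector<cv::Rect> rects;
    std::vector<std::string> words;
    std::vector<float> confidences;
    ocr->run(image, text, &rects, &words, &confidences, cv::text::OCR_LEVEL_WORD);
    std::cout << text << std::endl;
    return 0;
}
```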
@@ -147,134 +195,156 @@ enum decoder_mode
OCR_DECODER_VITERBI = 0 // Other algorithms may be added
};
/** @brief OCRHMMDecoder class provides an interface for OCR using Hidden Markov Models.
/** @brief OCRHMMDecoder class provides an interface for OCR using Hidden Markov
* Models.
@note
- (C++) An example on using OCRHMMDecoder recognition combined with scene text detection can
be found at the webcam_demo sample:
<https://github.com/opencv/opencv_contrib/blob/master/modules/text/samples/webcam_demo.cpp>
* @note
* - (C++) An example on using OCRHMMDecoder recognition combined with scene
* text detection can be found at the webcam_demo sample:
* <https://github.com/Itseez/opencv_contrib/blob/master/modules/text/samples/webcam_demo.cpp>
*/
class CV_EXPORTS_W OCRHMMDecoder : public BaseOCR
{
public:
class CV_EXPORTS_W OCRHMMDecoder : public BaseOCR {
public:
/** @brief Callback with the character classifier is made a class.
This way it hides the feature extractor and the classifier itself, so developers can write
their own OCR code.
* This way it hides the feature extractor and the classifier itself, so
* developers can write their own OCR code.
The default character classifier and feature extractor can be loaded using the utility function
loadOCRHMMClassifierNM and KNN model provided in
<https://github.com/opencv/opencv_contrib/blob/master/modules/text/samples/OCRHMM_knn_model_data.xml.gz>.
*/
class CV_EXPORTS_W ClassifierCallback
{
public:
* The default character classifier and feature extractor can be loaded using
* the utility function loadOCRHMMClassifierNM and KNN model provided in
* <https://github.com/Itseez/opencv_contrib/blob/master/modules/text/samples/OCRHMM_knn_model_data.xml.gz>.
*/
class CV_EXPORTS_W ClassifierCallback{
public:
virtual ~ClassifierCallback() { }
/** @brief The character classifier must return a (ranked list of) class(es) id('s)
/** @brief The character classifier must return a (ranked list of)
* class(es) id('s)
@param image Input image CV_8UC1 or CV_8UC3 with a single letter.
@param out_class The classifier returns the character class categorical label, or list of
class labels, to which the input image corresponds.
@param out_confidence The classifier returns the probability of the input image
corresponding to each classes in out_class.
* @param image Input image CV_8UC1 or CV_8UC3 with a single letter.
* @param out_class The classifier returns the character class
* categorical label, or list of class labels, to which the input image
* corresponds.
* @param out_confidence The classifier returns the probability of the
* input image corresponding to each classes in out_class.
*/
virtual void eval( InputArray image, std::vector<int>& out_class, std::vector<double>& out_confidence);
virtual void eval (InputArray image, std::vector<int>& out_class,
std::vector<double>& out_confidence);
};
public:
/** @brief Recognize text using HMM.
Takes binary image on input and returns recognized text in the output_text parameter. Optionally
provides also the Rects for individual text elements found (e.g. words), and the list of those
text elements with their confidence values.
* Takes binary image on input and returns recognized text in the output_text
* parameter. Optionally provides also the Rects for individual text elements
* found (e.g. words), and the list of those text elements with their
* confidence values.
@param image Input binary image CV_8UC1 with a single text line (or word).
* @param image Input binary image CV_8UC1 with a single text line (or word).
@param output_text Output text. Most likely character sequence found by the HMM decoder.
* @param output_text Output text. Most likely character sequence found by
* the HMM decoder.
@param component_rects If provided the method will output a list of Rects for the individual
text elements found (e.g. words).
* @param component_rects If provided the method will output a list of Rects
* for the individual text elements found (e.g. words).
@param component_texts If provided the method will output a list of text strings for the
recognition of individual text elements found (e.g. words).
* @param component_texts If provided the method will output a list of text
* strings for the recognition of individual text elements found (e.g. words).
@param component_confidences If provided the method will output a list of confidence values
for the recognition of individual text elements found (e.g. words).
* @param component_confidences If provided the method will output a list of
* confidence values for the recognition of individual text elements found
* (e.g. words).
@param component_level Only OCR_LEVEL_WORD is supported.
*/
virtual void run(Mat& image, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0);
* @param component_level Only OCR_LEVEL_WORD is supported.
*/
virtual void run (Mat& image, std::string& output_text,
std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL,
std::vector<float>* component_confidences=NULL,
int component_level=0);
/** @brief Recognize text using HMM.
Takes an image and a mask (where each connected component corresponds to a segmented character)
on input and returns recognized text in the output_text parameter. Optionally
provides also the Rects for individual text elements found (e.g. words), and the list of those
text elements with their confidence values.
* Takes an image and a mask (where each connected component corresponds to a
* segmented character) on input and returns recognized text in the
* output_text parameter. Optionally provides also the Rects for individual
* text elements found (e.g. words), and the list of those text elements with
* their confidence values.
@param image Input image CV_8UC1 or CV_8UC3 with a single text line (or word).
@param mask Input binary image CV_8UC1 same size as input image. Each connected component in mask corresponds to a segmented character in the input image.
* @param image Input image CV_8UC1 or CV_8UC3 with a single text line
* (or word).
@param output_text Output text. Most likely character sequence found by the HMM decoder.
* @param mask Input binary image CV_8UC1 same size as input image. Each
* connected component in mask corresponds to a segmented character in the
* input image.
@param component_rects If provided the method will output a list of Rects for the individual
text elements found (e.g. words).
* @param output_text Output text. Most likely character sequence found by
* the HMM decoder.
@param component_texts If provided the method will output a list of text strings for the
recognition of individual text elements found (e.g. words).
* @param component_rects If provided the method will output a list of Rects
* for the individual text elements found (e.g. words).
@param component_confidences If provided the method will output a list of confidence values
for the recognition of individual text elements found (e.g. words).
* @param component_texts If provided the method will output a list of text
* strings for the recognition of individual text elements found (e.g. words).
@param component_level Only OCR_LEVEL_WORD is supported.
*/
virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
* @param component_confidences If provided the method will output a list of
* confidence values for the recognition of individual text elements found
* (e.g. words).
* @param component_level Only OCR_LEVEL_WORD is supported.
*/
virtual void run(Mat& image, Mat& mask, std::string& output_text,
std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL,
std::vector<float>* component_confidences=NULL,
int component_level=0);
// aliases for scripting
CV_WRAP String run(InputArray image, int min_confidence, int component_level=0);
CV_WRAP String run(InputArray image,
int min_confidence,
int component_level=0);
CV_WRAP String run(InputArray image, InputArray mask, int min_confidence, int component_level=0);
CV_WRAP String run(InputArray image,
InputArray mask,
int min_confidence,
int component_level=0);
/** @brief Creates an instance of the OCRHMMDecoder class. Initializes HMMDecoder.
/** @brief Creates an instance of the OCRHMMDecoder class. Initializes
* HMMDecoder.
@param classifier The character classifier with built in feature extractor.
* @param classifier The character classifier with built in feature
* extractor.
@param vocabulary The language vocabulary (chars when ascii english text). vocabulary.size()
must be equal to the number of classes of the classifier.
* @param vocabulary The language vocabulary (chars when ascii english text).
* vocabulary.size() must be equal to the number of classes of the
* classifier.
@param transition_probabilities_table Table with transition probabilities between character
pairs. cols == rows == vocabulary.size().
* @param transition_probabilities_table Table with transition probabilities
* between character pairs. cols == rows == vocabulary.size().
@param emission_probabilities_table Table with observation emission probabilities. cols ==
rows == vocabulary.size().
* @param emission_probabilities_table Table with observation emission
* probabilities. cols == rows == vocabulary.size().
@param mode HMM Decoding algorithm. Only OCR_DECODER_VITERBI is available for the moment
(<http://en.wikipedia.org/wiki/Viterbi_algorithm>).
* @param mode HMM Decoding algorithm. Only OCR_DECODER_VITERBI is available
* for the moment (<http://en.wikipedia.org/wiki/Viterbi_algorithm>).
*/
static Ptr<OCRHMMDecoder> create(const Ptr<OCRHMMDecoder::ClassifierCallback> classifier,// The character classifier with built in feature extractor
const std::string& vocabulary, // The language vocabulary (chars when ascii english text)
// size() must be equal to the number of classes
InputArray transition_probabilities_table, // Table with transition probabilities between character pairs
// cols == rows == vocabulari.size()
InputArray emission_probabilities_table, // Table with observation emission probabilities
// cols == rows == vocabulari.size()
decoder_mode mode = OCR_DECODER_VITERBI); // HMM Decoding algorithm (only Viterbi for the moment)
CV_WRAP static Ptr<OCRHMMDecoder> create(const Ptr<OCRHMMDecoder::ClassifierCallback> classifier,// The character classifier with built in feature extractor
const String& vocabulary, // The language vocabulary (chars when ascii english text)
// size() must be equal to the number of classes
InputArray transition_probabilities_table, // Table with transition probabilities between character pairs
// cols == rows == vocabulari.size()
InputArray emission_probabilities_table, // Table with observation emission probabilities
// cols == rows == vocabulari.size()
int mode = OCR_DECODER_VITERBI); // HMM Decoding algorithm (only Viterbi for the moment)
protected:
static Ptr<OCRHMMDecoder> create(
const Ptr<OCRHMMDecoder::ClassifierCallback> classifier, // The character classifier with built in feature extractor
const std::string& vocabulary, // The language vocabulary (chars when ascii english text) size() must be equal to the number of classes
InputArray transition_probabilities_table, // Table with transition probabilities between character pairs cols == rows == vocabulari.size()
InputArray emission_probabilities_table, // Table with observation emission probabilities cols == rows == vocabulari.size()
decoder_mode mode = OCR_DECODER_VITERBI); // HMM Decoding algorithm (only Viterbi for the moment)
CV_WRAP static Ptr<OCRHMMDecoder> create(
const Ptr<OCRHMMDecoder::ClassifierCallback> classifier, // The character classifier with built in feature extractor
const String& vocabulary, // The language vocabulary (chars when ascii english text) size() must be equal to the number of classes
InputArray transition_probabilities_table, // Table with transition probabilities between character pairs cols == rows == vocabulari.size()
InputArray emission_probabilities_table, // Table with observation emission probabilities cols == rows == vocabulari.size()
int mode = OCR_DECODER_VITERBI); // HMM Decoding algorithm (only Viterbi for the moment)
protected:
Ptr<OCRHMMDecoder::ClassifierCallback> classifier;
std::string vocabulary;
@@ -283,76 +353,98 @@ protected:
decoder_mode mode;
};
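A minimal usage sketch of the decoder, assuming the sample classifier and transition table shipped with the module; the FileStorage key name and the 62-character vocabulary follow the module samples and are assumptions here:
```cpp
#include <opencv2/text.hpp>
#include <opencv2/imgcodecs.hpp>
#include <iostream>

int main()
{
    cv::Ptr<cv::text::OCRHMMDecoder::ClassifierCallback> classifier =
        cv::text::loadOCRHMMClassifierNM("OCRHMM_knn_model_data.xml.gz");

    // Load the generic transition table provided with the module samples.
    cv::Mat transitions;
    cv::FileStorage fs("OCRHMM_transitions_table.xml", cv::FileStorage::READ);
    fs["transition_probabilities"] >> transitions; // key name as in the samples
    fs.release();

    const std::string vocabulary =
        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
    cv::Ptr<cv::text::OCRHMMDecoder> decoder = cv::text::OCRHMMDecoder::create(
        classifier, vocabulary, transitions,
        cv::Mat::eye(62, 62, CV_64FC1)); // uniform emission probabilities

    // The decoder expects a binary single-word image; this path is a placeholder.
    cv::Mat word = cv::imread("scenetext_word.jpg", cv::IMREAD_GRAYSCALE);
    std::string output;
    decoder->run(word, output);
    std::cout << output << std::endl;
    return 0;
}
```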
/** @brief Allow to implicitly load the default character classifier when creating an OCRHMMDecoder object.
/** @brief Allow to implicitly load the default character classifier when
* creating an OCRHMMDecoder object.
@param filename The XML or YAML file with the classifier model (e.g. OCRHMM_knn_model_data.xml)
* @param filename The XML or YAML file with the classifier model (e.g.
* OCRHMM_knn_model_data.xml)
The KNN default classifier is based in the scene text recognition method proposed by Lukás Neumann &
Jiri Matas in [Neumann11b]. Basically, the region (contour) in the input image is normalized to a
fixed size, while retaining the centroid and aspect ratio, in order to extract a feature vector
based on gradient orientations along the chain-code of its perimeter. Then, the region is classified
using a KNN model trained with synthetic data of rendered characters with different standard font
types.
* The KNN default classifier is based in the scene text recognition method
* proposed by Lukás Neumann & Jiri Matas in [Neumann11b]. Basically, the region
* (contour) in the input image is normalized to a fixed size, while retaining
* the centroid and aspect ratio, in order to extract a feature vector based on
* gradient orientations along the chain-code of its perimeter. Then, the region
* is classified using a KNN model trained with synthetic data of rendered
* characters with different standard font types.
*/
CV_EXPORTS_W Ptr<OCRHMMDecoder::ClassifierCallback> loadOCRHMMClassifierNM (
const String& filename);
CV_EXPORTS_W Ptr<OCRHMMDecoder::ClassifierCallback> loadOCRHMMClassifierNM(const String& filename);
/** @brief Allow to implicitly load the default character classifier when
* creating an OCRHMMDecoder object.
/** @brief Allow to implicitly load the default character classifier when creating an OCRHMMDecoder object.
* @param filename The XML or YAML file with the classifier model (e.g.
* OCRBeamSearch_CNN_model_data.xml.gz)
@param filename The XML or YAML file with the classifier model (e.g. OCRBeamSearch_CNN_model_data.xml.gz)
The CNN default classifier is based in the scene text recognition method proposed by Adam Coates &
Andrew NG in [Coates11a]. The character classifier consists in a Single Layer Convolutional Neural Network and
a linear classifier. It is applied to the input image in a sliding window fashion, providing a set of recognitions
at each window location.
* The CNN default classifier is based in the scene text recognition method
* proposed by Adam Coates & Andrew NG in [Coates11a]. The character classifier
* consists in a Single Layer Convolutional Neural Network and a linear
* classifier. It is applied to the input image in a sliding window fashion,
* providing a set of recognitions at each window location.
*/
CV_EXPORTS_W Ptr<OCRHMMDecoder::ClassifierCallback> loadOCRHMMClassifierCNN(const String& filename);
CV_EXPORTS_W Ptr<OCRHMMDecoder::ClassifierCallback> loadOCRHMMClassifierCNN (
const String& filename);
//! @}
/** @brief Utility function to create a tailored language model transitions table from a given list of words (lexicon).
*
/** @brief Utility function to create a tailored language model transitions
* table from a given list of words (lexicon).
* @param vocabulary The language vocabulary (chars when ascii english text).
*
* @param lexicon The list of words that are expected to be found in a particular image.
*
* @param transition_probabilities_table Output table with transition probabilities between character pairs. cols == rows == vocabulary.size().
*
* The function calculates frequency statistics of character pairs from the given lexicon and fills the output transition_probabilities_table with them. The transition_probabilities_table can be used as input in the OCRHMMDecoder::create() and OCRBeamSearchDecoder::create() methods.
* @param transition_probabilities_table Output table with transition
* probabilities between character pairs. cols == rows == vocabulary.size().
* The function calculates frequency statistics of character pairs from the given
* lexicon and fills the output transition_probabilities_table with them. The
* transition_probabilities_table can be used as input in the
* OCRHMMDecoder::create() and OCRBeamSearchDecoder::create() methods.
* @note
* - (C++) An alternative would be to load the default generic language transition table provided in the text module samples folder (created from ispell 42869 english words list) :
* <https://github.com/opencv/opencv_contrib/blob/master/modules/text/samples/OCRHMM_transitions_table.xml>
* - (C++) An alternative would be to load the default generic language
* transition table provided in the text module samples folder (created
* from ispell 42869 english words list) :
* <https://github.com/Itseez/opencv_contrib/blob/master/modules/text/samples/OCRHMM_transitions_table.xml>
**/
CV_EXPORTS void createOCRHMMTransitionsTable(std::string& vocabulary, std::vector<std::string>& lexicon, OutputArray transition_probabilities_table);
CV_EXPORTS_W Mat createOCRHMMTransitionsTable(const String& vocabulary, std::vector<cv::String>& lexicon);
CV_EXPORTS void createOCRHMMTransitionsTable (
std::string& vocabulary, std::vector<std::string>& lexicon,
OutputArray transition_probabilities_table);
CV_EXPORTS_W Mat createOCRHMMTransitionsTable (
const String& vocabulary, std::vector<cv::String>& lexicon);
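As an illustration, the wrapped overload can build a tailored table from a small in-memory lexicon; the vocabulary and words below are arbitrary:
```cpp
#include <opencv2/text.hpp>
#include <iostream>

int main()
{
    cv::String vocabulary = "abcdefghijklmnopqrstuvwxyz";
    std::vector<cv::String> lexicon;
    lexicon.push_back("open");
    lexicon.push_back("text");
    lexicon.push_back("vision");

    // The returned table satisfies cols == rows == vocabulary.size().
    cv::Mat transitions =
        cv::text::createOCRHMMTransitionsTable(vocabulary, lexicon);
    std::cout << transitions.rows << "x" << transitions.cols << std::endl;
    return 0;
}
```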
/* OCR BeamSearch Decoder */
/** @brief OCRBeamSearchDecoder class provides an interface for OCR using Beam Search algorithm.
/** @brief OCRBeamSearchDecoder class provides an interface for OCR using Beam
* Search algorithm.
@note
- (C++) An example on using OCRBeamSearchDecoder recognition combined with scene text detection can
be found at the demo sample:
<https://github.com/opencv/opencv_contrib/blob/master/modules/text/samples/word_recognition.cpp>
- (C++) An example on using OCRBeamSearchDecoder recognition combined with
scene text detection can be found at the demo sample:
<https://github.com/Itseez/opencv_contrib/blob/master/modules/text/samples/word_recognition.cpp>
*/
class CV_EXPORTS_W OCRBeamSearchDecoder : public BaseOCR
{
public:
/* Forward declaration of class that can be used to generate an OCRBeamSearchDecoder::ClassifierCallback */
class TextImageClassifier;
class CV_EXPORTS_W OCRBeamSearchDecoder : public BaseOCR{
public:
/** @brief Callback with the character classifier is made a class.
This way it hides the feature extractor and the classifier itself, so developers can write
their own OCR code.
* This way it hides the feature extractor and the classifier itself, so
* developers can write their own OCR code.
The default character classifier and feature extractor can be loaded using the utility function
loadOCRBeamSearchClassifierCNN with all its parameters provided in
<https://github.com/opencv/opencv_contrib/blob/master/modules/text/samples/OCRBeamSearch_CNN_model_data.xml.gz>.
* The default character classifier and feature extractor can be loaded
* using the utility function loadOCRBeamSearchClassifierCNN with all its
* parameters provided in
* <https://github.com/Itseez/opencv_contrib/blob/master/modules/text/samples/OCRBeamSearch_CNN_model_data.xml.gz>.
*/
class CV_EXPORTS_W ClassifierCallback
{
public:
class CV_EXPORTS_W ClassifierCallback{
public:
virtual ~ClassifierCallback() { }
/** @brief The character classifier must return a (ranked list of) class(es) id('s)
@@ -364,8 +456,8 @@ public:
*/
virtual void eval( InputArray image, std::vector< std::vector<double> >& recognition_probabilities, std::vector<int>& oversegmentation );
int getWindowSize() {return 0;}
int getStepSize() {return 0;}
virtual int getWindowSize() {return 0;}
virtual int getStepSize() {return 0;}
};
public:
@@ -421,6 +513,7 @@ public:
@param beam_size Size of the beam in Beam Search algorithm.
*/
static Ptr<OCRBeamSearchDecoder> create(const Ptr<OCRBeamSearchDecoder::ClassifierCallback> classifier,// The character classifier with built in feature extractor
const std::string& vocabulary, // The language vocabulary (chars when ascii english text)
// size() must be equal to the number of classes
@@ -441,6 +534,44 @@ public:
int mode = OCR_DECODER_VITERBI, // HMM Decoding algorithm (only Viterbi for the moment)
int beam_size = 500); // Size of the beam in Beam Search algorithm
/** @brief This method allows to plug a classifier that is derived from TextImageClassifier into
* OCRBeamSearchDecoder as a ClassifierCallback.
@param classifier A pointer to a TextImageClassifier descendant
@param alphabet The language alphabet, one char per symbol. alphabet.size() must be equal to the number of classes
of the classifier. In future editions it should be replaced with a vector of strings.
@param transition_probabilities_table Table with transition probabilities between character
pairs. cols == rows == alphabet.size().
@param emission_probabilities_table Table with observation emission probabilities. cols ==
rows == alphabet.size().
@param windowWidth The width of the windows to which the sliding window will be iterated. The height will
be the height of the image. The windows might be resized to fit the classifier's input by the classifier's
preprocessor.
@param windowStep The step for the sliding window
@param mode HMM Decoding algorithm (only Viterbi for the moment)
@param beam_size Size of the beam in Beam Search algorithm
*/
// CV_WRAP static Ptr<OCRBeamSearchDecoder> create(const Ptr<TextImageClassifier> classifier, // The character classifier with built in feature extractor
// String alphabet, // The language alphabet one char per symbol
// // size() must be equal to the number of classes
// InputArray transition_probabilities_table, // Table with transition probabilities between character pairs
// // cols == rows == alphabet.size()
// InputArray emission_probabilities_table, // Table with observation emission probabilities
// // cols == rows == alphabet.size()
// int windowWidth, // The width of the windows to which the sliding window will be iterated.
// // The height will be the height of the image. The windows might be resized to
// // fit the classifiers input by the classifiers preprocessor
// int windowStep = 1 , // The step for the sliding window
// int mode = OCR_DECODER_VITERBI, // HMM Decoding algorithm (only Viterbi for the moment)
// int beam_size = 500); // Size of the beam in Beam Search algorithm
protected:
Ptr<OCRBeamSearchDecoder::ClassifierCallback> classifier;
@@ -465,6 +596,364 @@ CV_EXPORTS_W Ptr<OCRBeamSearchDecoder::ClassifierCallback> loadOCRBeamSearchClas
//! @}
}
}
//Classifiers should provide different backends
//For the moment only Caffe is implemented
enum{
OCR_HOLISTIC_BACKEND_NONE,
OCR_HOLISTIC_BACKEND_CAFFE
};
class TextImageClassifier;
/**
* @brief The ImagePreprocessor class
*/
class CV_EXPORTS_W ImagePreprocessor{
protected:
virtual void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels)=0;
virtual void set_mean_(Mat){}
public:
virtual ~ImagePreprocessor(){}
/** @brief this method provides public access to the preprocessing with respect to a specific
* classifier
*
* This method's main use would be to use the preprocessor without feeding it to a classifier.
* Determining the exact behavior of a preprocessor is the main motivation for this.
*
* @param input an image without any constraints
*
* @param output in most cases an image of fixed depth and size, whitened
*
* @param sz the size to which the image will be resized if the preprocessor resizes inputs
*
* @param outputChannels the number of channels for the output image
*/
CV_WRAP void preprocess(InputArray input,OutputArray output,Size sz,int outputChannels);
CV_WRAP void set_mean(Mat mean);
/** @brief Creates a functor that only resizes and changes the channels of the input
* without further processing.
*
* @return shared pointer to the generated preprocessor
*/
CV_WRAP static Ptr<ImagePreprocessor> createResizer();
/** @brief Creates a functor that standardizes the input image by scaling its pixel values, e.g. to a fixed standard deviation.
*
* @param sigma the deviation to which pixel values are standardized
*
* @return shared pointer to generated preprocessor
*/
CV_WRAP static Ptr<ImagePreprocessor> createImageStandarizer(double sigma);
/** @brief Creates a functor that subtracts a given mean image from the input.
*
* @param meanImg the mean image to be subtracted from inputs
* @return shared pointer to generated preprocessor
*/
CV_WRAP static Ptr<ImagePreprocessor> createImageMeanSubtractor(InputArray meanImg);
CV_WRAP static Ptr<ImagePreprocessor> createImageCustomPreprocessor(double rawval=1.0, String channel_order="BGR");
friend class TextImageClassifier;
};
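A short sketch of using a preprocessor standalone, e.g. to inspect exactly what a classifier would receive; the sigma value and the 100x32 geometry mirror the DictNet defaults mentioned below and are assumptions here:
```cpp
#include <opencv2/text.hpp>
#include <opencv2/imgcodecs.hpp>

int main()
{
    // Standardizer scaling pixels to a deviation of 113, as used for DictNet.
    cv::Ptr<cv::text::ImagePreprocessor> prep =
        cv::text::ImagePreprocessor::createImageStandarizer(113);

    cv::Mat input = cv::imread("scenetext_word.jpg"); // placeholder input
    cv::Mat output;
    prep->preprocess(input, output, cv::Size(100, 32), 1); // single channel, DictNet-like size
    return 0;
}
```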
/** @brief Abstract class that implements the classification of text images.
*
* The interface is generic enough to describe any image classifier, and allows
* taking advantage of computing in batches. While word classifiers are the default
* networks, any image classifier should work.
*
*/
class CV_EXPORTS_W TextImageClassifier
{
protected:
Size inputGeometry_;
Size outputGeometry_;
int channelCount_;
Ptr<ImagePreprocessor> preprocessor_;
/** @brief all image preprocessing is handled here including whitening etc.
*
* @param input the image to be preprocessed for the classifier. If the depth
* is CV_8U, values should be in [0,255]; otherwise values are assumed to be in [0,1]
*
* @param output reference to the image to be fed to the classifier; the preprocessor will
* resize the image to the appropriate size and convert it to the appropriate depth
*
* The method preprocess should never be used externally; it is up to the classify and
* classifyBatch methods to employ it.
*/
virtual void preprocess(const Mat& input,Mat& output);
public:
virtual ~TextImageClassifier() {}
/** @brief sets the preprocessor employed by the classifier
*/
CV_WRAP virtual void setPreprocessor(Ptr<ImagePreprocessor> ptr);
/** @brief simple getter method returning the preprocessor employed by the classifier
*/
CV_WRAP Ptr<ImagePreprocessor> getPreprocessor();
/** @brief produces a class confidence row-vector given an image
*/
CV_WRAP virtual void classify(InputArray image, OutputArray classProbabilities) = 0;
/** @brief produces a list of bounding boxes given an image
*/
CV_WRAP virtual void detect(InputArray image, OutputArray classProbabilities) = 0;
/** @brief produces a matrix containing class confidence row-vectors given a collection of images
*/
CV_WRAP virtual void classifyBatch(InputArrayOfArrays image, OutputArray classProbabilities) = 0;
/** @brief simple getter method returning the number of channels each input sample has
*/
CV_WRAP virtual int getInputChannelCount(){return this->channelCount_;}
/** @brief simple getter method returning the size of the input sample
*/
CV_WRAP virtual Size getInputSize(){return this->inputGeometry_;}
/** @brief simple getter method returning the size of the output row-vector
*/
CV_WRAP virtual int getOutputSize()=0;
CV_WRAP virtual Size getOutputGeometry()=0;
/** @brief simple getter method returning the size of the minibatches for this classifier.
* If not applicable this method should return 1
*/
CV_WRAP virtual int getMinibatchSize()=0;
friend class ImagePreprocessor;
};
class CV_EXPORTS_W DeepCNN:public TextImageClassifier
{
/** @brief Class that uses a pretrained caffe model for word classification.
*
* This network is described in detail in:
* Max Jaderberg et al.: Reading Text in the Wild with Convolutional Neural Networks, IJCV 2015
* http://arxiv.org/abs/1412.1842
*/
public:
virtual ~DeepCNN() {};
/** @brief Constructs a DeepCNN object from a caffe pretrained model
*
* @param archFilename is the path to the prototxt file containing the deployment model architecture description.
*
* @param weightsFilename is the path to the pretrained weights of the model in binary form. This file can be
* very large, up to 2GB.
*
* @param preprocessor is a pointer to an instance of an ImagePreprocessor implementing the protected preprocess_ method
*
* @param minibatchSz the maximum number of samples that can be processed in parallel. In practice this parameter
* has an effect only when computing on the GPU and should be set with respect to the memory available on the GPU.
*
* @param backEnd integer parameter selecting the computation framework. For now OCR_HOLISTIC_BACKEND_CAFFE is
* the only option
*/
CV_WRAP static Ptr<DeepCNN> create(String archFilename,String weightsFilename,Ptr<ImagePreprocessor> preprocessor,int minibatchSz=100,int backEnd=OCR_HOLISTIC_BACKEND_CAFFE);
/** @brief Constructs a DeepCNN intended to be used for word spotting.
*
* This method loads a pretrained classifier and couples it with a preprocessor that standardizes pixels with a
* deviation of 113. The architecture file can be downloaded from:
* <http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_deploy.prototxt>
* While the weights can be downloaded from:
* <http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg.caffemodel>
* The words assigned to the network outputs are available at:
* <http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_labels.txt>
*
* @param archFilename is the path to the prototxt file containing the deployment model architecture description.
* When employing OCR_HOLISTIC_BACKEND_CAFFE this is the path to the deploy ".prototxt".
*
* @param weightsFilename is the path to the pretrained weights of the model. When employing
* OCR_HOLISTIC_BACKEND_CAFFE this is the path to the ".caffemodel" file. This file can be very large, the
* pretrained DictNet uses 2GB.
*
* @param backEnd integer parameter selecting the computation framework. For now OCR_HOLISTIC_BACKEND_CAFFE is
* the only option
*/
CV_WRAP static Ptr<DeepCNN> createDictNet(String archFilename,String weightsFilename,int backEnd=OCR_HOLISTIC_BACKEND_CAFFE);
};
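A minimal sketch of constructing and querying the classifier; it requires a build with the Caffe backend, and the file names are the DictNet downloads listed above:
```cpp
#include <opencv2/text.hpp>
#include <opencv2/imgcodecs.hpp>

int main()
{
    cv::Ptr<cv::text::DeepCNN> cnn = cv::text::DeepCNN::createDictNet(
        "dictnet_vgg_deploy.prototxt", "dictnet_vgg.caffemodel");

    cv::Mat word = cv::imread("scenetext_word.jpg"); // placeholder word crop
    cv::Mat probabilities; // one row vector, one column per dictionary word
    cnn->classify(word, probabilities);
    return 0;
}
```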
namespace cnn_config{
namespace caffe_backend{
/** @brief Queries the computation device being used by Caffe
*
* Whether Caffe uses the GPU or the CPU can only be controlled globally;
* this function queries the current state of Caffe.
* If the module is built without caffe, this method throws an exception.
*
* @return true if caffe is computing on the GPU, false if caffe is computing on the CPU
*/
CV_EXPORTS_W bool getCaffeGpuMode();
/** @brief Sets the computation device being used by Caffe
*
* Whether Caffe uses the GPU or the CPU can only be controlled globally;
* this function sets that global state.
* If the module is built without caffe, this method throws an exception.
*
* @param useGpu set to true for caffe to be computing on the GPU, false if caffe is
* computing on the CPU
*/
CV_EXPORTS_W void setCaffeGpuMode(bool useGpu);
/** @brief Provides runtime information on whether Caffe support was compiled in.
*
* The text module API is the same regardless of whether Caffe was available
* during compilation. When methods that require Caffe are invoked while Caffe support
* is not compiled in, exceptions are thrown. This method allows testing at runtime
* whether the text module was built with Caffe.
*
* @return true if Caffe support for the text module was provided during compilation,
* false if Caffe was unavailable.
*/
CV_EXPORTS_W bool getCaffeAvailable();
}//caffe
}//cnn_config
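The intended runtime-guard pattern, sketched:
```cpp
#include <opencv2/text.hpp>
#include <iostream>

int main()
{
    using namespace cv::text::cnn_config::caffe_backend;

    if (getCaffeAvailable())
    {
        setCaffeGpuMode(true); // request GPU computation globally
        std::cout << "Caffe GPU mode: " << getCaffeGpuMode() << std::endl;
    }
    else
    {
        std::cout << "text module was built without Caffe" << std::endl;
    }
    return 0;
}
```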
/** @brief OCRHolisticWordRecognizer class provides the functionality of segmented word spotting.
* Given a predefined vocabulary, a TextImageClassifier is employed to select the most probable
* word given an input image.
*
* This class implements the logic of providing transcriptions given a vocabulary and an image
* classifier. The classifier can be any TextImageClassifier, but the classifier for which this
* class was built is the DictNet. In order to load it the following files should be downloaded:
* <http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_deploy.prototxt>
* <http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg.caffemodel>
* <http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_labels.txt>
*/
class CV_EXPORTS_W OCRHolisticWordRecognizer : public BaseOCR
{
public:
virtual void run(Mat& image, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=OCR_LEVEL_WORD)=0;
/** @brief Recognize text using a segmentation based word-spotting/classifier CNN.
Takes image on input and returns recognized text in the output_text parameter. Optionally
provides also the Rects for individual text elements found (e.g. words), and the list of those
text elements with their confidence values.
@param image Input image CV_8UC1 or CV_8UC3
@param mask is totally ignored and is only available for compatibility reasons
@param output_text Output text of the word spotting, always one that exists in the dictionary.
@param component_rects Not applicable for word spotting; can be NULL. If not, a single element will
be put in the vector.
@param component_texts Not applicable for word spotting; can be NULL. If not, a single element will
be put in the vector.
@param component_confidences Not applicable for word spotting; can be NULL. If not, a single element will
be put in the vector.
@param component_level must be OCR_LEVEL_WORD.
*/
virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=OCR_LEVEL_WORD)=0;
/**
@brief Method that provides a quick and simple interface to a single word image classification
@param inputImage an image expected to be CV_8UC1 or CV_8UC3 of any size, assumed to contain a single word
@param transcription an opencv string that will store the detected word transcription
@param confidence a double that will be updated with the confidence the classifier has for the selected word
*/
CV_WRAP virtual void recogniseImage(InputArray inputImage,CV_OUT String& transcription,CV_OUT double& confidence)=0;
/**
@brief Method that provides a quick and simple interface to multiple word image classification, taking advantage of
the classifier's parallel capabilities.
@param inputImageList a list of images expected to be CV_8UC1 or CV_8UC3; each image can be of any size and is assumed
to contain a single word.
@param transcriptions a vector of opencv strings that will store the detected word transcriptions, one for each
input image
@param confidences a vector of doubles that will be updated with the confidence the classifier has for each of the
selected words.
*/
CV_WRAP virtual void recogniseImageBatch(InputArrayOfArrays inputImageList,CV_OUT std::vector<String>& transcriptions,CV_OUT std::vector<double>& confidences)=0;
/**
@brief simple getter for the vocabulary employed
*/
CV_WRAP virtual const std::vector<String>& getVocabulary()=0;
/** @brief simple getter for the classifier employed
*/
CV_WRAP virtual Ptr<TextImageClassifier> getClassifier()=0;
/** @brief Creates an instance of the OCRHolisticWordRecognizer class.
@param classifierPtr an instance of TextImageClassifier, normally a DeepCNN instance
@param vocabularyFilename the relative or absolute path to the file containing all words in the vocabulary. Each text line
in the file is assumed to be a single word. The number of words in the vocabulary must be exactly the same as the outputSize
of the classifier.
*/
CV_WRAP static Ptr<OCRHolisticWordRecognizer> create(Ptr<TextImageClassifier> classifierPtr,String vocabularyFilename);
/** @brief Creates an instance of the OCRHolisticWordRecognizer class and implicitly also a DeepCNN classifier.
@param modelArchFilename the relative or absolute path to the prototxt file describing the classifiers architecture.
@param modelWeightsFilename the relative or absolute path to the file containing the pretrained weights of the model in caffe-binary form.
@param vocabularyFilename the relative or absolute path to the file containing all words in the vocabulary. Each text line
in the file is assumed to be a single word. The number of words in the vocabulary must be exactly the same as the outputSize
of the classifier.
*/
CV_WRAP static Ptr<OCRHolisticWordRecognizer> create(String modelArchFilename, String modelWeightsFilename, String vocabularyFilename);
/** @brief Creates an instance of the OCRHolisticWordRecognizer class from an in-memory vocabulary.
*
* @param classifierPtr an instance of TextImageClassifier, normally a DeepCNN instance
*
* @param vocabulary a vector of words; its size must equal the output size of the classifier
*/
CV_WRAP static Ptr<OCRHolisticWordRecognizer> create(Ptr<TextImageClassifier> classifierPtr,const std::vector<String>& vocabulary);
/** @brief Creates an instance of the OCRHolisticWordRecognizer class and implicitly also a DeepCNN classifier.
*
* @param modelArchFilename the relative or absolute path to the prototxt file describing the classifier's architecture
*
* @param modelWeightsFilename the relative or absolute path to the file containing the pretrained weights of the model in caffe-binary form
*
* @param vocabulary a vector of words; its size must equal the output size of the classifier
*/
CV_WRAP static Ptr<OCRHolisticWordRecognizer> create (String modelArchFilename, String modelWeightsFilename, const std::vector<String>& vocabulary);
};
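A batch recognition sketch using the class above; the Caffe backend and DictNet files are assumed, and image names are placeholders:
```cpp
#include <opencv2/text.hpp>
#include <opencv2/imgcodecs.hpp>
#include <iostream>

int main()
{
    cv::Ptr<cv::text::OCRHolisticWordRecognizer> spotter =
        cv::text::OCRHolisticWordRecognizer::create(
            "dictnet_vgg_deploy.prototxt", "dictnet_vgg.caffemodel",
            "dictnet_vgg_labels.txt");

    std::vector<cv::Mat> words;
    words.push_back(cv::imread("word1.jpg")); // placeholder word crops
    words.push_back(cv::imread("word2.jpg"));

    std::vector<cv::String> transcriptions;
    std::vector<double> confidences;
    spotter->recogniseImageBatch(words, transcriptions, confidences);
    for (size_t i = 0; i < transcriptions.size(); ++i)
        std::cout << transcriptions[i] << " " << confidences[i] << std::endl;
    return 0;
}
```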
}//namespace text
}//namespace cv
#endif // _OPENCV_TEXT_OCR_HPP_

@@ -0,0 +1,235 @@
/*M//////////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_TEXT_TEXTDETECTOR_HPP__
#define __OPENCV_TEXT_TEXTDETECTOR_HPP__
#include <vector>
#include <string>
#include <iostream>
#include <sstream>
#include"ocr.hpp"
namespace cv
{
namespace text
{
//! @addtogroup text_recognize
//! @{
//base class BaseDetector declares a common API that would be used in a typical text
//detection scenario
class CV_EXPORTS_W BaseDetector
{
public:
virtual ~BaseDetector() {};
virtual void run(Mat& image,
std::vector<Rect>* component_rects=NULL,
std::vector<float>* component_confidences=NULL,
int component_level=0) = 0;
virtual void run(Mat& image, Mat& mask,
std::vector<Rect>* component_rects=NULL,
std::vector<float>* component_confidences=NULL,
int component_level=0) = 0;
/** @brief Main functionality of the detector hierarchy. Subclasses provide
* default parameters for all parameters other than the input image.
*/
// virtual std::vector<Rect>* run(InputArray image){
// //std::string res;
// std::vector<Rect> component_rects;
// std::vector<float> component_confidences;
// //std::vector<std::string> component_texts;
// Mat inputImage=image.getMat();
// this->run(inputImage,&component_rects,
// &component_confidences,OCR_LEVEL_WORD);
// return *component_rects;
// }
};
//Classifiers should provide different backends
//For the moment only Caffe is implemented
//enum{
// OCR_HOLISTIC_BACKEND_NONE,
// OCR_HOLISTIC_BACKEND_CAFFE
//};
/** @brief textDetector class provides the functionality of text bounding box detection.
* A TextImageClassifier is employed to find the most probable text regions
* given an input image.
*
* This class implements the logic of providing text detections given an image
* classifier. The classifier can be any TextImageClassifier, but the classifier for which this
* class was built is the DictNet. In order to load it the following files should be downloaded:
* <http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_deploy.prototxt>
* <http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg.caffemodel>
* <http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_labels.txt>
*/
class CV_EXPORTS_W textDetector : public BaseDetector
{
public:
virtual void run(Mat& image, std::vector<Rect>* component_rects=NULL,
std::vector<float>* component_confidences=NULL,
int component_level=OCR_LEVEL_WORD)=0;
/** @brief Detect text in an image using a segmentation based word-spotting/classifier cnn.
Takes an image as input and optionally returns the Rects of the individual text elements
found (e.g. words), together with their confidence values.
@param image Input image CV_8UC1 or CV_8UC3
@param mask is totally ignored and is only available for compatibility reasons
@param component_rects if not NULL, one Rect per detected text element will be put in the vector.
@param component_confidences if not NULL, one confidence value per detected text element will be put in the vector.
@param component_level must be OCR_LEVEL_WORD.
*/
virtual void run(Mat& image, Mat& mask, std::vector<Rect>* component_rects=NULL,
std::vector<float>* component_confidences=NULL,
int component_level=OCR_LEVEL_WORD)=0;
/**
@brief Method that provides a quick and simple interface to detect text inside an image
@param inputImage an image expected to be a CV_U8C1 or CV_U8C3 of any size
@param Bbox a vector of Rect that will store the bounding boxes of the detected words
@param confidence a vector of float that will be updated with the confidence the classifier has for each detected box
*/
CV_WRAP virtual void textDetectInImage(InputArray inputImage,CV_OUT std::vector<Rect>& Bbox,CV_OUT std::vector<float>& confidence)=0;
/** @brief simple getter for the image classifier used by the detector
*/
CV_WRAP virtual Ptr<TextImageClassifier> getClassifier()=0;
/** @brief Creates an instance of the textDetector class.
@param classifierPtr an instance of TextImageClassifier, normally a DeepCNN instance
*/
CV_WRAP static Ptr<textDetector> create(Ptr<TextImageClassifier> classifierPtr);
/** @brief Creates an instance of the textDetector class and implicitly also a DeepCNN classifier.
@param modelArchFilename the relative or absolute path to the prototxt file describing the classifiers architecture.
@param modelWeightsFilename the relative or absolute path to the file containing the pretrained weights of the model in caffe-binary form.
*/
CV_WRAP static Ptr<textDetector> create(String modelArchFilename, String modelWeightsFilename);
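/** @brief Example of use (a minimal sketch; the prototxt/caffemodel/image names below are
placeholders for a TextBoxes-style detection model, not files shipped with the module):
@code
    Ptr<textDetector> detector = textDetector::create("textbox_deploy.prototxt",
                                                      "textbox.caffemodel");
    Mat image = imread("scene.jpg");
    std::vector<Rect> boxes;
    std::vector<float> scores;
    detector->textDetectInImage(image, boxes, scores);
@endcode
*/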
};
//! @}
}//namespace text
}//namespace cv
#endif // __OPENCV_TEXT_TEXTDETECTOR_HPP__

@ -0,0 +1,879 @@
#include "precomp.hpp"
#include "opencv2/imgproc.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/core.hpp"
#include <iostream>
#include <fstream>
#include <sstream>
#include <queue>
#include <algorithm>
#include <iosfwd>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#ifdef HAVE_CAFFE
#include "caffe/caffe.hpp"
#endif
namespace cv { namespace text {
//Maybe OpenCV has a routine better suited
inline bool fileExists (String filename) {
std::ifstream f(filename.c_str());
return f.good();
}
//************************************************************************************
//****************** ImagePreprocessor *******************************************
//************************************************************************************
void ImagePreprocessor::preprocess(InputArray input,OutputArray output,Size sz,int outputChannels){
Mat inpImg=input.getMat();
Mat outImg;
this->preprocess_(inpImg,outImg,sz,outputChannels);
outImg.copyTo(output);
}
void ImagePreprocessor::set_mean(Mat mean){
this->set_mean_(mean);
}
class ResizerPreprocessor: public ImagePreprocessor{
protected:
void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){
//TODO put all the logic of channel and depth conversions in ImageProcessor class
CV_Assert(outputChannels==1 || outputChannels==3);
CV_Assert(input.channels()==1 || input.channels()==3);
if(input.channels()!=outputChannels)
{
Mat tmpInput;
if(outputChannels==1){
cvtColor(input,tmpInput,COLOR_BGR2GRAY);
if(input.depth()==CV_8U)
{
tmpInput.convertTo(output,CV_32FC1,1/255.0);
}else
{//Assuming values are at the desired [0,1] range
tmpInput.convertTo(output, CV_32FC1);
}
}else
{
cvtColor(input,tmpInput,COLOR_GRAY2BGR);
if(input.depth()==CV_8U)
{
tmpInput.convertTo(output,CV_32FC3,1/255.0);
}else
{//Assuming values are at the desired [0,1] range
tmpInput.convertTo(output, CV_32FC3);
}
}
}else
{
if(input.channels()==1)
{
if(input.depth()==CV_8U)
{
input.convertTo(output, CV_32FC1,1/255.0);
}else
{//Assuming values are at the desired [0,1] range
input.convertTo(output, CV_32FC1);
}
}else
{
if(input.depth()==CV_8U){
input.convertTo(output, CV_32FC3,1/255.0);
}else
{//Assuming values are at the desired [0,1] range
input.convertTo(output, CV_32FC3);
}
}
}
if(outputSize.width!=0 && outputSize.height!=0)
{
resize(output,output,outputSize);
}
}
//void set_mean_(Mat m){}
public:
ResizerPreprocessor(){}
~ResizerPreprocessor(){}
};
class StandarizerPreprocessor: public ImagePreprocessor{
protected:
double sigma_;
//void set_mean_(Mat M){}
void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){
//TODO put all the logic of channel and depth conversions in ImageProcessor class
CV_Assert(outputChannels==1 || outputChannels==3);
CV_Assert(input.channels()==1 || input.channels()==3);
if(input.channels()!=outputChannels)
{
Mat tmpInput;
if(outputChannels==1)
{
cvtColor(input,tmpInput,COLOR_BGR2GRAY);
if(input.depth()==CV_8U)
{
tmpInput.convertTo(output,CV_32FC1,1/255.0);
}else
{//Assuming values are at the desired [0,1] range
tmpInput.convertTo(output, CV_32FC1);
}
}else
{
cvtColor(input,tmpInput,COLOR_GRAY2BGR);
if(input.depth()==CV_8U)
{
tmpInput.convertTo(output,CV_32FC3,1/255.0);
}else
{//Assuming values are at the desired [0,1] range
tmpInput.convertTo(output, CV_32FC3);
}
}
}else
{
if(input.channels()==1)
{
if(input.depth()==CV_8U)
{
input.convertTo(output, CV_32FC1,1/255.0);
}else
{//Assuming values are at the desired [0,1] range
input.convertTo(output, CV_32FC1);
}
}else
{
if(input.depth()==CV_8U)
{
input.convertTo(output, CV_32FC3,1/255.0);
}else
{//Assuming values are at the desired [0,1] range
input.convertTo(output, CV_32FC3);
}
}
}
if(outputSize.width!=0 && outputSize.height!=0)
{
resize(output,output,outputSize);
}
Scalar mean,dev;
meanStdDev(output,mean,dev);
subtract(output,mean[0],output);
divide(output,(dev[0]/sigma_),output);
}
public:
StandarizerPreprocessor(double sigma):sigma_(sigma){}
~StandarizerPreprocessor(){}
};
class customPreprocessor:public ImagePreprocessor{
protected:
double rawval_;
Mat mean_;
String channel_order_;
void set_mean_(Mat imMean_){
imMean_.copyTo(this->mean_);
}
void set_raw_scale(int rawval){
rawval_ = rawval;
}
void set_channels(String channel_order){
channel_order_=channel_order;
}
void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){
//TODO put all the logic of channel and depth conversions in ImageProcessor class
CV_Assert(outputChannels==1 || outputChannels==3);
CV_Assert(input.channels()==1 || input.channels()==3);
if(input.channels()!=outputChannels)
{
Mat tmpInput;
if(outputChannels==1)
{
cvtColor(input,tmpInput,COLOR_BGR2GRAY);
if(input.depth()==CV_8U)
{
if (rawval_ == 1)
tmpInput.convertTo(output,CV_32FC1,1/255.0);
else
tmpInput.convertTo(output,CV_32FC1);
}else
{//Assuming values are at the desired [0,1] range
if (rawval_ ==1)
tmpInput.convertTo(output, CV_32FC1);
else
tmpInput.convertTo(output, CV_32FC1,rawval_);
}
}else
{
cvtColor(input,tmpInput,COLOR_GRAY2BGR);
if(input.depth()==CV_8U)
{
if (rawval_ == 1)
tmpInput.convertTo(output,CV_32FC3,1/255.0);
else
tmpInput.convertTo(output,CV_32FC3);
}else
{//Assuming values are at the desired [0,1] range
if (rawval_ ==1)
tmpInput.convertTo(output, CV_32FC3);
else
tmpInput.convertTo(output, CV_32FC3,rawval_);
}
}
}else
{
if(input.channels()==1)
{
if(input.depth()==CV_8U)
{
if (rawval_ == 1)
input.convertTo(output,CV_32FC1,1/255.0);
else
input.convertTo(output,CV_32FC1);
}else
{//Assuming values are at the desired [0,1] range
if (rawval_ ==1)
input.convertTo(output, CV_32FC1);
else
input.convertTo(output, CV_32FC1,rawval_);
}
}else
{
if(input.depth()==CV_8U)
{
if (rawval_ == 1)
input.convertTo(output,CV_32FC3,1/255.0);
else
input.convertTo(output,CV_32FC3);
}else
{//Assuming values are at the desired [0,1] range
if (rawval_ ==1)
input.convertTo(output, CV_32FC3);
else
input.convertTo(output, CV_32FC3,rawval_);
}
}
}
if(outputSize.width!=0 && outputSize.height!=0)
{
resize(output,output,outputSize);
}
if (!this->mean_.empty()){
Scalar mean_s(this->mean_.at<uchar>(0,0),this->mean_.at<uchar>(0,1),this->mean_.at<uchar>(0,2));
subtract(output,mean_s,output);
}
else{
Scalar mean_s;
mean_s = mean(output);
subtract(output,mean_s,output);
}
}
public:
customPreprocessor( double rawval,String channel_order):rawval_(rawval),channel_order_(channel_order){}
~customPreprocessor(){}
};
class MeanSubtractorPreprocessor: public ImagePreprocessor{
protected:
Mat mean_;
//void set_mean_(Mat m){}
void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){
//TODO put all the logic of channel and depth conversions in ImageProcessor class
CV_Assert(this->mean_.cols==outputSize.width && this->mean_.rows ==outputSize.height);
CV_Assert(outputChannels==1 || outputChannels==3);
CV_Assert(input.channels()==1 || input.channels()==3);
if(input.channels()!=outputChannels)
{
Mat tmpInput;
if(outputChannels==1)
{
cvtColor(input,tmpInput,COLOR_BGR2GRAY);
if(input.depth()==CV_8U)
{
tmpInput.convertTo(output,CV_32FC1,1/255.0);
}else
{//Assuming values are at the desired [0,1] range
tmpInput.convertTo(output, CV_32FC1);
}
}else
{
cvtColor(input,tmpInput,COLOR_GRAY2BGR);
if(input.depth()==CV_8U)
{
tmpInput.convertTo(output,CV_32FC3,1/255.0);
}else
{//Assuming values are at the desired [0,1] range
tmpInput.convertTo(output, CV_32FC3);
}
}
}else
{
if(input.channels()==1)
{
if(input.depth()==CV_8U)
{
input.convertTo(output, CV_32FC1,1/255.0);
}else
{//Assuming values are at the desired [0,1] range
input.convertTo(output, CV_32FC1);
}
}else
{
if(input.depth()==CV_8U)
{
input.convertTo(output, CV_32FC3,1/255.0);
}else
{//Assuming values are at the desired [0,1] range
input.convertTo(output, CV_32FC3);
}
}
}
if(outputSize.width!=0 && outputSize.height!=0)
{
resize(output,output,outputSize);
}
subtract(output,this->mean_,output);
}
public:
MeanSubtractorPreprocessor(Mat mean)
{
mean.copyTo(this->mean_);
}
~MeanSubtractorPreprocessor(){}
};
Ptr<ImagePreprocessor> ImagePreprocessor::createResizer()
{
return Ptr<ImagePreprocessor>(new ResizerPreprocessor);
}
Ptr<ImagePreprocessor> ImagePreprocessor::createImageStandarizer(double sigma)
{
return Ptr<ImagePreprocessor>(new StandarizerPreprocessor(sigma));
}
Ptr<ImagePreprocessor> ImagePreprocessor::createImageCustomPreprocessor(double rawval,String channel_order)
{
return Ptr<ImagePreprocessor>(new customPreprocessor(rawval,channel_order));
}
Ptr<ImagePreprocessor> ImagePreprocessor::createImageMeanSubtractor(InputArray meanImg)
{
Mat tmp=meanImg.getMat();
return Ptr<ImagePreprocessor>(new MeanSubtractorPreprocessor(tmp));
}
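//Example of wiring a preprocessor to a classifier (a minimal sketch; the file names are
//placeholders, and DeepCNN::create is declared in ocr.hpp):
//  Ptr<ImagePreprocessor> prep = ImagePreprocessor::createImageStandarizer(113);
//  Ptr<DeepCNN> cnn = DeepCNN::create("arch.prototxt", "weights.caffemodel", prep, 100);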
//************************************************************************************
//****************** TextImageClassifier *****************************************
//************************************************************************************
void TextImageClassifier::preprocess(const Mat& input,Mat& output)
{
this->preprocessor_->preprocess_(input,output,this->inputGeometry_,this->channelCount_);
}
void TextImageClassifier::setPreprocessor(Ptr<ImagePreprocessor> ptr)
{
CV_Assert(!ptr.empty());
preprocessor_=ptr;
}
Ptr<ImagePreprocessor> TextImageClassifier::getPreprocessor()
{
return preprocessor_;
}
class DeepCNNCaffeImpl: public DeepCNN{
protected:
void classifyMiniBatch(std::vector<Mat> inputImageList, Mat outputMat)
{
//Classifies a list of images containing at most minibatchSz_ images
CV_Assert(int(inputImageList.size())<=this->minibatchSz_);
CV_Assert(outputMat.isContinuous());
#ifdef HAVE_CAFFE
net_->input_blobs()[0]->Reshape(inputImageList.size(), this->channelCount_,this->inputGeometry_.height,this->inputGeometry_.width);
net_->Reshape();
float* inputBuffer=net_->input_blobs()[0]->mutable_cpu_data();
float* inputData=inputBuffer;
for(size_t imgNum=0;imgNum<inputImageList.size();imgNum++)
{
std::vector<Mat> input_channels;
Mat preprocessed;
// if the image has multiple color channels the input layer should be populated accordingly
for (int channel=0;channel < this->channelCount_;channel++){
cv::Mat netInputWraped(this->inputGeometry_.height, this->inputGeometry_.width, CV_32FC1, inputData);
input_channels.push_back(netInputWraped);
//input_data += width * height;
inputData+=(this->inputGeometry_.height*this->inputGeometry_.width);
}
this->preprocess(inputImageList[imgNum],preprocessed);
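//split() scatters the preprocessed channels into the Mats created above, which wrap
//the network's own input buffer, so no extra copy into the blob is needed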
split(preprocessed, input_channels);
}
this->net_->ForwardPrefilled();
const float* outputNetData=net_->output_blobs()[0]->cpu_data();
this->outputGeometry_ = Size(net_->output_blobs()[0]->width(),net_->output_blobs()[0]->height());
int outputSz = this->outputSize_ * this->outputGeometry_.height * this->outputGeometry_.width;
//outputMat.resize(this->outputGeometry_.height * this->outputGeometry_.width);
float*outputMatData=(float*)(outputMat.data);
memcpy(outputMatData,outputNetData,sizeof(float)*outputSz*inputImageList.size());
#endif
}
void process_(Mat inputImage, Mat &outputMat)
{
// do forward pass and stores the output in outputMat
//Process one image
CV_Assert(this->minibatchSz_==1);
//CV_Assert(outputMat.isContinuous());
#ifdef HAVE_CAFFE
net_->input_blobs()[0]->Reshape(1, this->channelCount_,this->inputGeometry_.height,this->inputGeometry_.width);
net_->Reshape();
float* inputBuffer=net_->input_blobs()[0]->mutable_cpu_data();
float* inputData=inputBuffer;
std::vector<Mat> input_channels;
Mat preprocessed;
// if the image has multiple color channels the input layer should be populated accordingly
for (int channel=0;channel < this->channelCount_;channel++){
cv::Mat netInputWraped(this->inputGeometry_.height, this->inputGeometry_.width, CV_32FC1, inputData);
input_channels.push_back(netInputWraped);
//input_data += width * height;
inputData+=(this->inputGeometry_.height*this->inputGeometry_.width);
}
this->preprocess(inputImage,preprocessed);
split(preprocessed, input_channels);
//preprocessed.copyTo(netInputWraped);
this->net_->Forward();
const float* outputNetData=net_->output_blobs()[0]->cpu_data();
// const float* outputNetData1=net_->output_blobs()[1]->cpu_data();
this->outputGeometry_ = Size(net_->output_blobs()[0]->width(),net_->output_blobs()[0]->height());
int outputSz = this->outputSize_ * this->outputGeometry_.height * this->outputGeometry_.width;
outputMat.create(this->outputGeometry_.height , this->outputGeometry_.width,CV_32FC1);
float*outputMatData=(float*)(outputMat.data);
memcpy(outputMatData,outputNetData,sizeof(float)*outputSz);
#endif
}
#ifdef HAVE_CAFFE
Ptr<caffe::Net<float> > net_;
#endif
//Size inputGeometry_;
int minibatchSz_;//The existence of the assignment operator mandates this to be nonconst
int outputSize_;
public:
DeepCNNCaffeImpl(const DeepCNNCaffeImpl& dn):
minibatchSz_(dn.minibatchSz_),outputSize_(dn.outputSize_){
channelCount_=dn.channelCount_;
inputGeometry_=dn.inputGeometry_;
//Implemented to suppress Visual Studio warning "assignment operator could not be generated"
#ifdef HAVE_CAFFE
this->net_=dn.net_;
#endif
}
DeepCNNCaffeImpl& operator=(const DeepCNNCaffeImpl &dn)
{
#ifdef HAVE_CAFFE
this->net_=dn.net_;
#endif
this->setPreprocessor(dn.preprocessor_);
this->inputGeometry_=dn.inputGeometry_;
this->channelCount_=dn.channelCount_;
this->minibatchSz_=dn.minibatchSz_;
this->outputSize_=dn.outputSize_;
this->preprocessor_=dn.preprocessor_;
this->outputGeometry_=dn.outputGeometry_;
return *this;
//Implemented to suppress Visual Studio warning "assignment operator could not be generated"
}
DeepCNNCaffeImpl(String modelArchFilename, String modelWeightsFilename,Ptr<ImagePreprocessor> preprocessor, int maxMinibatchSz)
:minibatchSz_(maxMinibatchSz)
{
CV_Assert(this->minibatchSz_>0);
CV_Assert(fileExists(modelArchFilename));
CV_Assert(fileExists(modelWeightsFilename));
CV_Assert(!preprocessor.empty());
this->setPreprocessor(preprocessor);
#ifdef HAVE_CAFFE
this->net_.reset(new caffe::Net<float>(modelArchFilename, caffe::TEST));
CV_Assert(net_->num_inputs()==1);
CV_Assert(net_->num_outputs()==1);
CV_Assert(this->net_->input_blobs()[0]->channels()==1
||this->net_->input_blobs()[0]->channels()==3);
this->channelCount_=this->net_->input_blobs()[0]->channels();
this->net_->CopyTrainedLayersFrom(modelWeightsFilename);
caffe::Blob<float>* inputLayer = this->net_->input_blobs()[0];
this->inputGeometry_=Size(inputLayer->width(), inputLayer->height());
this->channelCount_ = inputLayer->channels();
inputLayer->Reshape(this->minibatchSz_,this->channelCount_,this->inputGeometry_.height, this->inputGeometry_.width);
net_->Reshape();
this->outputSize_=net_->output_blobs()[0]->channels();
this->outputGeometry_ = Size(net_->output_blobs()[0]->width(),net_->output_blobs()[0]->height());
#else
CV_Error(Error::StsError,"Caffe not available during compilation!");
#endif
}
void classify(InputArray image, OutputArray classProbabilities)
{
std::vector<Mat> inputImageList;
inputImageList.push_back(image.getMat());
classifyBatch(inputImageList,classProbabilities);
}
void detect(InputArray image, OutputArray Bbox_prob)
{
Bbox_prob.create(this->outputGeometry_,CV_32F);//allocate the output; process_() recreates the Mat with the geometry reported by the net
Mat outputMat = Bbox_prob.getMat();
process_(image.getMat(),outputMat);
//copy back to outputArray
outputMat.copyTo(Bbox_prob);
}
void classifyBatch(InputArrayOfArrays inputImageList, OutputArray classProbabilities)
{
std::vector<Mat> allImageVector;
inputImageList.getMatVector(allImageVector);
size_t outputSize=size_t(this->outputSize_);//temporary variable to avoid int to size_t arithmetic
size_t minibatchSize=size_t(this->minibatchSz_);//temporary variable to avoid int to size_t arithmetic
classProbabilities.create(Size(int(outputSize),int(allImageVector.size())),CV_32F);
Mat outputMat = classProbabilities.getMat();
for(size_t imgNum=0;imgNum<allImageVector.size();imgNum+=minibatchSize)
{
size_t rangeEnd=imgNum+std::min<size_t>(allImageVector.size()-imgNum,minibatchSize);
std::vector<Mat>::const_iterator from=std::vector<Mat>::const_iterator(allImageVector.begin()+imgNum);
std::vector<Mat>::const_iterator to=std::vector<Mat>::const_iterator(allImageVector.begin()+rangeEnd);
std::vector<Mat> minibatchInput(from,to);
classifyMiniBatch(minibatchInput,outputMat.rowRange(int(imgNum),int(rangeEnd)));
}
}
int getOutputSize()
{
return this->outputSize_;
}
Size getOutputGeometry()
{
return this->outputGeometry_;
}
int getMinibatchSize()
{
return this->minibatchSz_;
}
int getBackend()
{
return OCR_HOLISTIC_BACKEND_CAFFE;
}
};
Ptr<DeepCNN> DeepCNN::create(String archFilename,String weightsFilename,Ptr<ImagePreprocessor> preprocessor,int minibatchSz,int backEnd)
{
if(preprocessor.empty())
{
preprocessor=ImagePreprocessor::createResizer();
}
switch(backEnd){
case OCR_HOLISTIC_BACKEND_CAFFE:
return Ptr<DeepCNN>(new DeepCNNCaffeImpl(archFilename, weightsFilename,preprocessor, minibatchSz));
break;
case OCR_HOLISTIC_BACKEND_NONE:
default:
CV_Error(Error::StsError,"DeepCNN::create backend not implemented");
return Ptr<DeepCNN>();
break;
}
}
Ptr<DeepCNN> DeepCNN::createDictNet(String archFilename,String weightsFilename,int backEnd)
{
Ptr<ImagePreprocessor> preprocessor=ImagePreprocessor::createImageStandarizer(113);
switch(backEnd){
case OCR_HOLISTIC_BACKEND_CAFFE:
return Ptr<DeepCNN>(new DeepCNNCaffeImpl(archFilename, weightsFilename,preprocessor, 100));
break;
case OCR_HOLISTIC_BACKEND_NONE:
default:
CV_Error(Error::StsError,"DeepCNN::create backend not implemented");
return Ptr<DeepCNN>();
break;
}
}
namespace cnn_config{
namespace caffe_backend{
#ifdef HAVE_CAFFE
bool getCaffeGpuMode()
{
return caffe::Caffe::mode()==caffe::Caffe::GPU;
}
void setCaffeGpuMode(bool useGpu)
{
if(useGpu)
{
caffe::Caffe::set_mode(caffe::Caffe::GPU);
}else
{
caffe::Caffe::set_mode(caffe::Caffe::CPU);
}
}
bool getCaffeAvailable()
{
return true;
}
#else
bool getCaffeGpuMode()
{
CV_Error(Error::StsError,"Caffe not available during compilation!");
return 0;
}
void setCaffeGpuMode(bool useGpu)
{
CV_Error(Error::StsError,"Caffe not available during compilation!");
CV_Assert(useGpu==1);//Compilation directives force
}
bool getCaffeAvailable(){
return 0;
}
#endif
}//namespace caffe
}//namespace cnn_config
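//Typical use of the caffe_backend helpers (a sketch): enable GPU mode only when caffe
//support was compiled in, e.g.
//  if(cnn_config::caffe_backend::getCaffeAvailable())
//      cnn_config::caffe_backend::setCaffeGpuMode(true);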
class OCRHolisticWordRecognizerImpl: public OCRHolisticWordRecognizer{
private:
struct NetOutput{
//Auxiliary structure that handles the logic of getting class ids and probabillities from
//the raw outputs of caffe
int wordIdx;
float probabillity;
static bool sorter(const NetOutput& o1,const NetOutput& o2)
{//used with std::sort to provide the most probable class
return o1.probabillity>o2.probabillity;
}
static void getOutputs(const float* buffer,int nbOutputs,std::vector<NetOutput>& res)
{
res.resize(nbOutputs);
for(int k=0;k<nbOutputs;k++)
{
res[k].wordIdx=k;
res[k].probabillity=buffer[k];
}
std::sort(res.begin(),res.end(),NetOutput::sorter);
}
static void getClassification(const float* buffer,int nbOutputs,int &classNum,double& confidence)
{
std::vector<NetOutput> tmp;
getOutputs(buffer,nbOutputs,tmp);
classNum=tmp[0].wordIdx;
confidence=tmp[0].probabillity;
}
};
protected:
std::vector<String> labels_;
Ptr<TextImageClassifier> classifier_;
public:
OCRHolisticWordRecognizerImpl(Ptr<TextImageClassifier> classifierPtr,String vocabularyFilename):classifier_(classifierPtr)
{
CV_Assert(fileExists(vocabularyFilename));
std::ifstream labelsFile(vocabularyFilename.c_str());
if(!labelsFile)
{
CV_Error(Error::StsError,"Could not read Labels from file");
}
std::string line;
while (std::getline(labelsFile, line))
{
labels_.push_back(std::string(line));
}
CV_Assert(this->classifier_->getOutputSize()==int(this->labels_.size()));
}
OCRHolisticWordRecognizerImpl(Ptr<TextImageClassifier> classifierPtr,const std::vector<String>& vocabulary):classifier_(classifierPtr)
{
this->labels_=vocabulary;
CV_Assert(this->classifier_->getOutputSize()==int(this->labels_.size()));
}
void recogniseImage(InputArray inputImage,CV_OUT String& transcription,CV_OUT double& confidence)
{
Mat netOutput;
this->classifier_->classify(inputImage,netOutput);
int classNum;
NetOutput::getClassification((float*)(netOutput.data),this->classifier_->getOutputSize(),classNum,confidence);
transcription=this->labels_[classNum];
}
void recogniseImageBatch(InputArrayOfArrays inputImageList,CV_OUT std::vector<String>& transcriptionVec,CV_OUT std::vector<double>& confidenceVec)
{
Mat netOutput;
this->classifier_->classifyBatch(inputImageList,netOutput);
for(int k=0;k<netOutput.rows;k++)
{
int classNum;
double confidence;
NetOutput::getClassification((float*)(netOutput.row(k).data),this->classifier_->getOutputSize(),classNum,confidence);
transcriptionVec.push_back(this->labels_[classNum]);
confidenceVec.push_back(confidence);
}
}
void run(Mat& image, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0)
{
CV_Assert(component_level==OCR_LEVEL_WORD);//Components are not applicable for word spotting
double confidence;
String transcription;
recogniseImage(image,transcription,confidence);
output_text=transcription.c_str();
if(component_rects!=NULL)
{
component_rects->resize(1);
(*component_rects)[0]=Rect(0,0,image.size().width,image.size().height);
}
if(component_texts!=NULL)
{
component_texts->resize(1);
(*component_texts)[0]=transcription.c_str();
}
if(component_confidences!=NULL)
{
component_confidences->resize(1);
(*component_confidences)[0]=float(confidence);
}
}
void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL,
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
int component_level=0)
{
CV_Assert(mask.cols==image.cols && mask.rows== image.rows);//Mask is ignored because the CNN operates on a full image
this->run(image,output_text,component_rects,component_texts,component_confidences,component_level);
}
std::vector<String>& getVocabulary()
{
return this->labels_;
}
Ptr<TextImageClassifier> getClassifier()
{
return this->classifier_;
}
};
Ptr<OCRHolisticWordRecognizer> OCRHolisticWordRecognizer::create(Ptr<TextImageClassifier> classifierPtr,String vocabularyFilename )
{
return Ptr<OCRHolisticWordRecognizer>(new OCRHolisticWordRecognizerImpl(classifierPtr,vocabularyFilename));
}
Ptr<OCRHolisticWordRecognizer> OCRHolisticWordRecognizer::create(String modelArchFilename, String modelWeightsFilename, String vocabularyFilename)
{
Ptr<ImagePreprocessor> preprocessor=ImagePreprocessor::createImageStandarizer(113);
Ptr<TextImageClassifier> classifierPtr(new DeepCNNCaffeImpl(modelArchFilename,modelWeightsFilename,preprocessor,100));
return Ptr<OCRHolisticWordRecognizer>(new OCRHolisticWordRecognizerImpl(classifierPtr,vocabularyFilename));
}
Ptr<OCRHolisticWordRecognizer> OCRHolisticWordRecognizer::create(Ptr<TextImageClassifier> classifierPtr,const std::vector<String>& vocabulary)
{
return Ptr<OCRHolisticWordRecognizer>(new OCRHolisticWordRecognizerImpl(classifierPtr,vocabulary));
}
Ptr<OCRHolisticWordRecognizer> OCRHolisticWordRecognizer::create(String modelArchFilename, String modelWeightsFilename,const std::vector<String>& vocabulary){
Ptr<ImagePreprocessor> preprocessor=ImagePreprocessor::createImageStandarizer(113);
Ptr<TextImageClassifier> classifierPtr(new DeepCNNCaffeImpl(modelArchFilename,modelWeightsFilename,preprocessor,100));
return Ptr<OCRHolisticWordRecognizer>(new OCRHolisticWordRecognizerImpl(classifierPtr,vocabulary));
}
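//Example of use (a minimal sketch; the DictNet prototxt/caffemodel/label files must be
//obtained separately, see the documentation in ocr.hpp):
//  Ptr<OCRHolisticWordRecognizer> wordSpotter = OCRHolisticWordRecognizer::create(
//      "dictnet_vgg_deploy.prototxt", "dictnet_vgg.caffemodel", "dictnet_vgg_labels.txt");
//  String word; double conf;
//  wordSpotter->recogniseImage(wordImage, word, conf);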
} } //namespace text namespace cv

@ -0,0 +1,643 @@
#include "precomp.hpp"
#include "opencv2/imgproc.hpp"
#include "opencv2/core.hpp"
#include <iostream>
#include <fstream>
#include <sstream>
#include <queue>
#include <algorithm>
#include <iosfwd>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#ifdef HAVE_CAFFE
#include "caffe/caffe.hpp"
#endif
namespace cv { namespace text {
//The ImagePreprocessor, TextImageClassifier and DeepCNN implementations used by this
//detector are shared with ocr_holistic.cpp and are therefore not repeated here.
class textDetectImpl: public textDetector{
private:
struct NetOutput{
//Auxiliary structure that handles the logic of decoding bounding boxes and their
//text/no-text confidences from the raw output blob of the caffe detection network
Rect bbox;
float probability;
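//getOutputs() below assumes an SSD/TextBoxes-style detection_output layout: nCol values
//per candidate box, with the confidence score at column 2 and the normalised
//[xmin,ymin,xmax,ymax] coordinates at columns 3-6 (this reading is inferred from the
//indexing in the code, not from an external spec)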
static void getOutputs(const float* buffer,int nbrTextBoxes,int nCol,std::vector<NetOutput>& res,Size inputShape)
{
res.resize(nbrTextBoxes);
for(int k=0;k<nbrTextBoxes;k++)
{
float x_min = buffer[k*nCol+3]*inputShape.width;
float y_min = buffer[k*nCol+4]*inputShape.height;
float x_max = buffer[k*nCol+5]*inputShape.width;
float y_max = buffer[k*nCol +6]*inputShape.height;
x_min = x_min<0?0:x_min;
y_min = y_min<0?0:y_min;
x_max = x_max>=inputShape.width?inputShape.width-1:x_max;
y_max = y_max>=inputShape.height?inputShape.height-1:y_max;
float wd = x_max-x_min+1;
float ht = y_max-y_min+1;
res[k].bbox=Rect(int(x_min),int(y_min),int(wd),int(ht));
res[k].probability=buffer[k*nCol+2];
}
};
protected:
Ptr<TextImageClassifier> classifier_;
public:
textDetectImpl(Ptr<TextImageClassifier> classifierPtr):classifier_(classifierPtr)
{
}
void textDetectInImage(InputArray inputImage,CV_OUT std::vector<Rect>& Bbox,CV_OUT std::vector<float>& confidence)
{
Mat netOutput;
this->classifier_->detect(inputImage,netOutput);
Size outputGeometry = this->classifier_->getOutputGeometry();
int nbrTextBoxes = outputGeometry.height;
int nCol = outputGeometry.width;
std::vector<NetOutput> tmp;
Size inputImageShape = Size(inputImage.cols(),inputImage.rows());
NetOutput::getOutputs((float*)(netOutput.data),nbrTextBoxes,nCol,tmp,inputImageShape);
for (int k=0;k<nbrTextBoxes;k++)
{
Bbox.push_back(tmp[k].bbox);
confidence.push_back(tmp[k].probability);
}
}
void run(Mat& image, std::vector<Rect>* component_rects=NULL,
std::vector<float>* component_confidences=NULL,
int component_level=0)
{
CV_Assert(component_level==OCR_LEVEL_WORD);//Components are not applicable for word spotting
std::vector<Rect> bbox;
std::vector<float> score;
textDetectInImage(image,bbox,score);
if(component_rects!=NULL)
{
*component_rects = bbox;//copy the detections out; reassigning the pointer itself would never reach the caller
}
if(component_confidences!=NULL)
{
*component_confidences = score;
}
}
void run(Mat& image, Mat& mask, std::vector<Rect>* component_rects=NULL,
std::vector<float>* component_confidences=NULL,
int component_level=0)
{
CV_Assert(mask.cols==image.cols && mask.rows== image.rows);//Mask is ignored because the CNN operates on a full image
this->run(image,component_rects,component_confidences,component_level);
}
Ptr<TextImageClassifier> getClassifier()
{
return this->classifier_;
}
};
Ptr<textDetector> textDetector::create(Ptr<TextImageClassifier> classifierPtr)
{
return Ptr<textDetector>(new textDetectImpl(classifierPtr));
}
Ptr<textDetector> textDetector::create(String modelArchFilename, String modelWeightsFilename)
{
Ptr<ImagePreprocessor> preprocessor=ImagePreprocessor::createImageCustomPreprocessor(255);
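//per-channel BGR means subtracted by the preprocessor; 104/117/123 are the values
//commonly used with VGG-derived caffe detection models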
Mat textbox_mean(1,3,CV_8U);
textbox_mean.at<uchar>(0,0)=104;
textbox_mean.at<uchar>(0,1)=117;
textbox_mean.at<uchar>(0,2)=123;
preprocessor->set_mean(textbox_mean);
Ptr<TextImageClassifier> classifierPtr(DeepCNN::create(modelArchFilename,modelWeightsFilename,preprocessor,1));
return Ptr<textDetector>(new textDetectImpl(classifierPtr));
}
} } //namespace text namespace cv

@ -1,7 +1,13 @@
#ifndef __OPENCV_TEXT_CONFIG_HPP__
#define __OPENCV_TEXT_CONFIG_HPP__
// HAVE QT5
//#cmakedefine HAVE_QT5GUI
// HAVE CAFFE
//#cmakedefine HAVE_CAFFE
// HAVE OCR Tesseract
//#cmakedefine HAVE_TESSERACT
#endif
