diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index 58c4a49d12..3eb6f82e5f 100644 --- a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -157,6 +157,9 @@ void* allocSingletonNewBuffer(size_t size) { return malloc(size); } # ifndef PPC_FEATURE2_ARCH_3_00 # define PPC_FEATURE2_ARCH_3_00 0x00800000 # endif +# ifndef PPC_FEATURE_HAS_VSX +# define PPC_FEATURE_HAS_VSX 0x00000080 +# endif #endif #if defined _WIN32 || defined WINCE @@ -616,7 +619,7 @@ struct HWFeatures have[CV_CPU_MSA] = true; #endif - #if (defined __ppc64__ || defined __PPC64__) && defined __unix__ + #if (defined __ppc64__ || defined __PPC64__) && defined __linux__ unsigned int hwcap = getauxval(AT_HWCAP); if (hwcap & PPC_FEATURE_HAS_VSX) { hwcap = getauxval(AT_HWCAP2); @@ -626,8 +629,19 @@ struct HWFeatures have[CV_CPU_VSX] = (hwcap & PPC_FEATURE2_ARCH_2_07) != 0; } } + #elif (defined __ppc64__ || defined __PPC64__) && defined __FreeBSD__ + unsigned long hwcap = 0; + elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap)); + if (hwcap & PPC_FEATURE_HAS_VSX) { + elf_aux_info(AT_HWCAP2, &hwcap, sizeof(hwcap)); + if (hwcap & PPC_FEATURE2_ARCH_3_00) { + have[CV_CPU_VSX] = have[CV_CPU_VSX3] = true; + } else { + have[CV_CPU_VSX] = (hwcap & PPC_FEATURE2_ARCH_2_07) != 0; + } + } #else - // TODO: AIX, FreeBSD + // TODO: AIX, OpenBSD #if CV_VSX || defined _ARCH_PWR8 || defined __POWER9_VECTOR__ have[CV_CPU_VSX] = true; #endif diff --git a/modules/dnn/src/opencl/ocl4dnn_lrn.cl b/modules/dnn/src/opencl/ocl4dnn_lrn.cl index 31c9f49451..22370c7303 100644 --- a/modules/dnn/src/opencl/ocl4dnn_lrn.cl +++ b/modules/dnn/src/opencl/ocl4dnn_lrn.cl @@ -64,36 +64,37 @@ __kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global con const int step = height * width; __global const Dtype* in_off = in + offset; __global Dtype* out_off = out + offset; - KERNEL_ARG_DTYPE scale_val; int head = 0; const int pre_pad = (size - 1) / 2; const int post_pad = size - pre_pad - 1; - 
KERNEL_ARG_DTYPE accum_scale = 0; + float accum_scale = 0; // fill the scale at [n, :, h, w] // accumulate values while (head < post_pad && head < channels) { - accum_scale += in_off[head * step] * in_off[head * step]; + float v = in_off[head * step]; + accum_scale += v * v; ++head; } // both add and subtract while (head < channels) { - accum_scale += in_off[head * step] * in_off[head * step]; + float v = in_off[head * step]; + accum_scale += v * v; if (head - size >= 0) { - accum_scale -= in_off[(head - size) * step] - * in_off[(head - size) * step]; + v = in_off[(head - size) * step]; + accum_scale -= v * v; } - scale_val = k + accum_scale * alpha_over_size; - out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr(scale_val, negative_beta); + float scale_val = k + accum_scale * alpha_over_size; + out_off[(head - post_pad) * step] = (Dtype)((float)in_off[(head - post_pad) * step] * native_powr(scale_val, negative_beta)); ++head; } // subtract only while (head < channels + post_pad) { if (head - size >= 0) { - accum_scale -= in_off[(head - size) * step] - * in_off[(head - size) * step]; + float v = in_off[(head - size) * step]; + accum_scale -= v * v; } - scale_val = k + accum_scale * alpha_over_size; - out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr(scale_val, negative_beta); + float scale_val = k + accum_scale * alpha_over_size; + out_off[(head - post_pad) * step] = (Dtype)((float)in_off[(head - post_pad) * step] * native_powr(scale_val, negative_beta)); ++head; } } diff --git a/modules/dnn/src/tensorflow/tf_importer.cpp b/modules/dnn/src/tensorflow/tf_importer.cpp index 3a2b69747c..a6f9c07980 100644 --- a/modules/dnn/src/tensorflow/tf_importer.cpp +++ b/modules/dnn/src/tensorflow/tf_importer.cpp @@ -2289,6 +2289,7 @@ void TFImporter::parseMean(tensorflow::GraphDef& net, const tensorflow::NodeDef& const std::string& type = layer.op(); const int num_inputs = layer.input_size(); 
std::string pool_type = cv::toLowerCase(type); + DataLayout layout = getDataLayout(name, data_layouts); if (pool_type == "mean") { @@ -2352,6 +2353,16 @@ void TFImporter::parseMean(tensorflow::GraphDef& net, const tensorflow::NodeDef& if (!keepDims) { + if (layout == DATA_LAYOUT_NHWC) + { + LayerParams permLP; + int order[] = {0, 2, 3, 1}; // From OpenCV's NCHW to NHWC. + std::string permName = name + "/nhwc"; + Pin inpId = Pin(layerShapeName); + addPermuteLayer(order, permName, inpId); + layerShapeName = permName; + } + LayerParams squeezeLp; std::string squeezeName = name + "/squeeze"; CV_Assert(layer_id.find(squeezeName) == layer_id.end()); @@ -2374,22 +2385,30 @@ void TFImporter::parseMean(tensorflow::GraphDef& net, const tensorflow::NodeDef& layerParams.set("pool", pool_type); layerParams.set(axis == 2 ? "kernel_w" : "kernel_h", 1); layerParams.set(axis == 2 ? "global_pooling_h" : "global_pooling_w", true); - int id = dstNet.addLayer(name, "Pooling", layerParams); - layer_id[name] = id; - connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0); - if (!keepDims) + if (keepDims) + { + int id = dstNet.addLayer(name, "Pooling", layerParams); + layer_id[name] = id; + connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0); + } + else { // To keep correct order after squeeze dims we first need to change layout from NCHW to NHWC + std::string poolingName = name + "/Pooling"; + CV_Assert(layer_id.find(poolingName) == layer_id.end()); + int id = dstNet.addLayer(poolingName, "Pooling", layerParams); + layer_id[poolingName] = id; + connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0); + LayerParams permLP; int order[] = {0, 2, 3, 1}; // From OpenCV's NCHW to NHWC. 
- std::string permName = name + "/nchw"; - Pin inpId = Pin(name); + std::string permName = name + "/nhwc"; + Pin inpId = Pin(poolingName); addPermuteLayer(order, permName, inpId); LayerParams squeezeLp; - std::string squeezeName = name + "/squeeze"; - CV_Assert(layer_id.find(squeezeName) == layer_id.end()); + const std::string& squeezeName = name; squeezeLp.set("axis", indices.at(0)); squeezeLp.set("end_axis", indices.at(0) + 1); int squeezeId = dstNet.addLayer(squeezeName, "Flatten", squeezeLp); @@ -2401,32 +2420,34 @@ void TFImporter::parseMean(tensorflow::GraphDef& net, const tensorflow::NodeDef& { int order[] = {0, 2, 3, 1}; // From OpenCV's NCHW to NHWC. Pin inpId = parsePin(layer.input(0)); - addPermuteLayer(order, name + "/nhwc", inpId); + std::string permName = name + "/nhwc"; + addPermuteLayer(order, permName, inpId); layerParams.set("pool", pool_type); layerParams.set("kernel_h", 1); layerParams.set("global_pooling_w", true); - int id = dstNet.addLayer(name, "Pooling", layerParams); - layer_id[name] = id; - connect(layer_id, dstNet, inpId, id, 0); + std::string poolingName = name + "/Pooling"; + CV_Assert(layer_id.find(poolingName) == layer_id.end()); + int id = dstNet.addLayer(poolingName, "Pooling", layerParams); + layer_id[poolingName] = id; + connect(layer_id, dstNet, Pin(permName), id, 0); if (!keepDims) { LayerParams squeezeLp; - std::string squeezeName = name + "/squeeze"; - CV_Assert(layer_id.find(squeezeName) == layer_id.end()); + const std::string& squeezeName = name; int channel_id = 3; // TF NHWC layout squeezeLp.set("axis", channel_id - 1); squeezeLp.set("end_axis", channel_id); int squeezeId = dstNet.addLayer(squeezeName, "Flatten", squeezeLp); layer_id[squeezeName] = squeezeId; - connect(layer_id, dstNet, Pin(name), squeezeId, 0); + connect(layer_id, dstNet, Pin(poolingName), squeezeId, 0); } else { int order[] = {0, 3, 1, 2}; // From NHWC to OpenCV's NCHW. 
- Pin inpId = parsePin(name); - addPermuteLayer(order, name + "/nchw", inpId); + Pin inpId = parsePin(poolingName); + addPermuteLayer(order, name, inpId); } } } else { @@ -2435,18 +2456,26 @@ void TFImporter::parseMean(tensorflow::GraphDef& net, const tensorflow::NodeDef& layerParams.set("pool", pool_type); layerParams.set("global_pooling", true); - int id = dstNet.addLayer(name, "Pooling", layerParams); - layer_id[name] = id; - connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0); - if (!keepDims) + if (keepDims) { + int id = dstNet.addLayer(name, "Pooling", layerParams); + layer_id[name] = id; + connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0); + } + else + { + std::string poolingName = name + "/Pooling"; + CV_Assert(layer_id.find(poolingName) == layer_id.end()); + int id = dstNet.addLayer(poolingName, "Pooling", layerParams); + layer_id[poolingName] = id; + connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0); LayerParams flattenLp; - std::string flattenName = name + "/flatten"; - CV_Assert(layer_id.find(flattenName) == layer_id.end()); + const std::string& flattenName = name; int flattenId = dstNet.addLayer(flattenName, "Flatten", flattenLp); layer_id[flattenName] = flattenId; - connect(layer_id, dstNet, Pin(name), flattenId, 0); + connect(layer_id, dstNet, Pin(poolingName), flattenId, 0); + data_layouts[name] = DATA_LAYOUT_PLANAR; } } } diff --git a/modules/dnn/test/test_caffe_importer.cpp b/modules/dnn/test/test_caffe_importer.cpp index c0282207dd..24e4fa21a2 100644 --- a/modules/dnn/test/test_caffe_importer.cpp +++ b/modules/dnn/test/test_caffe_importer.cpp @@ -207,7 +207,7 @@ TEST_P(Reproducibility_AlexNet, Accuracy) ASSERT_EQ(inLayerShapes[0][3], 227); const float l1 = 1e-5; - const float lInf = (targetId == DNN_TARGET_OPENCL_FP16) ? 3e-3 : 1e-4; + const float lInf = (targetId == DNN_TARGET_OPENCL_FP16) ? 
4e-3 : 1e-4; net.setPreferableBackend(DNN_BACKEND_OPENCV); net.setPreferableTarget(targetId); diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp index 8e7f969480..812523311b 100644 --- a/modules/dnn/test/test_tf_importer.cpp +++ b/modules/dnn/test/test_tf_importer.cpp @@ -423,6 +423,19 @@ TEST_P(Test_TensorFlow_layers, pooling_reduce_sum) runTensorFlowNet("reduce_sum"); // a SUM pooling over all spatial dimensions. } +TEST_P(Test_TensorFlow_layers, pooling_reduce_sum2) +{ + int axises[] = {0, 1, 2, 3}; + for (int keepdims = 0; keepdims <= 1; ++keepdims) + { + for (int i = 0; i < sizeof(axises)/sizeof(axises[0]); ++i) + { + runTensorFlowNet(cv::format("reduce_sum_%d_%s", axises[i], (keepdims ? "True" : "False"))); + } + runTensorFlowNet(cv::format("reduce_sum_1_2_%s", keepdims ? "True" : "False")); + } +} + TEST_P(Test_TensorFlow_layers, max_pool_grad) { if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) diff --git a/modules/imgproc/include/opencv2/imgproc.hpp b/modules/imgproc/include/opencv2/imgproc.hpp index 527a55c97d..b1e946f9fe 100644 --- a/modules/imgproc/include/opencv2/imgproc.hpp +++ b/modules/imgproc/include/opencv2/imgproc.hpp @@ -2157,12 +2157,12 @@ The function finds lines in a set of points using a modification of the Hough tr @param point Input vector of points. Each vector must be encoded as a Point vector \f$(x,y)\f$. Type must be CV_32FC2 or CV_32SC2. @param lines Output vector of found lines. Each vector is encoded as a vector \f$(votes, rho, theta)\f$. The larger the value of 'votes', the higher the reliability of the Hough line. -@param lines_max Max count of hough lines. +@param lines_max Max count of Hough lines. @param threshold Accumulator threshold parameter. Only those lines are returned that get enough -votes ( \f$>\texttt{threshold}\f$ ) -@param min_rho Minimum Distance value of the accumulator in pixels. -@param max_rho Maximum Distance value of the accumulator in pixels. 
-@param rho_step Distance resolution of the accumulator in pixels. +votes ( \f$>\texttt{threshold}\f$ ). +@param min_rho Minimum value for \f$\rho\f$ for the accumulator (Note: \f$\rho\f$ can be negative. The absolute value \f$|\rho|\f$ is the distance of a line to the origin.). +@param max_rho Maximum value for \f$\rho\f$ for the accumulator. +@param rho_step Distance resolution of the accumulator. @param min_theta Minimum angle value of the accumulator in radians. @param max_theta Maximum angle value of the accumulator in radians. @param theta_step Angle resolution of the accumulator in radians. diff --git a/modules/imgproc/src/hough.cpp b/modules/imgproc/src/hough.cpp index 50784f6d99..10fa536454 100644 --- a/modules/imgproc/src/hough.cpp +++ b/modules/imgproc/src/hough.cpp @@ -975,7 +975,9 @@ void HoughLinesPointSet( InputArray _point, OutputArray _lines, int lines_max, i for(int n = 0; n < numangle; n++ ) { int r = cvRound( point.at<Point2f>(i).x * tabCos[n] + point.at<Point2f>(i).y * tabSin[n] - irho_min); - accum[(n+1) * (numrho+2) + r+1]++; + if ( r >= 0 && r <= numrho) { + accum[(n+1) * (numrho+2) + r+1]++; + } } // stage 2. 
find local maximums diff --git a/modules/imgproc/test/test_houghlines.cpp b/modules/imgproc/test/test_houghlines.cpp index fca0449b91..e90891274a 100644 --- a/modules/imgproc/test/test_houghlines.cpp +++ b/modules/imgproc/test/test_houghlines.cpp @@ -299,6 +299,36 @@ TEST_P(HoughLinesPointSetTest, regression) run_test(); } +TEST(HoughLinesPointSet, regression_21029) +{ + std::vector<Point2f> points; + points.push_back(Point2f(100, 100)); + points.push_back(Point2f(1000, 1000)); + points.push_back(Point2f(10000, 10000)); + points.push_back(Point2f(100000, 100000)); + + double rhoMin = 0; + double rhoMax = 10; + double rhoStep = 0.1; + + double thetaMin = 85 * CV_PI / 180.0; + double thetaMax = 95 * CV_PI / 180.0; + double thetaStep = 1 * CV_PI / 180.0; + + int lines_max = 5; + int threshold = 100; + + Mat lines; + + HoughLinesPointSet(points, lines, + lines_max, threshold, + rhoMin, rhoMax, rhoStep, + thetaMin, thetaMax, thetaStep + ); + + EXPECT_TRUE(lines.empty()); +} + INSTANTIATE_TEST_CASE_P( ImgProc, StandartHoughLinesTest, testing::Combine(testing::Values( "shared/pic5.png", "../stitching/a1.png" ), testing::Values( 1, 10 ), testing::Values( 0.05, 0.1 ), diff --git a/samples/cpp/falsecolor.cpp b/samples/cpp/falsecolor.cpp index f73ffad4ce..bfe43a72ca 100644 --- a/samples/cpp/falsecolor.cpp +++ b/samples/cpp/falsecolor.cpp @@ -16,14 +16,14 @@ struct ParamColorMap { String winName="False color"; static const String ColorMaps[] = { "Autumn", "Bone", "Jet", "Winter", "Rainbow", "Ocean", "Summer", "Spring", "Cool", "HSV", "Pink", "Hot", "Parula", "Magma", "Inferno", "Plasma", "Viridis", - "Cividis", "Twilight", "Twilight Shifted", "Turbo", "User defined (random)" }; + "Cividis", "Twilight", "Twilight Shifted", "Turbo", "Deep Green", "User defined (random)" }; static void TrackColorMap(int x, void *r) { ParamColorMap *p = (ParamColorMap*)r; Mat dst; p->iColormap= x; - if (x == COLORMAP_TURBO + 1) + if (x == COLORMAP_DEEPGREEN + 1) { Mat lutRND(256, 1, CV_8UC3); randu(lutRND, 
Scalar(0, 0, 0), Scalar(255, 255, 255)); @@ -97,10 +97,10 @@ int main(int argc, char** argv) imshow("Gray image",img); namedWindow(winName); - createTrackbar("colormap", winName,&p.iColormap,1,TrackColorMap,(void*)&p); + createTrackbar("colormap", winName, NULL, COLORMAP_DEEPGREEN + 1, TrackColorMap, (void*)&p); setTrackbarMin("colormap", winName, COLORMAP_AUTUMN); - setTrackbarMax("colormap", winName, COLORMAP_TURBO+1); - setTrackbarPos("colormap", winName, -1); + setTrackbarMax("colormap", winName, COLORMAP_DEEPGREEN + 1); + setTrackbarPos("colormap", winName, COLORMAP_AUTUMN); TrackColorMap(0, (void*)&p);