From b8d6f196d30e8672fc5c9939dc5eae68a1d16221 Mon Sep 17 00:00:00 2001 From: Suleyman TURKMEN Date: Tue, 2 Jul 2024 23:26:58 +0300 Subject: [PATCH 1/6] Added EDColor support to EdgeDrawing class. Fixed a bug where params.PFmode = true caused incorrect edge detection results. Updated edge_drawing.py to support the new EDColor feature. Created a new edge_drawing_demo.cpp for demonstrating the EDColor feature. Updated ximgproc.bib to include references for EDColor. Added tests for PFmode and EDColor in test_fld.cpp. Added CV_WRAP to Params::read and Params::write functions to expose them to Python bindings. the class NFALUT uses the code from original ED code --- modules/ximgproc/doc/ximgproc.bib | 10 + modules/ximgproc/include/opencv2/ximgproc.hpp | 48 +- .../include/opencv2/ximgproc/edge_drawing.hpp | 10 +- modules/ximgproc/samples/edge_drawing.py | 149 ++-- .../ximgproc/samples/edge_drawing_demo.cpp | 185 +++++ modules/ximgproc/samples/fld_lines.cpp | 62 +- modules/ximgproc/src/edge_drawing.cpp | 673 ++++++++++++++---- modules/ximgproc/src/edge_drawing_common.hpp | 210 ++++-- modules/ximgproc/test/test_fld.cpp | 46 +- 9 files changed, 1076 insertions(+), 317 deletions(-) create mode 100644 modules/ximgproc/samples/edge_drawing_demo.cpp diff --git a/modules/ximgproc/doc/ximgproc.bib b/modules/ximgproc/doc/ximgproc.bib index 279bac1d115..06b89f8a654 100644 --- a/modules/ximgproc/doc/ximgproc.bib +++ b/modules/ximgproc/doc/ximgproc.bib @@ -388,6 +388,16 @@ @article{akinlar2013edcircles publisher={Elsevier} } +@article{akinlar201782, + title = {ColorED: Color edge and segment detection by Edge Drawing (ED)}, + author = {Cuneyt Akinlar and Cihan Topal}, + journal = {Journal of Visual Communication and Image Representation}, + volume = {44}, + pages = {82-94}, + year = {2017}, + publisher={Academic Press} +} + @article{loke2021accelerated, title={Accelerated superpixel image segmentation with a parallelized DBSCAN algorithm}, author={Loke, Seng Cheong and MacDonald, Bruce A
and Parsons, Matthew and W{\"u}nsche, Burkhard Claus}, diff --git a/modules/ximgproc/include/opencv2/ximgproc.hpp b/modules/ximgproc/include/opencv2/ximgproc.hpp index 099205126cb..4c171c4ce19 100644 --- a/modules/ximgproc/include/opencv2/ximgproc.hpp +++ b/modules/ximgproc/include/opencv2/ximgproc.hpp @@ -83,18 +83,42 @@ @defgroup ximgproc_fast_line_detector Fast line detector - @defgroup ximgproc_edge_drawing EdgeDrawing - - EDGE DRAWING LIBRARY FOR GEOMETRIC FEATURE EXTRACTION AND VALIDATION - - Edge Drawing (ED) algorithm is an proactive approach on edge detection problem. In contrast to many other existing edge detection algorithms which follow a subtractive - approach (i.e. after applying gradient filters onto an image eliminating pixels w.r.t. several rules, e.g. non-maximal suppression and hysteresis in Canny), ED algorithm - works via an additive strategy, i.e. it picks edge pixels one by one, hence the name Edge Drawing. Then we process those random shaped edge segments to extract higher level - edge features, i.e. lines, circles, ellipses, etc. The popular method of extraction edge pixels from the thresholded gradient magnitudes is non-maximal supression that tests - every pixel whether it has the maximum gradient response along its gradient direction and eliminates if it does not. However, this method does not check status of the - neighboring pixels, and therefore might result low quality (in terms of edge continuity, smoothness, thinness, localization) edge segments. Instead of non-maximal supression, - ED points a set of edge pixels and join them by maximizing the total gradient response of edge segments. Therefore it can extract high quality edge segments without need for - an additional hysteresis step. + @defgroup ximgproc_edge_drawing Edge Drawing + + Edge Drawing (ED) algorithm for geometric feature extraction and validation. + + The Edge Drawing (ED) algorithm is a proactive approach to the edge detection problem. 
+ In contrast to many existing edge detection algorithms, which follow a subtractive + approach (i.e., applying gradient filters and eliminating pixels based on several rules, + such as non-maximal suppression and hysteresis in the Canny Edge Detector), the ED algorithm + operates via an additive strategy. It selects edge pixels one by one and connects them, + hence the name Edge Drawing. + + ED offers several key advantages: + + 1. **Additive Strategy**: Instead of eliminating non-edge pixels after gradient filtering, + ED incrementally builds up edge segments by selecting and connecting pixels based on + their gradient response. This differs from traditional methods, which rely on + non-maximal suppression and hysteresis to filter out non-edge pixels. + + 2. **Edge Pixel Selection**: ED selects edge pixels by analyzing their local gradient + response, while also considering neighboring pixels. This results in smoother and + more continuous edge segments, as ED aims to maximize the overall gradient strength + along the edge segment. + + 3. **Edge Segment Formation**: Traditional methods, such as non-maximal suppression, + check whether a pixel has the maximum gradient response along its gradient direction, + eliminating it otherwise. However, this approach doesn't consider neighboring pixels, + often resulting in lower-quality edge segments. ED, on the other hand, joins a set of + edge pixels together by maximizing the total gradient response of the segment, leading + to high-quality, well-localized edges. + + 4. **Higher-Level Feature Extraction**: After forming edge segments, ED enables the + extraction of higher-level geometric features such as lines, circles, ellipses, and + other shapes, making it useful for tasks involving geometric feature extraction and validation. + + The ED algorithm produces continuous, smooth, and localized edge segments, making it ideal + for applications requiring precise edge detection and geometric shape analysis. 
@defgroup ximgproc_fourier Fourier descriptors diff --git a/modules/ximgproc/include/opencv2/ximgproc/edge_drawing.hpp b/modules/ximgproc/include/opencv2/ximgproc/edge_drawing.hpp index 33741e90089..1842da5967b 100644 --- a/modules/ximgproc/include/opencv2/ximgproc/edge_drawing.hpp +++ b/modules/ximgproc/include/opencv2/ximgproc/edge_drawing.hpp @@ -15,7 +15,7 @@ namespace ximgproc //! @addtogroup ximgproc_edge_drawing //! @{ -/** @brief Class implementing the ED (EdgeDrawing) @cite topal2012edge, EDLines @cite akinlar2011edlines, EDPF @cite akinlar2012edpf and EDCircles @cite akinlar2013edcircles algorithms +/** @brief Class implementing the ED (EdgeDrawing) @cite topal2012edge, EDLines @cite akinlar2011edlines, EDPF @cite akinlar2012edpf, EDCircles @cite akinlar2013edcircles and ColorED @cite akinlar201782 algorithms. */ class CV_EXPORTS_W EdgeDrawing : public Algorithm @@ -66,13 +66,13 @@ class CV_EXPORTS_W EdgeDrawing : public Algorithm //! Default value is 1.3 CV_PROP_RW double MaxErrorThreshold; - void read(const FileNode& fn); - void write(FileStorage& fs) const; + CV_WRAP void read(const FileNode& fn); + CV_WRAP void write(FileStorage& fs) const; }; - /** @brief Detects edges in a grayscale image and prepares them to detect lines and ellipses. + /** @brief Detects edges in a grayscale or color image and prepares them to detect lines and ellipses. - @param src 8-bit, single-channel, grayscale input image. + @param src 8-bit, single-channel (CV_8UC1) or color (CV_8UC3, CV_8UC4) input image. */ CV_WRAP virtual void detectEdges(InputArray src) = 0; diff --git a/modules/ximgproc/samples/edge_drawing.py b/modules/ximgproc/samples/edge_drawing.py index 8c8d6add3b7..d379291b743 100644 --- a/modules/ximgproc/samples/edge_drawing.py +++ b/modules/ximgproc/samples/edge_drawing.py @@ -1,7 +1,24 @@ #!/usr/bin/python ''' -This example illustrates how to use cv.ximgproc.EdgeDrawing class. +This example script illustrates how to use cv.ximgproc.EdgeDrawing class. 
+ +It uses the OpenCV library to load an image, and then use the EdgeDrawing class +to detect edges, lines, and ellipses. The detected features are then drawn and displayed. + +The main loop allows the user changing parameters of EdgeDrawing by pressing following keys: + +to toggle the grayscale conversion press 'space' key +to increase MinPathLength value press '/' key +to decrease MinPathLength value press '*' key +to increase MinLineLength value press '+' key +to decrease MinLineLength value press '-' key +to toggle NFAValidation value press 'n' key +to toggle PFmode value press 'p' key +to save parameters to file press 's' key +to load parameters from file press 'l' key + +The program exits when the Esc key is pressed. Usage: ed.py [] @@ -16,71 +33,117 @@ import random as rng import sys -rng.seed(12345) - -def main(): - try: - fn = sys.argv[1] - except IndexError: - fn = 'board.jpg' - - src = cv.imread(cv.samples.findFile(fn)) - gray = cv.cvtColor(src, cv.COLOR_BGR2GRAY) - cv.imshow("source", src) - - ssrc = src.copy()*0 +def EdgeDrawingDemo(src, ed, EDParams, convert_to_gray): + rng.seed(12345) + ssrc = np.zeros_like(src) lsrc = src.copy() esrc = src.copy() - ed = cv.ximgproc.createEdgeDrawing() + img_to_detect = cv.cvtColor(src, cv.COLOR_BGR2GRAY) if convert_to_gray else src - # you can change parameters (refer the documentation to see all parameters) - EDParams = cv.ximgproc_EdgeDrawing_Params() - EDParams.MinPathLength = 50 # try changing this value between 5 to 1000 - EDParams.PFmode = False # defaut value try to swich it to True - EDParams.MinLineLength = 10 # try changing this value between 5 to 100 - EDParams.NFAValidation = True # defaut value try to swich it to False + cv.imshow("source image", img_to_detect) - ed.setParams(EDParams) + print("") + print("convert_to_gray:", convert_to_gray) + print("MinPathLength:", EDParams.MinPathLength) + print("MinLineLength:", EDParams.MinLineLength) + print("PFmode:", EDParams.PFmode) + print("NFAValidation:", 
EDParams.NFAValidation) + + tm = cv.TickMeter() + tm.start() # Detect edges # you should call this before detectLines() and detectEllipses() - ed.detectEdges(gray) + ed.detectEdges(img_to_detect) segments = ed.getSegments() lines = ed.detectLines() ellipses = ed.detectEllipses() - #Draw detected edge segments - for i in range(len(segments)): - color = (rng.randint(0,256), rng.randint(0,256), rng.randint(0,256)) - cv.polylines(ssrc, [segments[i]], False, color, 1, cv.LINE_8) + tm.stop() + + print("Detection time : {:.2f} ms. using the parameters above".format(tm.getTimeMilli())) + + # Draw detected edge segments + for segment in segments: + color = (rng.randint(0, 256), rng.randint(0, 256), rng.randint(0, 256)) + cv.polylines(ssrc, [segment], False, color, 1, cv.LINE_8) cv.imshow("detected edge segments", ssrc) - #Draw detected lines - if lines is not None: # Check if the lines have been found and only then iterate over these and add them to the image + # Draw detected lines + if lines is not None: # Check if the lines have been found and only then iterate over these and add them to the image lines = np.uint16(np.around(lines)) - for i in range(len(lines)): - cv.line(lsrc, (lines[i][0][0], lines[i][0][1]), (lines[i][0][2], lines[i][0][3]), (0, 0, 255), 1, cv.LINE_AA) + for line in lines: + cv.line(lsrc, (line[0][0], line[0][1]), (line[0][2], line[0][3]), (0, 0, 255), 1, cv.LINE_AA) cv.imshow("detected lines", lsrc) - #Draw detected circles and ellipses - if ellipses is not None: # Check if circles and ellipses have been found and only then iterate over these and add them to the image - for i in range(len(ellipses)): - center = (int(ellipses[i][0][0]), int(ellipses[i][0][1])) - axes = (int(ellipses[i][0][2])+int(ellipses[i][0][3]),int(ellipses[i][0][2])+int(ellipses[i][0][4])) - angle = ellipses[i][0][5] - color = (0, 0, 255) - if ellipses[i][0][2] == 0: - color = (0, 255, 0) - cv.ellipse(esrc, center, axes, angle,0, 360, color, 2, cv.LINE_AA) + # Draw detected 
circles and ellipses + if ellipses is not None: # Check if circles and ellipses have been found and only then iterate over these and add them to the image + for ellipse in ellipses: + center = (int(ellipse[0][0]), int(ellipse[0][1])) + axes = (int(ellipse[0][2] + ellipse[0][3]), int(ellipse[0][2] + ellipse[0][4])) + angle = ellipse[0][5] + + color = (0, 255, 0) if ellipse[0][2] == 0 else (0, 0, 255) + + cv.ellipse(esrc, center, axes, angle, 0, 360, color, 2, cv.LINE_AA) cv.imshow("detected circles and ellipses", esrc) - cv.waitKey(0) - print('Done') +def main(): + try: + fn = sys.argv[1] + except IndexError: + fn = 'board.jpg' + src = cv.imread(cv.samples.findFile(fn)) + if src is None: + print("Error loading image") + return + + ed = cv.ximgproc.createEdgeDrawing() + + # Set parameters (refer to the documentation for all parameters) + EDParams = cv.ximgproc_EdgeDrawing_Params() + EDParams.MinPathLength = 10 # try changing this value by pressing '/' and '*' keys + EDParams.MinLineLength = 10 # try changing this value by pressing '+' and '-' keys + EDParams.PFmode = False # default value is False, try switching by pressing 'p' key + EDParams.NFAValidation = True # default value is True, try switching by pressing 'n' key + + convert_to_gray = True + key = 0 + + while key != 27: + ed.setParams(EDParams) + EdgeDrawingDemo(src, ed, EDParams, convert_to_gray) + key = cv.waitKey() + if key == 32: # space key + convert_to_gray = not convert_to_gray + if key == 112: # 'p' key + EDParams.PFmode = not EDParams.PFmode + if key == 110: # 'n' key + EDParams.NFAValidation = not EDParams.NFAValidation + if key == 43: # '+' key + EDParams.MinLineLength = EDParams.MinLineLength + 5 + if key == 45: # '-' key + EDParams.MinLineLength = max(0, EDParams.MinLineLength - 5) + if key == 47: # '/' key + EDParams.MinPathLength = EDParams.MinPathLength + 20 + if key == 42: # '*' key + EDParams.MinPathLength = max(0, EDParams.MinPathLength - 20) + if key == 115: # 's' key + fs = 
cv.FileStorage("ed-params.xml",cv.FileStorage_WRITE) + EDParams.write(fs) + fs.release() + print("parameters saved to ed-params.xml") + if key == 108: # 'l' key + fs = cv.FileStorage("ed-params.xml",cv.FileStorage_READ) + if fs.isOpened(): + EDParams.read(fs.root()) + fs.release() + print("parameters loaded from ed-params.xml") if __name__ == '__main__': print(__doc__) diff --git a/modules/ximgproc/samples/edge_drawing_demo.cpp b/modules/ximgproc/samples/edge_drawing_demo.cpp new file mode 100644 index 00000000000..6a815183d8c --- /dev/null +++ b/modules/ximgproc/samples/edge_drawing_demo.cpp @@ -0,0 +1,185 @@ +/* edge_drawing.cpp + +This example illustrates how to use cv.ximgproc.EdgeDrawing class. + +It uses the OpenCV library to load an image, and then use the EdgeDrawing class +to detect edges, lines, and ellipses. The detected features are then drawn and displayed. + +The main loop allows the user changing parameters of EdgeDrawing by pressing following keys: + +to toggle the grayscale conversion press 'space' key +to increase MinPathLength value press '/' key +to decrease MinPathLength value press '*' key +to increase MinLineLength value press '+' key +to decrease MinLineLength value press '-' key +to toggle NFAValidation value press 'n' key +to toggle PFmode value press 'p' key +to save parameters to file press 's' key +to load parameters from file press 'l' key + +The program exits when the Esc key is pressed. 
+*/ + +#include <opencv2/imgproc.hpp> +#include <opencv2/highgui.hpp> +#include <opencv2/ximgproc/edge_drawing.hpp> +#include <iostream> + +void EdgeDrawingDemo(const cv::Mat src, cv::Ptr<cv::ximgproc::EdgeDrawing> ed, bool convert_to_gray); + +void EdgeDrawingDemo(const cv::Mat src, cv::Ptr<cv::ximgproc::EdgeDrawing> ed, bool convert_to_gray) +{ + cv::Mat ssrc = cv::Mat::zeros(src.size(), src.type()); + cv::Mat lsrc = src.clone(); + cv::Mat esrc = src.clone(); + + std::cout << std::endl << "convert_to_gray: " << convert_to_gray << std::endl; + std::cout << "MinPathLength: " << ed->params.MinPathLength << std::endl; + std::cout << "MinLineLength: " << ed->params.MinLineLength << std::endl; + std::cout << "PFmode: " << ed->params.PFmode << std::endl; + std::cout << "NFAValidation: " << ed->params.NFAValidation << std::endl; + + cv::TickMeter tm; + tm.start(); + + cv::Mat img_to_detect; + + if (convert_to_gray) + { + cv::cvtColor(src, img_to_detect, cv::COLOR_BGR2GRAY); + } + else + { + img_to_detect = src; + } + + cv::imshow("source image", img_to_detect); + + tm.start(); + + // Detect edges + ed->detectEdges(img_to_detect); + + std::vector<std::vector<cv::Point>> segments = ed->getSegments(); + std::vector<cv::Vec4f> lines; + ed->detectLines(lines); + std::vector<cv::Vec6d> ellipses; + ed->detectEllipses(ellipses); + + tm.stop(); + + cv::RNG& rng = cv::theRNG(); + cv::setRNGSeed(0); + + // Draw detected edge segments + for (const auto& segment : segments) + { + cv::Scalar color(rng.uniform(0, 256), rng.uniform(0, 256), rng.uniform(0, 256)); + cv::polylines(ssrc, segment, false, color, 1, cv::LINE_8); + } + + cv::imshow("detected edge segments", ssrc); + + // Draw detected lines + if (!lines.empty()) // Check if the lines have been found and only then iterate over these and add them to the image + { + for (size_t i = 0; i < lines.size(); i++) + { + cv::line(lsrc, cv::Point2d(lines[i][0], lines[i][1]), cv::Point2d(lines[i][2], lines[i][3]), cv::Scalar(0, 0, 255), 1, cv::LINE_AA); + } + } + + cv::imshow("detected lines", lsrc); + + // Draw detected circles and ellipses + if (!ellipses.empty()) // Check if circles and ellipses have been found and only
then iterate over these and add them to the image + { + for (const auto& ellipse : ellipses) + { + cv::Point center((int)ellipse[0], (int)ellipse[1]); + cv::Size axes((int)ellipse[2] + (int)ellipse[3], (int)ellipse[2] + (int)ellipse[4]); + double angle(ellipse[5]); + cv::Scalar color = (ellipse[2] == 0) ? cv::Scalar(0, 255, 0) : cv::Scalar(0, 0, 255); + cv::ellipse(esrc, center, axes, angle, 0, 360, color, 1, cv::LINE_AA); + } + } + + cv::imshow("detected circles and ellipses", esrc); + std::cout << "Total Detection Time : " << tm.getTimeMilli() << "ms." << std::endl; +} + +int main(int argc, char** argv) +{ + std::string filename = (argc > 1) ? argv[1] : "board.jpg"; + cv::Mat src = cv::imread(cv::samples::findFile(filename)); + + if (src.empty()) + { + std::cerr << "Error: Could not open or find the image!" << std::endl; + return -1; + } + + cv::Ptr ed = cv::ximgproc::createEdgeDrawing(); + + // Set parameters (refer to the documentation for all parameters) + ed->params.MinPathLength = 10; // try changing this value by pressing '/' and '*' keys + ed->params.MinLineLength = 10; // try changing this value by pressing '+' and '-' keys + ed->params.PFmode = false; // default value is false, try switching by pressing 'p' key + ed->params.NFAValidation = true; // default value is true, try switching by pressing 'n' key + + bool convert_to_gray = true; + int key = 0; + + while (key != 27) + { + EdgeDrawingDemo(src, ed, convert_to_gray); + key = cv::waitKey(0); + + switch (key) + { + case 32: // space key + convert_to_gray = !convert_to_gray; + break; + case 'p': // 'p' key + ed->params.PFmode = !ed->params.PFmode; + break; + case 'n': // 'n' key + ed->params.NFAValidation = !ed->params.NFAValidation; + break; + case '+': // '+' key + ed->params.MinLineLength = std::max(0, ed->params.MinLineLength + 5); + break; + case '-': // '-' key + ed->params.MinLineLength = std::max(0, ed->params.MinLineLength - 5); + break; + case '/': // '/' key + ed->params.MinPathLength += 20; 
+ break; + case '*': // '*' key + ed->params.MinPathLength = std::max(0, ed->params.MinPathLength - 20); + break; + case 's': // 's' key + { + cv::FileStorage fs("ed-params.xml", cv::FileStorage::WRITE); + ed->params.write(fs); + fs.release(); + std::cout << "Parameters saved to ed-params.xml" << std::endl; + } + break; + case 'l': // 'l' key + { + cv::FileStorage fs("ed-params.xml", cv::FileStorage::READ); + if (fs.isOpened()) + { + ed->params.read(fs.root()); + fs.release(); + std::cout << "Parameters loaded from ed-params.xml" << std::endl; + } + } + break; + default: + break; + } + } + return 0; +} diff --git a/modules/ximgproc/samples/fld_lines.cpp b/modules/ximgproc/samples/fld_lines.cpp index 87edf62420d..c6af79abc38 100644 --- a/modules/ximgproc/samples/fld_lines.cpp +++ b/modules/ximgproc/samples/fld_lines.cpp @@ -24,6 +24,7 @@ int main(int argc, char** argv) if( image.empty() ) { + parser.printMessage(); return -1; } @@ -54,7 +55,7 @@ int main(int argc, char** argv) vector lines; // Because of some CPU's power strategy, it seems that the first running of - // an algorithm takes much longer. So here we run the algorithm 10 times + // an algorithm takes much longer. So here we run the algorithm 5 times // to see the algorithm's processing time with sufficiently warmed-up // CPU performance. 
for (int run_count = 0; run_count < 5; run_count++) { @@ -71,64 +72,7 @@ int main(int argc, char** argv) Mat line_image_fld(image); fld->drawSegments(line_image_fld, lines); imshow("FLD result", line_image_fld); - - waitKey(1); - - Ptr ed = createEdgeDrawing(); - ed->params.EdgeDetectionOperator = EdgeDrawing::SOBEL; - ed->params.GradientThresholdValue = 38; - ed->params.AnchorThresholdValue = 8; - - vector ellipses; - - for (int run_count = 0; run_count < 5; run_count++) { - double freq = getTickFrequency(); - lines.clear(); - int64 start = getTickCount(); - - // Detect edges - //you should call this before detectLines() and detectEllipses() - ed->detectEdges(image); - - // Detect lines - ed->detectLines(lines); - double duration_ms = double(getTickCount() - start) * 1000 / freq; - cout << "Elapsed time for EdgeDrawing detectLines " << duration_ms << " ms." << endl; - - start = getTickCount(); - // Detect circles and ellipses - ed->detectEllipses(ellipses); - duration_ms = double(getTickCount() - start) * 1000 / freq; - cout << "Elapsed time for EdgeDrawing detectEllipses " << duration_ms << " ms." << endl; - } - - Mat edge_image_ed = Mat::zeros(image.size(), CV_8UC3); - vector > segments = ed->getSegments(); - - for (size_t i = 0; i < segments.size(); i++) - { - const Point* pts = &segments[i][0]; - int n = (int)segments[i].size(); - polylines(edge_image_ed, &pts, &n, 1, false, Scalar((rand() & 255), (rand() & 255), (rand() & 255)), 1); - } - - imshow("EdgeDrawing detected edges", edge_image_ed); - - Mat line_image_ed(image); - fld->drawSegments(line_image_ed, lines); - - // Draw circles and ellipses - for (size_t i = 0; i < ellipses.size(); i++) - { - Point center((int)ellipses[i][0], (int)ellipses[i][1]); - Size axes((int)ellipses[i][2] + (int)ellipses[i][3], (int)ellipses[i][2] + (int)ellipses[i][4]); - double angle(ellipses[i][5]); - Scalar color = ellipses[i][2] == 0 ? 
Scalar(255, 255, 0) : Scalar(0, 255, 0); - - ellipse(line_image_ed, center, axes, angle, 0, 360, color, 2, LINE_AA); - } - - imshow("EdgeDrawing result", line_image_ed); waitKey(); + return 0; } diff --git a/modules/ximgproc/src/edge_drawing.cpp b/modules/ximgproc/src/edge_drawing.cpp index 6a63f9d90b3..e31e8bd9ec5 100644 --- a/modules/ximgproc/src/edge_drawing.cpp +++ b/modules/ximgproc/src/edge_drawing.cpp @@ -22,8 +22,6 @@ mutable Mat_ dirImage; int gradThresh; int op; bool SumFlag; -int* grads; -bool PFmode; }; void ComputeGradientBody::operator() (const Range& range) const @@ -78,9 +76,6 @@ void ComputeGradientBody::operator() (const Range& range) const gradRow[x] = (ushort)sum; - if (PFmode) - grads[sum]++; - if (sum >= gradThresh) { if (gx >= gy) @@ -92,6 +87,9 @@ void ComputeGradientBody::operator() (const Range& range) const } } +// Look up table size for fast color space conversion +#define LUT_SIZE (1024*4096) + class EdgeDrawingImpl : public EdgeDrawing { public: @@ -126,15 +124,38 @@ class EdgeDrawingImpl : public EdgeDrawing Mat smoothImage; uchar *edgeImg; // pointer to edge image data uchar *smoothImg; // pointer to smoothed image data - int segmentNos; Mat srcImage; + int op; // edge detection operator + int gradThresh; // gradient threshold + int anchorThresh; // anchor point threshold double divForTestSegment; double* dH; int* grads; int np; private: + Mat L_Img; + Mat a_Img; + Mat b_Img; + + uchar* smooth_L; + uchar* smooth_a; + uchar* smooth_b; + + std::vector> segments; + + static double LUT1[LUT_SIZE + 1]; + static double LUT2[LUT_SIZE + 1]; + static bool LUT_Initialized; + + void MyRGB2LabFast(); + void ComputeGradientMapByDiZenzo(); + void smoothChannel(const Mat& src, Mat& dst, double sigma); + + static void fixEdgeSegments(std::vector> map); + static void InitColorEDLib(); + void ComputeGradient(); void ComputeAnchorPoints(); void JoinAnchorPointsUsingSortedAnchors(); @@ -153,10 +174,7 @@ class EdgeDrawingImpl : public EdgeDrawing uchar 
*dirImg; // pointer to direction image data ushort *gradImg; // pointer to gradient image data - int op; // edge detection operator - int gradThresh; // gradient threshold - int anchorThresh; // anchor point threshold - + int segmentNos; std::vector lines; int linesNo; int min_line_len; @@ -337,9 +355,9 @@ void EdgeDrawingImpl::write(cv::FileStorage& fs) const EdgeDrawingImpl::EdgeDrawingImpl() { params = EdgeDrawing::Params(); - nfa = new NFALUT(1, 1/2, 1, 1); - dH = new double[MAX_GRAD_VALUE]; - grads = new int[MAX_GRAD_VALUE]; + nfa = new NFALUT(1, 1/8, 0.5); + dH = new double[ED_MAX_GRAD_VALUE]; + grads = new int[ED_MAX_GRAD_VALUE]; } EdgeDrawingImpl::~EdgeDrawingImpl() @@ -351,7 +369,7 @@ EdgeDrawingImpl::~EdgeDrawingImpl() void EdgeDrawingImpl::detectEdges(InputArray src) { - CV_Assert(!src.empty() && src.type() == CV_8UC1); + CV_Assert(!src.empty() && (src.type() == CV_8UC1 || src.type() == CV_8UC3 || src.type() == CV_8UC4)); gradThresh = params.GradientThresholdValue; anchorThresh = params.AnchorThresholdValue; op = params.EdgeDetectionOperator; @@ -366,6 +384,9 @@ void EdgeDrawingImpl::detectEdges(InputArray src) if (anchorThresh < 0) anchorThresh = 0; + if (params.MinPathLength < 3) + params.MinPathLength = 3; + segmentNos = 0; anchorNos = 0; anchorPoints.clear(); @@ -377,58 +398,157 @@ void EdgeDrawingImpl::detectEdges(InputArray src) height = srcImage.rows; width = srcImage.cols; - edgeImage = Mat(height, width, CV_8UC1, Scalar(0)); // initialize edge Image - gradImage = Mat(height, width, CV_16UC1); // gradImage contains short values + edgeImage = Mat::zeros(height, width, CV_8UC1); + gradImage = Mat::zeros(height, width, CV_16UC1); // gradImage contains short values dirImage = Mat(height, width, CV_8UC1); - - if (params.Sigma < 1.0) - smoothImage = srcImage; - else if (params.Sigma == 1.0) - GaussianBlur(srcImage, smoothImage, Size(5, 5), params.Sigma); - else - GaussianBlur(srcImage, smoothImage, Size(), params.Sigma); // calculate kernel from sigma 
- - // Assign Pointers from Mat's data - smoothImg = smoothImage.data; - gradImg = (ushort*)gradImage.data; edgeImg = edgeImage.data; + gradImg = (ushort*)gradImage.data; dirImg = dirImage.data; - if (params.PFmode) + if (srcImage.type() == CV_8UC1) { - memset(dH, 0, sizeof(double) * MAX_GRAD_VALUE); - memset(grads, 0, sizeof(int) * MAX_GRAD_VALUE); - } + if (params.Sigma < 1.0) + smoothImage = srcImage; + else if (params.Sigma == 1.0) + GaussianBlur(srcImage, smoothImage, Size(5, 5), params.Sigma); + else + GaussianBlur(srcImage, smoothImage, Size(), params.Sigma); // calculate kernel from sigma + + smoothImg = smoothImage.data; + + ComputeGradient(); // COMPUTE GRADIENT & EDGE DIRECTION MAPS + ComputeAnchorPoints(); // COMPUTE ANCHORS + JoinAnchorPointsUsingSortedAnchors(); // JOIN ANCHORS + + if (params.PFmode) + { + memset(gradImg, 0, sizeof(short) * width * height); + memset(grads, 0, sizeof(int) * ED_MAX_GRAD_VALUE); + memset(dH, 0, sizeof(double) * ED_MAX_GRAD_VALUE); + memset(edgeImg, 0, width * height); // clear edge image - ComputeGradient(); // COMPUTE GRADIENT & EDGE DIRECTION MAPS - ComputeAnchorPoints(); // COMPUTE ANCHORS - JoinAnchorPointsUsingSortedAnchors(); // JOIN ANCHORS + GaussianBlur(srcImage, smoothImage, Size(), 0.4); - if (params.PFmode) + for (int i = 1; i < height - 1; i++) { + for (int j = 1; j < width - 1; j++) { + + int com1 = smoothImg[(i + 1) * width + j + 1] - smoothImg[(i - 1) * width + j - 1]; + int com2 = smoothImg[(i - 1) * width + j + 1] - smoothImg[(i + 1) * width + j - 1]; + + int gx = abs(com1 + com2 + (smoothImg[i * width + j + 1] - smoothImg[i * width + j - 1])); + int gy = abs(com1 - com2 + (smoothImg[(i + 1) * width + j] - smoothImg[(i - 1) * width + j])); + + int g = gx + gy; + + gradImg[i * width + j] = (ushort)g; + grads[g]++; + } + } + + // Compute probability function H + int size = (width - 2) * (height - 2); + + for (int i = ED_MAX_GRAD_VALUE - 1; i > 0; i--) + grads[i - 1] += grads[i]; + + for (int i = 0; i < 
ED_MAX_GRAD_VALUE; i++) + dH[i] = (double)grads[i] / ((double)size); + + divForTestSegment = 2.25; // Some magic number :-) + + np = 0; + for (int i = 0; i < segmentNos; i++) + { + int len = (int)segmentPoints[i].size(); + np += (len * (len - 1)) / 2; + } + + // Validate segments + for (int i = 0; i < segmentNos; i++) + TestSegment(i, 0, (int)segmentPoints[i].size() - 1); + + ExtractNewSegments(); + } + } + else { - // Compute probability function H - int size = (width - 2) * (height - 2); + // Convert RGB2Lab + MyRGB2LabFast(); + + // Smooth Channels + smoothChannel(L_Img, L_Img, params.Sigma); + smoothChannel(a_Img, a_Img, params.Sigma); + smoothChannel(b_Img, b_Img, params.Sigma); + + smooth_L = L_Img.data; + smooth_a = a_Img.data; + smooth_b = b_Img.data; + + ComputeGradientMapByDiZenzo(); // Compute Gradient & Edge Direction Maps + + // Compute anchors with the user supplied parameters + anchorThresh = 0; // anchorThresh used as zero while computing anchor points if selectStableAnchors set. + // Finding higher number of anchors is OK, because we have following validation steps in selectStableAnchors. + ComputeAnchorPoints(); // COMPUTE ANCHORS + anchorThresh = params.AnchorThresholdValue; // set it to its initial argument value for further anchor validation. + anchorPoints.clear(); // considering validation step below, it should constructed again. 
+ + for (int i = 1; i < height - 1; i++) { + for (int j = 1; j < width - 1; j++) { + if (edgeImg[i * width + j] != ANCHOR_PIXEL) continue; + + // Take only "stable" anchors + // 0 degree edge + if (edgeImg[i * width + j - 1] && edgeImg[i * width + j + 1]) { + int diff1 = gradImg[i * width + j] - gradImg[(i - 1) * width + j]; + int diff2 = gradImg[i * width + j] - gradImg[(i + 1) * width + j]; + if (diff1 >= anchorThresh && diff2 >= anchorThresh) edgeImg[i * width + j] = 255; - for (int i = MAX_GRAD_VALUE - 1; i > 0; i--) - grads[i - 1] += grads[i]; + continue; + } - for (int i = 0; i < MAX_GRAD_VALUE; i++) - dH[i] = (double)grads[i] / ((double)size); + // 90 degree edge + if (edgeImg[(i - 1) * width + j] && edgeImg[(i + 1) * width + j]) { + int diff1 = gradImg[i * width + j] - gradImg[i * width + j - 1]; + int diff2 = gradImg[i * width + j] - gradImg[i * width + j + 1]; + if (diff1 >= anchorThresh && diff2 >= anchorThresh) edgeImg[i * width + j] = 255; - divForTestSegment = 2.25; // Some magic number :-) - memset(edgeImg, 0, width * height); // clear edge image - np = 0; - for (int i = 0; i < segmentNos; i++) - { - int len = (int)segmentPoints[i].size(); - np += (len * (len - 1)) / 2; + continue; + } + + // 135 degree diagonal + if (edgeImg[(i - 1) * width + j - 1] && edgeImg[(i + 1) * width + j + 1]) { + int diff1 = gradImg[i * width + j] - gradImg[(i - 1) * width + j + 1]; + int diff2 = gradImg[i * width + j] - gradImg[(i + 1) * width + j - 1]; + if (diff1 >= anchorThresh && diff2 >= anchorThresh) edgeImg[i * width + j] = 255; + continue; + } + + // 45 degree diagonal + if (edgeImg[(i - 1) * width + j + 1] && edgeImg[(i + 1) * width + j - 1]) { + int diff1 = gradImg[i * width + j] - gradImg[(i - 1) * width + j - 1]; + int diff2 = gradImg[i * width + j] - gradImg[(i + 1) * width + j + 1]; + if (diff1 >= anchorThresh && diff2 >= anchorThresh) edgeImg[i * width + j] = 255; + } + } } - // Validate segments - for (int i = 0; i < segmentNos; i++) - TestSegment(i, 0, 
(int)segmentPoints[i].size() - 1); + for (int i = 0; i < width * height; i++) + if (edgeImg[i] == ANCHOR_PIXEL) + edgeImg[i] = 0; + else if (edgeImg[i] == 255) { + edgeImg[i] = ANCHOR_PIXEL; + int y = i / width; + int x = i % width; + anchorPoints.push_back(Point(x, y)); // push validated anchor point to vector + } + + anchorNos = (int)anchorPoints.size(); // get # of anchor pixels + + JoinAnchorPointsUsingSortedAnchors(); // JOIN ANCHORS - ExtractNewSegments(); + // Fix 1 pixel errors in the edge map + fixEdgeSegments(segments); } } @@ -473,8 +593,6 @@ void EdgeDrawingImpl::ComputeGradient() body.gradThresh = gradThresh; body.SumFlag = params.SumFlag; body.op = op; - body.grads = grads; - body.PFmode = params.PFmode; parallel_for_(Range(1, smoothImage.rows - 1), body); } @@ -562,24 +680,24 @@ void EdgeDrawingImpl::JoinAnchorPointsUsingSortedAnchors() { stack[++top].r = i; stack[top].c = j; - stack[top].dir = DOWN; + stack[top].dir = ED_DOWN; stack[top].parent = 0; stack[++top].r = i; stack[top].c = j; - stack[top].dir = UP; + stack[top].dir = ED_UP; stack[top].parent = 0; } else { stack[++top].r = i; stack[top].c = j; - stack[top].dir = RIGHT; + stack[top].dir = ED_RIGHT; stack[top].parent = 0; stack[++top].r = i; stack[top].c = j; - stack[top].dir = LEFT; + stack[top].dir = ED_LEFT; stack[top].parent = 0; } @@ -609,7 +727,7 @@ void EdgeDrawingImpl::JoinAnchorPointsUsingSortedAnchors() len++; chainLen++; - if (dir == LEFT) + if (dir == ED_LEFT) { while (dirImg[r * width + c] == EDGE_HORIZONTAL) { @@ -680,12 +798,12 @@ void EdgeDrawingImpl::JoinAnchorPointsUsingSortedAnchors() stack[++top].r = r; stack[top].c = c; - stack[top].dir = DOWN; + stack[top].dir = ED_DOWN; stack[top].parent = noChains; stack[++top].r = r; stack[top].c = c; - stack[top].dir = UP; + stack[top].dir = ED_UP; stack[top].parent = noChains; len--; @@ -695,7 +813,7 @@ void EdgeDrawingImpl::JoinAnchorPointsUsingSortedAnchors() chains[parent].children[0] = noChains; noChains++; } - else if (dir == 
RIGHT) + else if (dir == ED_RIGHT) { while (dirImg[r * width + c] == EDGE_HORIZONTAL) { @@ -766,12 +884,12 @@ void EdgeDrawingImpl::JoinAnchorPointsUsingSortedAnchors() stack[++top].r = r; stack[top].c = c; - stack[top].dir = DOWN; // Go down + stack[top].dir = ED_DOWN; // Go down stack[top].parent = noChains; stack[++top].r = r; stack[top].c = c; - stack[top].dir = UP; // Go up + stack[top].dir = ED_UP; // Go up stack[top].parent = noChains; len--; @@ -782,7 +900,7 @@ void EdgeDrawingImpl::JoinAnchorPointsUsingSortedAnchors() noChains++; } - else if (dir == UP) + else if (dir == ED_UP) { while (dirImg[r * width + c] == EDGE_VERTICAL) { @@ -853,12 +971,12 @@ void EdgeDrawingImpl::JoinAnchorPointsUsingSortedAnchors() stack[++top].r = r; stack[top].c = c; - stack[top].dir = RIGHT; + stack[top].dir = ED_RIGHT; stack[top].parent = noChains; stack[++top].r = r; stack[top].c = c; - stack[top].dir = LEFT; + stack[top].dir = ED_LEFT; stack[top].parent = noChains; len--; @@ -939,12 +1057,12 @@ void EdgeDrawingImpl::JoinAnchorPointsUsingSortedAnchors() stack[++top].r = r; stack[top].c = c; - stack[top].dir = RIGHT; + stack[top].dir = ED_RIGHT; stack[top].parent = noChains; stack[++top].r = r; stack[top].c = c; - stack[top].dir = LEFT; + stack[top].dir = ED_LEFT; stack[top].parent = noChains; len--; @@ -1187,17 +1305,13 @@ void EdgeDrawingImpl::JoinAnchorPointsUsingSortedAnchors() delete[] pixels; } -int* EdgeDrawingImpl::sortAnchorsByGradValue1() -{ - int SIZE = 128 * 256; - int* C = new int[SIZE]; - memset(C, 0, sizeof(int) * SIZE); +int* EdgeDrawingImpl::sortAnchorsByGradValue1() { + const int SIZE = 128 * 256; + std::vector C(SIZE, 0); // Count the number of grad values - for (int i = 1; i < height - 1; i++) - { - for (int j = 1; j < width - 1; j++) - { + for (int i = 1; i < height - 1; i++) { + for (int j = 1; j < width - 1; j++) { if (edgeImg[i * width + j] != ANCHOR_PIXEL) continue; @@ -1207,16 +1321,15 @@ int* EdgeDrawingImpl::sortAnchorsByGradValue1() } // Compute 
indices - for (int i = 1; i < SIZE; i++) + for (int i = 1; i < SIZE; i++) { C[i] += C[i - 1]; + } int noAnchors = C[SIZE - 1]; int* A = new int[noAnchors]; - for (int i = 1; i < height - 1; i++) - { - for (int j = 1; j < width - 1; j++) - { + for (int i = 1; i < height - 1; i++) { + for (int j = 1; j < width - 1; j++) { if (edgeImg[i * width + j] != ANCHOR_PIXEL) continue; @@ -1226,52 +1339,35 @@ int* EdgeDrawingImpl::sortAnchorsByGradValue1() } } - delete[] C; return A; } -int EdgeDrawingImpl::LongestChain(Chain* chains, int root) -{ - if (root == -1 || chains[root].len == 0) +int EdgeDrawingImpl::LongestChain(Chain* chains, int root) { + if (root == -1 || chains[root].len == 0) { return 0; + } - int len0 = 0; - if (chains[root].children[0] != -1) - len0 = LongestChain(chains, chains[root].children[0]); - - int len1 = 0; - if (chains[root].children[1] != -1) - len1 = LongestChain(chains, chains[root].children[1]); - - int max = 0; + int leftLength = LongestChain(chains, chains[root].children[0]); + int rightLength = LongestChain(chains, chains[root].children[1]); - if (len0 >= len1) - { - max = len0; - chains[root].children[1] = -1; + if (leftLength >= rightLength) { + chains[root].children[1] = -1; // Invalidate the right child + return chains[root].len + leftLength; } - else - { - max = len1; - chains[root].children[0] = -1; + else { + chains[root].children[0] = -1; // Invalidate the left child + return chains[root].len + rightLength; } - - return chains[root].len + max; } -int EdgeDrawingImpl::RetrieveChainNos(Chain* chains, int root, int chainNos[]) -{ +int EdgeDrawingImpl::RetrieveChainNos(Chain* chains, int root, int chainNos[]) { int count = 0; - while (root != -1) - { - chainNos[count] = root; - count++; + while (root != -1) { + chainNos[count++] = root; - if (chains[root].children[0] != -1) - root = chains[root].children[0]; - else - root = chains[root].children[1]; + // Move to the next child in the chain + root = (chains[root].children[0] != -1) ? 
chains[root].children[0] : chains[root].children[1]; } return count; @@ -1291,7 +1387,7 @@ void EdgeDrawingImpl::detectLines(OutputArray _lines) max_distance_between_two_lines = params.MaxDistanceBetweenTwoLines; max_error = params.MaxErrorThreshold; - if (min_line_len == -1) // If no initial value given, compute it + if (min_line_len < 0) // If no initial value given, compute it min_line_len = ComputeMinLineLength(); if (min_line_len < 9) // avoids small line segments in the result. Might be deleted! @@ -1318,7 +1414,7 @@ void EdgeDrawingImpl::detectLines(OutputArray _lines) JoinCollinearLines(); - if (params.NFAValidation) + if (params.NFAValidation && srcImage.channels() < 3) ValidateLineSegments(); // Delete redundant space from lines @@ -1440,7 +1536,7 @@ void EdgeDrawingImpl::SplitSegment2Lines(double* x, double* y, int noPixels, int index--; ComputeClosestPoint(x[index], y[index], lastA, lastB, lastInvert, ex, ey); - if ((sx == ex) & (sy == ey)) + if ((sx == ex) && (sy == ey)) break; // Add the line segment to lines @@ -1517,7 +1613,8 @@ void EdgeDrawingImpl::ValidateLineSegments() { int lutSize = (width + height) / 8; double prob = 1.0 / 8; // probability of alignment - nfa = new NFALUT(lutSize, prob, width, height); + double logNT = 2.0 * (log10((double)width) + log10((double)height)); + nfa = new NFALUT(lutSize, prob, logNT); } int* x = new int[(width + height) * 4]; @@ -2037,7 +2134,7 @@ double EdgeDrawingImpl::ComputeMinDistanceBetweenTwoLines(EDLineSegment* ls1, ED double dy = ls1->sy - ls2->sy; double d = sqrt(dx * dx + dy * dy); double min = d; - int which = SOUTH_SOUTH; + int which = ED_SOUTH_SOUTH; dx = ls1->sx - ls2->ex; dy = ls1->sy - ls2->ey; @@ -2045,7 +2142,7 @@ double EdgeDrawingImpl::ComputeMinDistanceBetweenTwoLines(EDLineSegment* ls1, ED if (d < min) { min = d; - which = SOUTH_EAST; + which = ED_SOUTH_EAST; } dx = ls1->ex - ls2->sx; @@ -2054,7 +2151,7 @@ double EdgeDrawingImpl::ComputeMinDistanceBetweenTwoLines(EDLineSegment* ls1, ED if (d 
< min) { min = d; - which = EAST_SOUTH; + which = ED_EAST_SOUTH; } dx = ls1->ex - ls2->ex; @@ -2063,7 +2160,7 @@ double EdgeDrawingImpl::ComputeMinDistanceBetweenTwoLines(EDLineSegment* ls1, ED if (d < min) { min = d; - which = EAST_EAST; + which = ED_EAST_EAST; } if (pwhich) @@ -2370,51 +2467,53 @@ void EdgeDrawingImpl::TestSegment(int i, int index1, int index2) TestSegment(i, start, index2); } -//---------------------------------------------------------------------------------------------- -// After the validation of the edge segments, extracts the valid ones -// In other words, updates the valid segments' pixel arrays and their lengths +// After validating the edge segments, this function extracts the valid ones. +// Specifically, it updates the valid segments' pixel arrays and their lengths based on validation. void EdgeDrawingImpl::ExtractNewSegments() { vector< vector > validSegments; - int noSegments = 0; for (int i = 0; i < segmentNos; i++) { int start = 0; while (start < (int)segmentPoints[i].size()) { + // Find the first valid start point while (start < (int)segmentPoints[i].size()) { int r = segmentPoints[i][start].y; int c = segmentPoints[i][start].x; - if (edgeImg[r * width + c]) break; + if (edgeImg[r * width + c]) break; // Found valid point start++; } int end = start + 1; + + // Find the end of the valid segment while (end < (int)segmentPoints[i].size()) { int r = segmentPoints[i][end].y; int c = segmentPoints[i][end].x; - if (edgeImg[r * width + c] == 0) break; + if (edgeImg[r * width + c] == 0) break; // End of segment end++; } + // Length of the segment int len = end - start; + + // Only accept segments of length >= 10 if (len >= 10) { - // A new segment. 
Accepted only only long enough (whatever that means) - //segments[noSegments].pixels = &map->segments[i].pixels[start]; - //segments[noSegments].noPixels = len; - validSegments.push_back(vector()); - vector subVec(&segmentPoints[i][start], &segmentPoints[i][end - 1]); - validSegments[noSegments] = subVec; - noSegments++; + vector subVec(segmentPoints[i].begin() + start, segmentPoints[i].begin() + end); // Safer bounds + validSegments.push_back(subVec); // Add to valid segments } + + // Move start to next unprocessed point start = end + 1; } } + // Replace the old segments with valid segments segmentPoints = validSegments; - segmentNos = noSegments; + segmentNos = (int)validSegments.size(); // Update number of segments } double EdgeDrawingImpl::NFA(double prob, int len) @@ -3119,7 +3218,8 @@ void EdgeDrawingImpl::ValidateCircles(bool validate) { int lutSize = (width + height) / 8; double prob = 1.0 / 8; // probability of alignment - nfa = new NFALUT(lutSize, prob, width, height); // create look up table + double logNT = 2.0 * (log10((double)width) + log10((double)height)); + nfa = new NFALUT(lutSize, prob, logNT); // create look up table } // Validate circles & ellipses @@ -3297,12 +3397,24 @@ void EdgeDrawingImpl::ValidateCircles(bool validate) // This produces less false positives, but occationally misses on some valid circles } out: - // compute gx & gy - int com1 = smoothImg[(r + 1) * width + c + 1] - smoothImg[(r - 1) * width + c - 1]; - int com2 = smoothImg[(r - 1) * width + c + 1] - smoothImg[(r + 1) * width + c - 1]; + int com1, com2, gx, gy; + if (srcImage.channels() > 1) + { + com1 = smooth_L[(r + 1) * width + c + 1] - smooth_L[(r - 1) * width + c - 1]; + com2 = smooth_L[(r - 1) * width + c + 1] - smooth_L[(r + 1) * width + c - 1]; + + gx = com1 + com2 + smooth_L[r * width + c + 1] - smooth_L[r * width + c - 1]; + gy = com1 - com2 + smooth_L[(r + 1) * width + c] - smooth_L[(r - 1) * width + c]; + } + else + { + com1 = smoothImg[(r + 1) * width + c + 1] - 
smoothImg[(r - 1) * width + c - 1]; + com2 = smoothImg[(r - 1) * width + c + 1] - smoothImg[(r + 1) * width + c - 1]; + + gx = com1 + com2 + smoothImg[r * width + c + 1] - smoothImg[r * width + c - 1]; + gy = com1 - com2 + smoothImg[(r + 1) * width + c] - smoothImg[(r - 1) * width + c]; + } - int gx = com1 + com2 + smoothImg[r * width + c + 1] - smoothImg[r * width + c - 1]; - int gy = com1 - com2 + smoothImg[(r + 1) * width + c] - smoothImg[(r - 1) * width + c]; double pixelAngle = nfa->myAtan2((double)gx, (double)-gy); double derivX, derivY; @@ -3778,7 +3890,7 @@ void EdgeDrawingImpl::JoinArcs1() EX = arcs[CandidateArcNo].sx; EY = arcs[CandidateArcNo].sy; break; - } //end-switch + } break; // Do not look at the other candidates } @@ -5745,5 +5857,276 @@ void EdgeDrawingImpl::ROTATE(double** a, int i, int j, int k, int l, double tau, a[k][l] = h + s * (g - h * tau); } +void EdgeDrawingImpl::MyRGB2LabFast() +{ + const uchar* blueImg; + const uchar* greenImg; + const uchar* redImg; + + // Split channels (OpenCV uses BGR) + vector bgr(3); + split(srcImage, bgr); + blueImg = bgr[0].data; + greenImg = bgr[1].data; + redImg = bgr[2].data; + + // Initialize LUTs if necessary + if (!LUT_Initialized) + InitColorEDLib(); + + L_Img.create(height, width, CV_8U); + a_Img.create(height, width, CV_8U); + b_Img.create(height, width, CV_8U); + + std::vector L(width * height); + std::vector a(width * height); + std::vector b(width * height); + + // Conversion from RGB to Lab + for (int i = 0; i < width * height; i++) + { + double red = LUT1[static_cast(redImg[i] / 255.0 * LUT_SIZE + 0.5)] * 100; + double green = LUT1[static_cast(greenImg[i] / 255.0 * LUT_SIZE + 0.5)] * 100; + double blue = LUT1[static_cast(blueImg[i] / 255.0 * LUT_SIZE + 0.5)] * 100; + + double x = red * 0.4124564 + green * 0.3575761 + blue * 0.1804375; + double y = red * 0.2126729 + green * 0.7151522 + blue * 0.0721750; + double z = red * 0.0193339 + green * 0.1191920 + blue * 0.9503041; + + x /= 95.047; + y /= 
100.0; + z /= 108.883; + + x = LUT2[static_cast(x * LUT_SIZE + 0.5)]; + y = LUT2[static_cast(y * LUT_SIZE + 0.5)]; + z = LUT2[static_cast(z * LUT_SIZE + 0.5)]; + + L[i] = (116.0 * y) - 16; + a[i] = 500.0 * (x / y); + b[i] = 200.0 * (y - z); + } + + // Scaling to [0, 255] + auto scale_to_uchar = [](std::vector& src, uchar* dst, int size) { + double minVal = *std::min_element(src.begin(), src.end()); + double maxVal = *std::max_element(src.begin(), src.end()); + double scale = 255.0 / (maxVal - minVal); + for (int i = 0; i < size; i++) { + dst[i] = static_cast((src[i] - minVal) * scale); + } + }; + + scale_to_uchar(L, L_Img.data, width * height); + scale_to_uchar(a, a_Img.data, width * height); + scale_to_uchar(b, b_Img.data, width * height); +} + +void EdgeDrawingImpl::ComputeGradientMapByDiZenzo() +{ + int max = 0; + + for (int i = 1; i < height - 1; i++) { + for (int j = 1; j < width - 1; j++) { + + int com1, com2, gxCh1, gxCh2, gxCh3, gyCh1, gyCh2, gyCh3; + + if(params.EdgeDetectionOperator == PREWITT) + { + // Prewitt for channel1 + com1 = smooth_L[(i + 1) * width + j + 1] - smooth_L[(i - 1) * width + j - 1]; + com2 = smooth_L[(i - 1) * width + j + 1] - smooth_L[(i + 1) * width + j - 1]; + + gxCh1 = com1 + com2 + (smooth_L[i * width + j + 1] - smooth_L[i * width + j - 1]); + gyCh1 = com1 - com2 + (smooth_L[(i + 1) * width + j] - smooth_L[(i - 1) * width + j]); + + // Prewitt for channel2 + com1 = smooth_a[(i + 1) * width + j + 1] - smooth_a[(i - 1) * width + j - 1]; + com2 = smooth_a[(i - 1) * width + j + 1] - smooth_a[(i + 1) * width + j - 1]; + + gxCh2 = com1 + com2 + (smooth_a[i * width + j + 1] - smooth_a[i * width + j - 1]); + gyCh2 = com1 - com2 + (smooth_a[(i + 1) * width + j] - smooth_a[(i - 1) * width + j]); + + // Prewitt for channel3 + com1 = smooth_b[(i + 1) * width + j + 1] - smooth_b[(i - 1) * width + j - 1]; + com2 = smooth_b[(i - 1) * width + j + 1] - smooth_b[(i + 1) * width + j - 1]; + + gxCh3 = com1 + com2 + (smooth_b[i * width + j + 1] - 
smooth_b[i * width + j - 1]); + gyCh3 = com1 - com2 + (smooth_b[(i + 1) * width + j] - smooth_b[(i - 1) * width + j]); + } + else + { + // Sobel for channel1 + com1 = smooth_L[(i + 1) * width + j + 1] - smooth_L[(i - 1) * width + j - 1]; + com2 = smooth_L[(i - 1) * width + j + 1] - smooth_L[(i + 1) * width + j - 1]; + + gxCh1 = com1 + com2 + 2 * (smooth_L[i * width + j + 1] - smooth_L[i * width + j - 1]); + gyCh1 = com1 - com2 + 2 * (smooth_L[(i + 1) * width + j] - smooth_L[(i - 1) * width + j]); + + // Sobel for channel2 + com1 = smooth_a[(i + 1) * width + j + 1] - smooth_a[(i - 1) * width + j - 1]; + com2 = smooth_a[(i - 1) * width + j + 1] - smooth_a[(i + 1) * width + j - 1]; + + gxCh2 = com1 + com2 + 2 * (smooth_a[i * width + j + 1] - smooth_a[i * width + j - 1]); + gyCh2 = com1 - com2 + 2 * (smooth_a[(i + 1) * width + j] - smooth_a[(i - 1) * width + j]); + + // Sobel for channel3 + com1 = smooth_b[(i + 1) * width + j + 1] - smooth_b[(i - 1) * width + j - 1]; + com2 = smooth_b[(i - 1) * width + j + 1] - smooth_b[(i + 1) * width + j - 1]; + + gxCh3 = com1 + com2 + 2 * (smooth_b[i * width + j + 1] - smooth_b[i * width + j - 1]); + gyCh3 = com1 - com2 + 2 * (smooth_b[(i + 1) * width + j] - smooth_b[(i - 1) * width + j]); + } + + int gxx = gxCh1 * gxCh1 + gxCh2 * gxCh2 + gxCh3 * gxCh3; + int gyy = gyCh1 * gyCh1 + gyCh2 * gyCh2 + gyCh3 * gyCh3; + int gxy = gxCh1 * gyCh1 + gxCh2 * gyCh2 + gxCh3 * gyCh3; + +#if 1 + // Di Zenzo's formulas from Gonzales & Woods - Page 337 + double theta = atan2(2.0 * gxy, (double)(gxx - gyy)) / 2; // Gradient Direction + int grad = (int)(sqrt(((gxx + gyy) + (gxx - gyy) * cos(2 * theta) + 2 * gxy * sin(2 * theta)) / 2.0) + 0.5); // Gradient Magnitude +#else + // Koschan & Abidi - 2005 - Signal Processing Magazine + double theta = atan2(2.0 * gxy, (double)(gxx - gyy)) / 2; // Gradient Direction + + double cosTheta = cos(theta); + double sinTheta = sin(theta); + int grad = (int)(sqrt(gxx * cosTheta * cosTheta + 2 * gxy * sinTheta * 
cosTheta + gyy * sinTheta * sinTheta) + 0.5); // Gradient Magnitude +#endif + + // Gradient is perpendicular to the edge passing through the pixel + if (theta >= -3.14159 / 4 && theta <= 3.14159 / 4) + dirImg[i * width + j] = EDGE_VERTICAL; + else + dirImg[i * width + j] = EDGE_HORIZONTAL; + + gradImg[i * width + j] = (ushort)grad; + + if (grad > max) + max = grad; + } + } + + // Scale the gradient values to 0-255 + double scale = 255.0 / max; + for (int i = 0; i < width * height; i++) + gradImg[i] = (ushort)(gradImg[i] * scale); +} + +void EdgeDrawingImpl::smoothChannel(const Mat& src, Mat& dst, double sigma) +{ + if (sigma == 1.0) + GaussianBlur(src, dst, Size(5, 5), 1); + else if (sigma == 1.5) + GaussianBlur(src, dst, Size(7, 7), 1.5); // seems to be better? + else + GaussianBlur(src, dst, Size(), sigma); +} + +//--------------------------------------------------------- +// Fix edge segments having one or two pixel fluctuations +// An example one pixel problem getting fixed: +// x +// x x --> xxx +// +// An example two pixel problem getting fixed: +// xx +// x x --> xxxx +// +void EdgeDrawingImpl::fixEdgeSegments(std::vector> map) +{ + /// First fix one pixel problems: There are four cases + for (int i = 0; i < (int)map.size(); i++) { + int cp = (int)map[i].size() - 2; // Current pixel index + int n2 = 0; // next next pixel index + + while (n2 < (int)map[i].size()) { + int n1 = cp + 1; // next pixel + + cp = cp % map[i].size(); // Roll back to the beginning + n1 = n1 % map[i].size(); // Roll back to the beginning + + int r = map[i][cp].y; + int c = map[i][cp].x; + + int r1 = map[i][n1].y; + int c1 = map[i][n1].x; + + int r2 = map[i][n2].y; + int c2 = map[i][n2].x; + + // 4 cases to fix + if (r2 == r - 2 && c2 == c) { + if (c1 != c) { + map[i][n1].x = c; + } + + cp = n2; + n2 += 2; + + } + else if (r2 == r + 2 && c2 == c) { + if (c1 != c) { + map[i][n1].x = c; + } + + cp = n2; + n2 += 2; + + } + else if (r2 == r && c2 == c - 2) { + if (r1 != r) { + map[i][n1].y 
= r; + } + + cp = n2; + n2 += 2; + + } + else if (r2 == r && c2 == c + 2) { + if (r1 != r) { + map[i][n1].y = r; + } + + cp = n2; + n2 += 2; + + } + else { + cp++; + n2++; + } + } + } +} + +void EdgeDrawingImpl::InitColorEDLib() +{ + if (LUT_Initialized) + return; + + double inc = 1.0 / LUT_SIZE; + for (int i = 0; i <= LUT_SIZE; i++) { + double d = i * inc; + + if (d >= 0.04045) LUT1[i] = pow(((d + 0.055) / 1.055), 2.4); + else LUT1[i] = d / 12.92; + } + + inc = 1.0 / LUT_SIZE; + for (int i = 0; i <= LUT_SIZE; i++) { + double d = i * inc; + + if (d > 0.008856) LUT2[i] = pow(d, 1.0 / 3.0); + else LUT2[i] = (7.787 * d) + (16.0 / 116.0); + } + + LUT_Initialized = true; +} + +bool EdgeDrawingImpl::LUT_Initialized = false; +double EdgeDrawingImpl::LUT1[LUT_SIZE + 1] = { 0 }; +double EdgeDrawingImpl::LUT2[LUT_SIZE + 1] = { 0 }; + } // namespace cv } // namespace ximgproc diff --git a/modules/ximgproc/src/edge_drawing_common.hpp b/modules/ximgproc/src/edge_drawing_common.hpp index 135b456615d..01dc5107ae6 100644 --- a/modules/ximgproc/src/edge_drawing_common.hpp +++ b/modules/ximgproc/src/edge_drawing_common.hpp @@ -13,15 +13,15 @@ #define ANCHOR_PIXEL 254 #define EDGE_PIXEL 255 -#define LEFT 1 -#define RIGHT 2 -#define UP 3 -#define DOWN 4 +#define ED_LEFT 1 +#define ED_RIGHT 2 +#define ED_UP 3 +#define ED_DOWN 4 -#define SOUTH_SOUTH 0 -#define SOUTH_EAST 1 -#define EAST_SOUTH 2 -#define EAST_EAST 3 +#define ED_SOUTH_SOUTH 0 +#define ED_SOUTH_EAST 1 +#define ED_EAST_SOUTH 2 +#define ED_EAST_EAST 3 // Circular arc, circle thresholds #define VERY_SHORT_ARC_ERROR 0.40 // Used for very short arcs (>= CANDIDATE_CIRCLE_RATIO1 && < CANDIDATE_CIRCLE_RATIO2) @@ -37,39 +37,45 @@ // Ellipse thresholds #define CANDIDATE_ELLIPSE_RATIO 0.50 // 50% -- If 50% of the ellipse is detected, it may be candidate for validation #define ELLIPSE_ERROR 1.50 // Used for ellipses. (used to be 1.65 for what reason?) 
-#define MAX_GRAD_VALUE 128*256 +#define ED_MAX_GRAD_VALUE 128*256 using namespace std; using namespace cv; +#define TABSIZE 100000 +#define MAX_LUT_SIZE 1024 +#define RELATIVE_ERROR_FACTOR 100.0 + class NFALUT { public: - NFALUT(int size, double _prob, int _w, int _h); + NFALUT(int size, double _prob, double _logNT); ~NFALUT(); int* LUT; // look up table int LUTSize; double prob; - int w, h; + double logNT; bool checkValidationByNFA(int n, int k); static double myAtan2(double yy, double xx); private: double nfa(int n, int k); - static double Comb(double n, double k); + static double log_gamma_lanczos(double x); + static double log_gamma_windschitl(double x); + static double log_gamma(double x); + static int double_equal(double a, double b); }; -NFALUT::NFALUT(int size, double _prob, int _w, int _h) +NFALUT::NFALUT(int size, double _prob, double _logNT) { LUTSize = size > 60 ? 60 : size; LUT = new int[LUTSize]; - w = _w; - h = _h; prob = _prob; + logNT = _logNT; LUT[0] = 1; int j = 1; @@ -77,24 +83,23 @@ NFALUT::NFALUT(int size, double _prob, int _w, int _h) { LUT[i] = LUTSize + 1; double ret = nfa(i, j); - if (ret >= 1.0) + if (ret < 0) { while (j < i) { j++; ret = nfa(i, j); - if (ret <= 1.0) + if (ret >= 0) break; } - if (ret >= 1.0) + if (ret < 0) continue; } LUT[i] = j; } } - NFALUT::~NFALUT() { delete[] LUT; @@ -103,7 +108,7 @@ NFALUT::~NFALUT() bool NFALUT::checkValidationByNFA(int n, int k) { if (n >= LUTSize) - return nfa(n, k) <= 1.0; + return nfa(n, k) >= 0.0; else return k >= LUT[n]; } @@ -118,28 +123,140 @@ double NFALUT::myAtan2(double yy, double xx) return angle / 180 * CV_PI; } -double NFALUT::Comb(double n, double k) //fast combination computation +double NFALUT::nfa(int n, int k) { - if (k > n) - return 0; + static double inv[TABSIZE]; /* table to keep computed inverse values */ + double tolerance = 0.1; /* an error of 10% in the result is accepted */ + double log1term, term, bin_term, mult_term, bin_tail, err, p_term; + int i; + + /* check 
parameters */ + if (n < 0 || k<0 || k>n || prob <= 0.0 || prob >= 1.0) return -1.0; + + /* trivial cases */ + if (n == 0 || k == 0) return -logNT; + if (n == k) return -logNT - (double)n * log10(prob); + + /* probability term */ + p_term = prob / (1.0 - prob); + + /* compute the first term of the series */ + /* + binomial_tail(n,k,p) = sum_{i=k}^n bincoef(n,i) * p^i * (1-p)^{n-i} + where bincoef(n,i) are the binomial coefficients. + But + bincoef(n,k) = gamma(n+1) / ( gamma(k+1) * gamma(n-k+1) ). + We use this to compute the first term. Actually the log of it. + */ + log1term = log_gamma((double)n + 1.0) - log_gamma((double)k + 1.0) + - log_gamma((double)(n - k) + 1.0) + + (double)k * log(prob) + (double)(n - k) * log(1.0 - prob); + term = exp(log1term); + + /* in some cases no more computations are needed */ + if (double_equal(term, 0.0)) { /* the first term is almost zero */ + if ((double)k > (double)n * prob) /* at begin or end of the tail? */ + return -log1term / M_LN10 - logNT; /* end: use just the first term */ + else + return -logNT; /* begin: the tail is roughly 1 */ + } + + /* compute more terms if needed */ + bin_tail = term; + for (i = k + 1; i <= n; i++) { + /* + As + term_i = bincoef(n,i) * p^i * (1-p)^(n-i) + and + bincoef(n,i)/bincoef(n,i-1) = n-1+1 / i, + then, + term_i / term_i-1 = (n-i+1)/i * p/(1-p) + and + term_i = term_i-1 * (n-i+1)/i * p/(1-p). + 1/i is stored in a table as they are computed, + because divisions are expensive. + p/(1-p) is computed only once and stored in 'p_term'. + */ + bin_term = (double)(n - i + 1) * (i < TABSIZE ? + (inv[i] != 0.0 ? inv[i] : (inv[i] = 1.0 / (double)i)) : + 1.0 / (double)i); + + mult_term = bin_term * p_term; + term *= mult_term; + bin_tail += term; + + if (bin_term < 1.0) { + /* When bin_term<1 then mult_term_ji. + Then, the error on the binomial tail when truncated at + the i term can be bounded by a geometric series of form + term_i * sum mult_term_i^j. 
*/ + err = term * ((1.0 - pow(mult_term, (double)(n - i + 1))) / + (1.0 - mult_term) - 1.0); + + /* One wants an error at most of tolerance*final_result, or: + tolerance * abs(-log10(bin_tail)-logNT). + Now, the error that can be accepted on bin_tail is + given by tolerance*final_result divided by the derivative + of -log10(x) when x=bin_tail. that is: + tolerance * abs(-log10(bin_tail)-logNT) / (1/bin_tail) + Finally, we truncate the tail if the error is less than: + tolerance * abs(-log10(bin_tail)-logNT) * bin_tail */ + if (err < tolerance * fabs(-log10(bin_tail) - logNT) * bin_tail) break; + } + } + + return -log10(bin_tail) - logNT; +} - double r = 1; - for (double d = 1; d <= k; ++d) +double NFALUT::log_gamma_lanczos(double x) +{ + static double q[7] = { 75122.6331530, 80916.6278952, 36308.2951477, + 8687.24529705, 1168.92649479, 83.8676043424, + 2.50662827511 }; + double a = (x + 0.5) * log(x + 5.5) - (x + 5.5); + double b = 0.0; + int n; + + for (n = 0; n < 7; n++) { - r *= n--; - r /= d; + a -= log(x + (double)n); + b += q[n] * pow(x, (double)n); } - return r; + return a + log(b); } -double NFALUT::nfa(int n, int k) +double NFALUT::log_gamma_windschitl(double x) { - double sum = 0; - double p = 0.125; - for (int i = k; i <= n; i++) - sum += Comb(n, i) * pow(p, i) * pow(1 - p, n - i); + return 0.918938533204673 + (x - 0.5) * log(x) - x + + 0.5 * x * log(x * sinh(1 / x) + 1 / (810.0 * pow(x, 6.0))); +} + +double NFALUT::log_gamma(double x) +{ + return x > 15 ? log_gamma_windschitl(x) : log_gamma_lanczos(x); +} + +int NFALUT::double_equal(double a, double b) +{ + double abs_diff, aa, bb, abs_max; - return sum * w * w * h * h; + /* trivial case */ + if (a == b) return true; + + abs_diff = fabs(a - b); + aa = fabs(a); + bb = fabs(b); + abs_max = aa > bb ? aa : bb; + + /* DBL_MIN is the smallest normalized number, thus, the smallest + number whose relative error is bounded by DBL_EPSILON. 
For + smaller numbers, the same quantization steps as for DBL_MIN + are used. Then, for smaller numbers, a meaningful "relative" + error should be computed by dividing the difference by DBL_MIN. */ + if (abs_max < DBL_MIN) abs_max = DBL_MIN; + + /* equal if relative error <= factor x eps */ + return (abs_diff / abs_max) <= (RELATIVE_ERROR_FACTOR * DBL_EPSILON); } struct StackNode @@ -220,18 +337,19 @@ struct mEllipse { // Ax^2 + Bxy + Cy^2 + Dx + Ey + F = 0 // struct EllipseEquation { - double coeff[7]; // coeff[1] = A - - EllipseEquation() { - for (int i = 0; i<7; i++) coeff[i] = 0; - } - - double A() { return coeff[1]; } - double B() { return coeff[2]; } - double C() { return coeff[3]; } - double D() { return coeff[4]; } - double E() { return coeff[5]; } - double F() { return coeff[6]; } + static constexpr int COEFF_SIZE = 7; + double coeff[COEFF_SIZE]; // coeff[1] = A + + // Constructor using an initialization list + EllipseEquation() : coeff{} {} + + // Accessor functions marked as const + double A() const { return coeff[1]; } + double B() const { return coeff[2]; } + double C() const { return coeff[3]; } + double D() const { return coeff[4]; } + double E() const { return coeff[5]; } + double F() const { return coeff[6]; } }; // ================================ CIRCLES ================================ diff --git a/modules/ximgproc/test/test_fld.cpp b/modules/ximgproc/test/test_fld.cpp index 24022a38087..738597b4aae 100644 --- a/modules/ximgproc/test/test_fld.cpp +++ b/modules/ximgproc/test/test_fld.cpp @@ -7,7 +7,7 @@ namespace opencv_test { namespace { const Size img_size(320, 240); const int FLD_TEST_SEED = 0x134679; -const int EPOCHS = 10; +const int EPOCHS = 5; class FLDBase : public testing::Test { @@ -44,6 +44,7 @@ class ximgproc_ED: public FLDBase { detector = createEdgeDrawing(); } + string filename = cvtest::TS::ptr()->get_data_path() + "cv/imgproc/beads.jpg"; protected: Ptr detector; @@ -275,16 +276,18 @@ TEST_F(ximgproc_ED, rotatedRect) 
ASSERT_EQ(EPOCHS, passedtests); } -TEST_F(ximgproc_ED, ManySmallCircles) +TEST_F(ximgproc_ED, detectLinesAndEllipses) { - string picture_name = "cv/imgproc/beads.jpg"; + Mat gray_image; + vector ellipses; - string filename = cvtest::TS::ptr()->get_data_path() + picture_name; - test_image = imread(filename, IMREAD_GRAYSCALE); + test_image = imread(filename); EXPECT_FALSE(test_image.empty()) << "Invalid test image: " << filename; - vector ellipses; - detector->detectEdges(test_image); + cvtColor(test_image, test_image, COLOR_BGR2BGRA); + cvtColor(test_image, gray_image, COLOR_BGR2GRAY); + + detector->detectEdges(gray_image); detector->detectEllipses(ellipses); detector->detectLines(lines); @@ -295,5 +298,34 @@ TEST_F(ximgproc_ED, ManySmallCircles) EXPECT_GE(lines.size(), lines_size); EXPECT_LE(lines.size(), lines_size + 2); EXPECT_EQ(ellipses.size(), ellipses_size); + + detector->params.PFmode = true; + + detector->detectEdges(gray_image); + detector->detectEllipses(ellipses); + detector->detectLines(lines); + + segments_size = 2717; + lines_size = 6197; + ellipses_size = 2446; + EXPECT_EQ(detector->getSegments().size(), segments_size); + EXPECT_GE(lines.size(), lines_size); + EXPECT_LE(lines.size(), lines_size + 2); + EXPECT_EQ(ellipses.size(), ellipses_size); + + detector->params.MinLineLength = 10; + + detector->detectEdges(test_image); + detector->detectEllipses(ellipses); + detector->detectLines(lines); + detector->detectEllipses(ellipses); + segments_size = 6230; + lines_size = 11133; + ellipses_size = 2431; + EXPECT_EQ(detector->getSegments().size(), segments_size); + EXPECT_GE(lines.size(), lines_size); + EXPECT_LE(lines.size(), lines_size + 2); + EXPECT_GE(ellipses.size(), ellipses_size); + EXPECT_LE(ellipses.size(), ellipses_size + 2); } }} // namespace From 57764e18493eade8d6ca4206072b5b8ea70b8ada Mon Sep 17 00:00:00 2001 From: shyama7004 Date: Thu, 10 Apr 2025 20:28:11 +0530 Subject: [PATCH 2/6] enhance multi_camera_tutorial for clarity --- 
.../ccalib/tutorials/multi_camera_tutorial.markdown | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/ccalib/tutorials/multi_camera_tutorial.markdown b/modules/ccalib/tutorials/multi_camera_tutorial.markdown index d94da981b0a..9e8bf88192f 100644 --- a/modules/ccalib/tutorials/multi_camera_tutorial.markdown +++ b/modules/ccalib/tutorials/multi_camera_tutorial.markdown @@ -5,7 +5,7 @@ This tutorial will show how to use the multiple camera calibration toolbox. This Random Pattern Calibration Object ------------------------------- -The random pattern is an image that is randomly generated. It is "random" so that it has many feature points. After generating it, one print it out and use it as a calibration object. The following two images are random pattern and a photo taken for it. +The random pattern is an image that is randomly generated. It is "random" so that it has many feature points. After generating it, print it out and use it as a calibration object. The following two images are random pattern and a photo taken for it. ![image](img/random_pattern.jpg) ![image](img/pattern_img.jpg) @@ -14,7 +14,7 @@ To generate a random pattern, use the class ```cv::randpattern::RandomPatternGen ``` cv::randpattern::RandomPatternGenerator generator(width, height); generator.generatePattern(); -pattern = generator.getPattern(); +cv::Mat pattern = generator.getPattern(); ``` Here ```width``` and ```height``` are width and height of pattern image. After getting the pattern, print it out and take some photos of it. @@ -26,7 +26,7 @@ finder.computeObjectImagePoints(vecImg); vector objectPoints = finder.getObjectPoints(); vector imagePoints = finder.getImagePoints(); ``` -Here variable ```patternWidth``` and ```patternHeight``` are physical pattern width and height with some user defined unit. ```vecImg``` is a vector of images that stores calibration images. 
+Here the variables ```patternWidth``` and ```patternHeight``` refer to the physical dimensions of the calibration object in the chosen unit of measurement. ```vecImg``` is a vector of images that stores calibration images. Second, use calibration functions like ```cv::calibrateCamera``` or ```cv::omnidir::calibrate``` to calibrate camera. @@ -34,7 +34,7 @@ Multiple Cameras Calibration ------------------------------- Now we move to multiple camera calibration, so far this toolbox must use random pattern object. -To calibrate multiple cameras, we first need to take some photos of random pattern. Of cause, to calibrate the extrinsic parameters, one pattern need to be viewed by multiple cameras (at least two) at the same time. Another thing is that to help the program know which camera and which pattern the photo is taken, the image file should be named as "cameraIdx-timestamp.*". Photos with same timestamp means that they are the same object taken by several cameras. In addition, cameraIdx should start from 0. Some examples of files names are "0-129.png", "0-187.png", "1-187", "2-129". +To calibrate multiple cameras, we first need to take some photos of random pattern. Of course, to calibrate the extrinsic parameters, one pattern needs to be viewed by multiple cameras (at least two) at the same time. Another thing is that to help the program know which camera and which pattern the photo is taken, the image file should be named as "cameraIdx-timestamp.*". Photos with same timestamp means that they are the same object taken by several cameras. In addition, cameraIdx should start from 0. Some examples of files names are "0-129.png", "0-187.png", "1-187", "2-129". 
Then, we can run multiple cameras calibration as ``` @@ -42,4 +42,4 @@ cv::multicalib::MultiCameraCalibration multiCalib(cameraType, nCamera, inputFile multiCalib.run(); multiCalib.writeParameters(outputFilename); ``` -Here ```cameraType``` indicates the camera type, ```multicalib::MultiCameraCalibration::PINHOLE``` and ```multicalib::MultiCameraCalibration::OMNIDIRECTIONAL``` are supported. For omnidirectional camera, you can refer to ```cv::omnidir``` module for detail. ```nCamera``` is the number of camers. ```inputFilename``` is the name of a file generated by ```imagelist_creator``` from ```opencv/sample```. It stores names of random pattern and calibration images, the first file name is the name of random pattern. ```patternWidth``` and ```patternHeight``` are physical width and height of pattern. ```showFeatureExtraction``` is a flags to indicate whether show feature extraction process. ```nMiniMatches``` is a minimal points that should be detected in each frame, otherwise this frame will be abandoned. ```outputFilename``` is a xml file name to store parameters. +Here ```cameraType``` indicates the camera type, ```multicalib::MultiCameraCalibration::PINHOLE``` and ```multicalib::MultiCameraCalibration::OMNIDIRECTIONAL``` are supported. For omnidirectional camera, you can refer to ```cv::omnidir``` module for detail. ```nCamera``` is the number of cameras. ```inputFilename``` is the name of a file generated by ```imagelist_creator``` from ```opencv/sample```. It stores names of random pattern and calibration images, the first file name is the name of random pattern. ```patternWidth``` and ```patternHeight``` represents the physical width and height of the pattern. ```showFeatureExtraction``` is a boolean flag that determines whether the feature extraction process is displayed. ```nMiniMatches``` is the minimum number of points that should be detected in each frame, otherwise this frame will be abandoned. 
```outputFilename``` is an XML file that will store the calibration parameters. From 6329974d4dd23f736f5cea6145ecc245d7e53940 Mon Sep 17 00:00:00 2001 From: Troels Ynddal Date: Thu, 19 Jun 2025 15:38:53 +0200 Subject: [PATCH 3/6] Merge pull request #3943 from troelsy:4.x Add Otsu's method to cv::cuda::threshold #3943 I implemented Otsu's method in CUDA for a separate project and want to add it to cv::cuda::threshold I have made an effort to use existing OpenCV functions in my code, but I had some trouble with `ThresholdTypes` and `cv::cuda::calcHist`. I couldn't figure out how to include `precomp.hpp` to get the definition of `ThresholdTypes`. For `cv::cuda::calcHist` I tried adding `opencv_cudaimgproc`, but it creates a circular dependency on `cudaarithm`. I have included a simple implementation of `calcHist` so the code runs, but I would like input on how to use `cv::cuda::calcHist` instead. ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [ ] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. 
- [ ] The feature is well documented and sample code can be built with the project CMake --- .../cudaarithm/include/opencv2/cudaarithm.hpp | 10 +- modules/cudaarithm/src/cuda/threshold.cu | 222 +++++++++++++++++- .../test/test_element_operations.cpp | 50 +++- .../opencv2/cudev/util/saturate_cast.hpp | 2 + .../include/opencv2/cudev/warp/shuffle.hpp | 10 + 5 files changed, 289 insertions(+), 5 deletions(-) diff --git a/modules/cudaarithm/include/opencv2/cudaarithm.hpp b/modules/cudaarithm/include/opencv2/cudaarithm.hpp index e5ab2cf2575..80b4dd8fac7 100644 --- a/modules/cudaarithm/include/opencv2/cudaarithm.hpp +++ b/modules/cudaarithm/include/opencv2/cudaarithm.hpp @@ -546,12 +546,16 @@ static inline void scaleAdd(InputArray src1, double alpha, InputArray src2, Outp /** @brief Applies a fixed-level threshold to each array element. +The special value cv::THRESH_OTSU may be combined with one of the other types. In this case, the function determines the +optimal threshold value using the Otsu's algorithm and uses it instead of the specified threshold. The function returns the +computed threshold value in addition to the thresholded matrix. +The Otsu's method is implemented only for 8-bit matrices. + @param src Source array (single-channel). -@param dst Destination array with the same size and type as src . +@param dst Destination array with the same size and type as src. @param thresh Threshold value. @param maxval Maximum value to use with THRESH_BINARY and THRESH_BINARY_INV threshold types. -@param type Threshold type. For details, see threshold . The THRESH_OTSU and THRESH_TRIANGLE -threshold types are not supported. +@param type Threshold type. For details, see threshold. The THRESH_TRIANGLE threshold type is not supported. @param stream Stream for the asynchronous version. 
@sa threshold diff --git a/modules/cudaarithm/src/cuda/threshold.cu b/modules/cudaarithm/src/cuda/threshold.cu index cfa88dfae7c..c5586e61ab7 100644 --- a/modules/cudaarithm/src/cuda/threshold.cu +++ b/modules/cudaarithm/src/cuda/threshold.cu @@ -95,12 +95,232 @@ namespace } } -double cv::cuda::threshold(InputArray _src, OutputArray _dst, double thresh, double maxVal, int type, Stream& stream) + +__global__ void otsu_sums(uint *histogram, uint *threshold_sums, unsigned long long *sums) +{ + const uint32_t n_bins = 256; + + __shared__ uint shared_memory_ts[n_bins]; + __shared__ unsigned long long shared_memory_s[n_bins]; + + int bin_idx = threadIdx.x; + int threshold = blockIdx.x; + + uint threshold_sum_above = 0; + unsigned long long sum_above = 0; + + if (bin_idx >= threshold) + { + uint value = histogram[bin_idx]; + threshold_sum_above = value; + sum_above = value * bin_idx; + } + + blockReduce(shared_memory_ts, threshold_sum_above, bin_idx, plus()); + blockReduce(shared_memory_s, sum_above, bin_idx, plus()); + + if (bin_idx == 0) + { + threshold_sums[threshold] = threshold_sum_above; + sums[threshold] = sum_above; + } +} + +__global__ void +otsu_variance(float2 *variance, uint *histogram, uint *threshold_sums, unsigned long long *sums) +{ + const uint32_t n_bins = 256; + + __shared__ signed long long shared_memory_a[n_bins]; + __shared__ signed long long shared_memory_b[n_bins]; + + int bin_idx = threadIdx.x; + int threshold = blockIdx.x; + + uint n_samples = threshold_sums[0]; + uint n_samples_above = threshold_sums[threshold]; + uint n_samples_below = n_samples - n_samples_above; + + unsigned long long total_sum = sums[0]; + unsigned long long sum_above = sums[threshold]; + unsigned long long sum_below = total_sum - sum_above; + + float threshold_variance_above_f32 = 0; + float threshold_variance_below_f32 = 0; + if (bin_idx >= threshold) + { + float mean = (float) sum_above / n_samples_above; + float sigma = bin_idx - mean; + threshold_variance_above_f32 = 
sigma * sigma; + } + else + { + float mean = (float) sum_below / n_samples_below; + float sigma = bin_idx - mean; + threshold_variance_below_f32 = sigma * sigma; + } + + uint bin_count = histogram[bin_idx]; + signed long long threshold_variance_above_i64 = (signed long long)(threshold_variance_above_f32 * bin_count); + signed long long threshold_variance_below_i64 = (signed long long)(threshold_variance_below_f32 * bin_count); + blockReduce(shared_memory_a, threshold_variance_above_i64, bin_idx, plus()); + blockReduce(shared_memory_b, threshold_variance_below_i64, bin_idx, plus()); + + if (bin_idx == 0) + { + variance[threshold] = make_float2(threshold_variance_above_i64, threshold_variance_below_i64); + } +} + + +__global__ void +otsu_score(uint *otsu_threshold, uint *threshold_sums, float2 *variance) +{ + const uint32_t n_thresholds = 256; + + __shared__ float shared_memory[n_thresholds / WARP_SIZE]; + + int threshold = threadIdx.x; + + uint n_samples = threshold_sums[0]; + uint n_samples_above = threshold_sums[threshold]; + uint n_samples_below = n_samples - n_samples_above; + + float threshold_mean_above = (float)n_samples_above / n_samples; + float threshold_mean_below = (float)n_samples_below / n_samples; + + float2 variances = variance[threshold]; + float variance_above = variances.x / n_samples_above; + float variance_below = variances.y / n_samples_below; + + float above = threshold_mean_above * variance_above; + float below = threshold_mean_below * variance_below; + float score = above + below; + + float original_score = score; + + blockReduce(shared_memory, score, threshold, minimum()); + + if (threshold == 0) + { + shared_memory[0] = score; + } + __syncthreads(); + + score = shared_memory[0]; + + // We found the minimum score, but we need to find the threshold. 
If we find the thread with the minimum score, we + // know which threshold it is + if (original_score == score) + { + *otsu_threshold = threshold - 1; + } +} + +void compute_otsu(uint *histogram, uint *otsu_threshold, Stream &stream) +{ + const uint n_bins = 256; + const uint n_thresholds = 256; + + cudaStream_t cuda_stream = StreamAccessor::getStream(stream); + + dim3 block_all(n_bins); + dim3 grid_all(n_thresholds); + dim3 block_score(n_thresholds); + dim3 grid_score(1); + + BufferPool pool(stream); + GpuMat gpu_threshold_sums(1, n_bins, CV_32SC1, pool.getAllocator()); + GpuMat gpu_sums(1, n_bins, CV_64FC1, pool.getAllocator()); + GpuMat gpu_variances(1, n_bins, CV_32FC2, pool.getAllocator()); + + otsu_sums<<>>( + histogram, gpu_threshold_sums.ptr(), gpu_sums.ptr()); + otsu_variance<<>>( + gpu_variances.ptr(), histogram, gpu_threshold_sums.ptr(), gpu_sums.ptr()); + otsu_score<<>>( + otsu_threshold, gpu_threshold_sums.ptr(), gpu_variances.ptr()); +} + +// TODO: Replace this is cv::cuda::calcHist +template +__global__ void histogram_kernel( + uint *histogram, const uint8_t *image, uint width, + uint height, uint pitch) +{ + __shared__ uint local_histogram[n_bins]; + + uint x = blockIdx.x * blockDim.x + threadIdx.x; + uint y = blockIdx.y * blockDim.y + threadIdx.y; + uint tid = threadIdx.y * blockDim.x + threadIdx.x; + + if (tid < n_bins) + { + local_histogram[tid] = 0; + } + + __syncthreads(); + + if (x < width && y < height) + { + uint8_t value = image[y * pitch + x]; + atomicInc(&local_histogram[value], 0xFFFFFFFF); + } + + __syncthreads(); + + if (tid < n_bins) + { + cv::cudev::atomicAdd(&histogram[tid], local_histogram[tid]); + } +} + +// TODO: Replace this with cv::cuda::calcHist +void calcHist( + const GpuMat src, GpuMat histogram, Stream stream) +{ + const uint n_bins = 256; + + cudaStream_t cuda_stream = StreamAccessor::getStream(stream); + + dim3 block(128, 4, 1); + dim3 grid = dim3(divUp(src.cols, block.x), divUp(src.rows, block.y), 1); + 
CV_CUDEV_SAFE_CALL(cudaMemsetAsync(histogram.ptr(), 0, n_bins * sizeof(uint), cuda_stream)); + histogram_kernel + <<>>( + histogram.ptr(), src.ptr(), (uint) src.cols, (uint) src.rows, (uint) src.step); +} + +double cv::cuda::threshold(InputArray _src, OutputArray _dst, double thresh, double maxVal, int type, Stream &stream) { GpuMat src = getInputMat(_src, stream); const int depth = src.depth(); + const int THRESH_OTSU = 8; + if ((type & THRESH_OTSU) == THRESH_OTSU) + { + CV_Assert(depth == CV_8U); + CV_Assert(src.channels() == 1); + + BufferPool pool(stream); + + // Find the threshold using Otsu and then run the normal thresholding algorithm + GpuMat gpu_histogram(256, 1, CV_32SC1, pool.getAllocator()); + calcHist(src, gpu_histogram, stream); + + GpuMat gpu_otsu_threshold(1, 1, CV_32SC1, pool.getAllocator()); + compute_otsu(gpu_histogram.ptr(), gpu_otsu_threshold.ptr(), stream); + + cv::Mat mat_otsu_threshold; + gpu_otsu_threshold.download(mat_otsu_threshold, stream); + stream.waitForCompletion(); + + // Overwrite the threshold value with the Otsu value and remove the Otsu flag from the type + type = type & ~THRESH_OTSU; + thresh = (double) mat_otsu_threshold.at(0); + } + CV_Assert( depth <= CV_64F ); CV_Assert( type <= 4 /*THRESH_TOZERO_INV*/ ); diff --git a/modules/cudaarithm/test/test_element_operations.cpp b/modules/cudaarithm/test/test_element_operations.cpp index a15ad7b3ee5..f3ce3b52020 100644 --- a/modules/cudaarithm/test/test_element_operations.cpp +++ b/modules/cudaarithm/test/test_element_operations.cpp @@ -2529,7 +2529,7 @@ INSTANTIATE_TEST_CASE_P(CUDA_Arithm, AddWeighted, testing::Combine( /////////////////////////////////////////////////////////////////////////////////////////////////////// // Threshold -CV_ENUM(ThreshOp, cv::THRESH_BINARY, cv::THRESH_BINARY_INV, cv::THRESH_TRUNC, cv::THRESH_TOZERO, cv::THRESH_TOZERO_INV) +CV_ENUM(ThreshOp, cv::THRESH_BINARY, cv::THRESH_BINARY_INV, cv::THRESH_TRUNC, cv::THRESH_TOZERO, cv::THRESH_TOZERO_INV, 
cv::THRESH_OTSU) #define ALL_THRESH_OPS testing::Values(ThreshOp(cv::THRESH_BINARY), ThreshOp(cv::THRESH_BINARY_INV), ThreshOp(cv::THRESH_TRUNC), ThreshOp(cv::THRESH_TOZERO), ThreshOp(cv::THRESH_TOZERO_INV)) PARAM_TEST_CASE(Threshold, cv::cuda::DeviceInfo, cv::Size, MatType, Channels, ThreshOp, UseRoi) @@ -2577,6 +2577,54 @@ INSTANTIATE_TEST_CASE_P(CUDA_Arithm, Threshold, testing::Combine( ALL_THRESH_OPS, WHOLE_SUBMAT)); +/////////////////////////////////////////////////////////////////////////////////////////////////////// +// ThresholdOtsu + +PARAM_TEST_CASE(ThresholdOtsu, cv::cuda::DeviceInfo, cv::Size, MatType, Channels, ThreshOp, UseRoi) +{ + cv::cuda::DeviceInfo devInfo; + cv::Size size; + int type; + int channel; + int threshOp; + bool useRoi; + + virtual void SetUp() + { + devInfo = GET_PARAM(0); + size = GET_PARAM(1); + type = GET_PARAM(2); + channel = GET_PARAM(3); + threshOp = GET_PARAM(4) | cv::THRESH_OTSU; + useRoi = GET_PARAM(5); + + cv::cuda::setDevice(devInfo.deviceID()); + } +}; + +CUDA_TEST_P(ThresholdOtsu, Accuracy) +{ + cv::Mat src = randomMat(size, CV_MAKE_TYPE(type, channel)); + + cv::cuda::GpuMat dst = createMat(src.size(), src.type(), useRoi); + double otsu_gpu = cv::cuda::threshold(loadMat(src, useRoi), dst, 0, 255, threshOp); + + cv::Mat dst_gold; + double otsu_cpu = cv::threshold(src, dst_gold, 0, 255, threshOp); + + ASSERT_DOUBLE_EQ(otsu_gpu, otsu_cpu); + EXPECT_MAT_NEAR(dst_gold, dst, 0.0); +} + +INSTANTIATE_TEST_CASE_P(CUDA_Arithm, ThresholdOtsu, testing::Combine( + ALL_DEVICES, + DIFFERENT_SIZES, + testing::Values(MatDepth(CV_8U)), + testing::Values(Channels(1)), + ALL_THRESH_OPS, + WHOLE_SUBMAT)); + + //////////////////////////////////////////////////////////////////////////////// // InRange diff --git a/modules/cudev/include/opencv2/cudev/util/saturate_cast.hpp b/modules/cudev/include/opencv2/cudev/util/saturate_cast.hpp index 59fd8da45ac..c256e7d908c 100644 --- a/modules/cudev/include/opencv2/cudev/util/saturate_cast.hpp +++ 
b/modules/cudev/include/opencv2/cudev/util/saturate_cast.hpp @@ -62,6 +62,8 @@ template __device__ __forceinline__ T saturate_cast(ushort v) { ret template __device__ __forceinline__ T saturate_cast(short v) { return T(v); } template __device__ __forceinline__ T saturate_cast(uint v) { return T(v); } template __device__ __forceinline__ T saturate_cast(int v) { return T(v); } +template __device__ __forceinline__ T saturate_cast(signed long long v) { return T(v); } +template __device__ __forceinline__ T saturate_cast(unsigned long long v) { return T(v); } template __device__ __forceinline__ T saturate_cast(float v) { return T(v); } template __device__ __forceinline__ T saturate_cast(double v) { return T(v); } diff --git a/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp b/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp index f00d1f850d0..0de5351fff3 100644 --- a/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp +++ b/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp @@ -332,6 +332,16 @@ __device__ __forceinline__ uint shfl_down(uint val, uint delta, int width = warp return (uint) __shfl_down((int) val, delta, width); } +__device__ __forceinline__ signed long long shfl_down(signed long long val, uint delta, int width = warpSize) +{ + return __shfl_down(val, delta, width); +} + +__device__ __forceinline__ unsigned long long shfl_down(unsigned long long val, uint delta, int width = warpSize) +{ + return (unsigned long long) __shfl_down(val, delta, width); +} + __device__ __forceinline__ float shfl_down(float val, uint delta, int width = warpSize) { return __shfl_down(val, delta, width); From b72527fbcaeaff29b43cd387abb619052194786e Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Fri, 27 Jun 2025 17:04:23 +0300 Subject: [PATCH 4/6] Fixed out-of-bound access in CUDA Otsu threshold implementation. 
--- modules/cudaarithm/src/cuda/threshold.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/cudaarithm/src/cuda/threshold.cu b/modules/cudaarithm/src/cuda/threshold.cu index c5586e61ab7..6199081ba9e 100644 --- a/modules/cudaarithm/src/cuda/threshold.cu +++ b/modules/cudaarithm/src/cuda/threshold.cu @@ -178,7 +178,7 @@ otsu_score(uint *otsu_threshold, uint *threshold_sums, float2 *variance) { const uint32_t n_thresholds = 256; - __shared__ float shared_memory[n_thresholds / WARP_SIZE]; + __shared__ float shared_memory[n_thresholds]; int threshold = threadIdx.x; From d6e2199e6959bcf795e2a3d2f714e9ac5ef67400 Mon Sep 17 00:00:00 2001 From: Aakash Preetam Date: Thu, 3 Jul 2025 15:51:17 +0530 Subject: [PATCH 5/6] Modified QcAllocator to store File Descriptor --- modules/fastcv/include/opencv2/fastcv/allocator.hpp | 5 ++++- modules/fastcv/src/allocator.cpp | 8 +++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/modules/fastcv/include/opencv2/fastcv/allocator.hpp b/modules/fastcv/include/opencv2/fastcv/allocator.hpp index a70666723ca..1b568bde332 100644 --- a/modules/fastcv/include/opencv2/fastcv/allocator.hpp +++ b/modules/fastcv/include/opencv2/fastcv/allocator.hpp @@ -36,12 +36,15 @@ class QcResourceManager { /** * @brief Qualcomm's custom allocator. * This allocator uses Qualcomm's memory management functions. + * + * Note: The userdata field of cv::UMatData is used to store the file descriptor (fd) of the allocated memory. 
+ * */ class QcAllocator : public cv::MatAllocator { public: QcAllocator(); ~QcAllocator(); - + cv::UMatData* allocate(int dims, const int* sizes, int type, void* data0, size_t* step, cv::AccessFlag flags, cv::UMatUsageFlags usageFlags) const CV_OVERRIDE; bool allocate(cv::UMatData* u, cv::AccessFlag accessFlags, cv::UMatUsageFlags usageFlags) const CV_OVERRIDE; void deallocate(cv::UMatData* u) const CV_OVERRIDE; diff --git a/modules/fastcv/src/allocator.cpp b/modules/fastcv/src/allocator.cpp index 83147d2354a..6658b00d4c6 100644 --- a/modules/fastcv/src/allocator.cpp +++ b/modules/fastcv/src/allocator.cpp @@ -55,13 +55,19 @@ cv::UMatData* QcAllocator::allocate(int dims, const int* sizes, int type, } total *= sizes[i]; } - uchar* data = data0 ? (uchar*)data0 : (uchar*)fcvHwMemAlloc(total, 16); + + int fd = -1; + uchar* data = data0 ? (uchar*)data0 : (uchar*)fcvHwMemAlloc(total, 16, &fd); cv::UMatData* u = new cv::UMatData(this); u->data = u->origdata = data; u->size = total; if(data0) u->flags |= cv::UMatData::USER_ALLOCATED; + // Store FD in userdata (cast to void*) + if (fd >= 0) + u->userdata = reinterpret_cast(static_cast(fd)); + // Add to active allocations cv::fastcv::QcResourceManager::getInstance().addAllocation(data); From 84024547e35e8e66e3a011c2b3adb3149760c4af Mon Sep 17 00:00:00 2001 From: Troels Ynddal Date: Thu, 17 Jul 2025 08:36:33 +0200 Subject: [PATCH 6/6] Merge pull request #3970 from troelsy:4.x MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix race condition in Otsu's method #3970 I found that in some cases, Otsu's method can have two or more equal thresholds. In case of the CUDA implementation, this would create a race-condition where the last thread to write the output gets to choose which Otsu threshold to use. The resulting image from cv::cuda::threshold would be the same, but the returned threshold would vary for each run. 
I solve this by first doing a poll to check if there are multiple matches with `__syncthreads_count`. If not, we can return the threshold as before. If there are more than one, the kernel does a minimum reduction to find the lowest threshold that matches the otsu score. It doesn't matter which threshold you choose as long as it is consistent, so I choose the smallest one. I'm unsure when `__syncthreads_count` was introduced, but it is compatible with CC≥2.0. CUDA Toolkit's archive only goes back to version 8.0, but it was documented back then (https://docs.nvidia.com/cuda/archive) ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [ ] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. 
- [ ] The feature is well documented and sample code can be built with the project CMake --- modules/cudaarithm/src/cuda/threshold.cu | 46 ++++++++++++++----- .../test/test_element_operations.cpp | 44 ++++++++++++++++++ 2 files changed, 79 insertions(+), 11 deletions(-) diff --git a/modules/cudaarithm/src/cuda/threshold.cu b/modules/cudaarithm/src/cuda/threshold.cu index 6199081ba9e..5c60161af98 100644 --- a/modules/cudaarithm/src/cuda/threshold.cu +++ b/modules/cudaarithm/src/cuda/threshold.cu @@ -98,7 +98,7 @@ namespace __global__ void otsu_sums(uint *histogram, uint *threshold_sums, unsigned long long *sums) { - const uint32_t n_bins = 256; + const uint n_bins = 256; __shared__ uint shared_memory_ts[n_bins]; __shared__ unsigned long long shared_memory_s[n_bins]; @@ -109,7 +109,7 @@ __global__ void otsu_sums(uint *histogram, uint *threshold_sums, unsigned long l uint threshold_sum_above = 0; unsigned long long sum_above = 0; - if (bin_idx >= threshold) + if (bin_idx > threshold) { uint value = histogram[bin_idx]; threshold_sum_above = value; @@ -129,7 +129,7 @@ __global__ void otsu_sums(uint *histogram, uint *threshold_sums, unsigned long l __global__ void otsu_variance(float2 *variance, uint *histogram, uint *threshold_sums, unsigned long long *sums) { - const uint32_t n_bins = 256; + const uint n_bins = 256; __shared__ signed long long shared_memory_a[n_bins]; __shared__ signed long long shared_memory_b[n_bins]; @@ -147,7 +147,7 @@ otsu_variance(float2 *variance, uint *histogram, uint *threshold_sums, unsigned float threshold_variance_above_f32 = 0; float threshold_variance_below_f32 = 0; - if (bin_idx >= threshold) + if (bin_idx > threshold) { float mean = (float) sum_above / n_samples_above; float sigma = bin_idx - mean; @@ -172,11 +172,35 @@ otsu_variance(float2 *variance, uint *histogram, uint *threshold_sums, unsigned } } +template +__device__ bool has_lowest_score( + uint threshold, float original_score, float score, uint *shared_memory +) { + // It may 
happen that multiple threads have the same minimum score. In that case, we want to find the thread with + // the lowest threshold. This is done by calling '__syncthreads_count' to count how many threads have a score + // that matches to the minimum score found. Since this is rare, we will optimize towards the common case where only + // one thread has the minimum score. If multiple threads have the same minimum score, we will find the minimum + // threshold that satisfies the condition + bool has_match = original_score == score; + uint matches = __syncthreads_count(has_match); + + if(matches > 1) { + // If this thread has a match, we use it; otherwise we give it a value that is larger than the maximum + // threshold, so it will never get picked + uint min_threshold = has_match ? threshold : n_thresholds; + + blockReduce(shared_memory, min_threshold, threshold, minimum()); + + return min_threshold == threshold; + } else { + return has_match; + } +} __global__ void otsu_score(uint *otsu_threshold, uint *threshold_sums, float2 *variance) { - const uint32_t n_thresholds = 256; + const uint n_thresholds = 256; __shared__ float shared_memory[n_thresholds]; @@ -190,8 +214,8 @@ otsu_score(uint *otsu_threshold, uint *threshold_sums, float2 *variance) float threshold_mean_above = (float)n_samples_above / n_samples; float threshold_mean_below = (float)n_samples_below / n_samples; float2 variances = variance[threshold]; - float variance_above = variances.x / n_samples_above; - float variance_below = variances.y / n_samples_below; + float variance_above = n_samples_above > 0 ? variances.x / n_samples_above : 0.0f; + float variance_below = n_samples_below > 0 ? variances.y / n_samples_below : 0.0f; float above = threshold_mean_above * variance_above; float below = threshold_mean_below * variance_below; @@ -209,11 +233,11 @@ otsu_score(uint *otsu_threshold, uint *threshold_sums, float2 *variance) score = shared_memory[0]; - // We found the minimum score, but we need to find the threshold. 
If we find the thread with the minimum score, we - // know which threshold it is - if (original_score == score) + // We found the minimum score, but in some cases multiple threads can have the same score, so we need to find the + // lowest threshold + if (has_lowest_score(threshold, original_score, score, (uint *) shared_memory)) { - *otsu_threshold = threshold - 1; + *otsu_threshold = threshold; } } diff --git a/modules/cudaarithm/test/test_element_operations.cpp b/modules/cudaarithm/test/test_element_operations.cpp index f3ce3b52020..f4628ad7f1f 100644 --- a/modules/cudaarithm/test/test_element_operations.cpp +++ b/modules/cudaarithm/test/test_element_operations.cpp @@ -2625,6 +2625,50 @@ INSTANTIATE_TEST_CASE_P(CUDA_Arithm, ThresholdOtsu, testing::Combine( WHOLE_SUBMAT)); +/////////////////////////////////////////////////////////////////////////////////////////////////////// +// ThresholdOtsuMulti Multiple Valid Thresholds + +PARAM_TEST_CASE(ThresholdOtsuMulti, cv::cuda::DeviceInfo, cv::Size, MatType, Channels) +{ + cv::cuda::DeviceInfo devInfo; + cv::Size size; + int type; + int channel; + + virtual void SetUp() + { + devInfo = GET_PARAM(0); + size = GET_PARAM(1); + type = GET_PARAM(2); + channel = GET_PARAM(3); + + cv::cuda::setDevice(devInfo.deviceID()); + } +}; + +CUDA_TEST_P(ThresholdOtsuMulti, Accuracy) +{ + cv::Mat src = Mat(size, CV_MAKE_TYPE(type, channel), cv::Scalar(0)); + src.colRange(src.cols / 2, src.cols).setTo(255); + + cv::cuda::GpuMat dst = createMat(src.size(), src.type()); + double otsu_gpu = cv::cuda::threshold(loadMat(src), dst, 0, 255, cv::THRESH_BINARY | cv::THRESH_OTSU); + + cv::Mat dst_gold; + double otsu_cpu = cv::threshold(src, dst_gold, 0, 255, cv::THRESH_BINARY | cv::THRESH_OTSU); + + ASSERT_DOUBLE_EQ(otsu_gpu, otsu_cpu); + ASSERT_DOUBLE_EQ(otsu_gpu, 0); + EXPECT_MAT_NEAR(dst_gold, dst, 0.0); +} + +INSTANTIATE_TEST_CASE_P(CUDA_Arithm, ThresholdOtsuMulti, testing::Combine( + ALL_DEVICES, + testing::Values(cv::Size(1, 100)), + 
testing::Values(MatDepth(CV_8U)), + testing::Values(Channels(1)))); + + //////////////////////////////////////////////////////////////////////////////// // InRange