PEC-CSS · ken1000minus7 · Feb 9, 2023 · Feb 10, 2023
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -60,4 +60,7 @@ add_library(slowmokit
         src/slowmokit/methods/metrics/recall.hpp
         src/slowmokit/methods/metrics/recall.cpp
         src/slowmokit/methods/metrics/mean_squared_error.hpp
-        src/slowmokit/methods/metrics/mean_squared_error.cpp)
+        src/slowmokit/methods/metrics/mean_squared_error.cpp
+        src/slowmokit/methods/cluster/DBSCAN/DBSCAN.cpp
+        src/slowmokit/methods/cluster/DBSCAN/DBSCAN.hpp
+        src/slowmokit/methods/cluster/DBSCAN.hpp)
diff --git a/docs/methods/cluster/DBSCAN.md b/docs/methods/cluster/DBSCAN.md
@@ -0,0 +1,52 @@
+# DBSCAN
+
+DBSCAN stands for **D**ensity **B**ased **S**patial **C**lustering of **A**pplications with **N**oise
+
+The model clusters the given training set based on density of the given data points i.e. a point belongs to a cluster based on how close it is to its neighbouring points. This model is capable of finding arbitrary shaped clusters and identifying outliers.
+
+## Parameters
+
+| Name        | Definition                                                                                  | Defaults | Type          |
+|-------------| ------------------------------------------------------------------------------------------- |----------|---------------|
+| `eps`       | Measure of how close a point should be to be considered in the vicinity of another point  | 0.5      | `long double` |
+| `minSamples` | Minimum number of other points (the point itself is not counted) that should lie within `eps` of a point for it to be considered a core point | 5        | `int`         |
+
+## Attributes
+
+| Name   | Definition                                                                         | Shape                             |
+|--------|------------------------------------------------------------------------------------|-----------------------------------|
+| `labels` | Labels assigned to each data point of the training set fitted into the model | No of data points in training set |
+
+## Methods
+
+| Name                                 | Definition                             | Return value  |
+|--------------------------------------|----------------------------------------|---------------|
+| `fit(std::vector<std::vector<T>> x)` | Fits and clusters the given training set                                       | `void`        |
+| `fitPredict(std::vector<std::vector<T>> x)` | Fits and clusters the given training set and returns the labels assigned to each data point         | `std::vector<int>` |
+| `getLabels()`                        | Returns the labels assigned to each data point of the training set fitted into the model | `vector<int>` |
+
+## Example
+
+```cpp
+DBSCAN<double> db(0.6, 4);
+std::vector<std::vector<double>> x = {
+            {1, 2},
+            {3, 4},
+            {2.5, 4},
+            {1.5, 2.5},
+            {3, 5},
+            {2.8, 4.5},
+            {2.5, 4.5},
+            {1.2, 2.5},
+            {1, 3},
+            {1, 5},
+            {1, 2.5},
+            {5, 6},
+            {4, 3}
+};
+std::vector<int> labels = db.fitPredict(x);
+std::cout << "X  Y  Cluster\n";
+for(int i = 0; i < x.size(); i++) {
+    std::cout << x[i][0] << "  " << x[i][1] << "  " << labels[i] << "\n";
+}
+```
diff --git a/examples/cluster/DBSCAN.cpp b/examples/cluster/DBSCAN.cpp
@@ -0,0 +1,26 @@
+//#include "../../src/slowmokit/methods/cluster/DBSCAN/DBSCAN.cpp"
+//
+//int main() {
+//    DBSCAN<double> db(0.6, 4);
+//    std::vector<std::vector<double>> x = {
+//            {1, 2},
+//            {3, 4},
+//            {2.5, 4},
+//            {1.5, 2.5},
+//            {3, 5},
+//            {2.8, 4.5},
+//            {2.5, 4.5},
+//            {1.2, 2.5},
+//            {1, 3},
+//            {1, 5},
+//            {1, 2.5},
+//            {5, 6},
+//            {4, 3}
+//    };
+//    std::vector<int> labels = db.fitPredict(x);
+//    std::cout << "X  Y  Cluster\n";
+//    for(int i = 0; i < x.size(); i++) {
+//        std::cout << x[i][0] << "  " << x[i][1] << "  " << labels[i] << "\n";
+//    }
+//    return 0;
+//}
diff --git a/src/slowmokit.hpp b/src/slowmokit.hpp
@@ -20,5 +20,6 @@
 #include "slowmokit/methods/neighbors/bernoulli_nb.hpp"
 #include "slowmokit/methods/neighbors/gaussian_nb.hpp"
 #include "slowmokit/methods/neighbors/knn.hpp"
+#include "slowmokit/methods/cluster/DBSCAN.hpp"
 
 #endif // SLOWMOKIT_HPP
diff --git a/src/slowmokit/methods/cluster/DBSCAN.hpp b/src/slowmokit/methods/cluster/DBSCAN.hpp
@@ -0,0 +1,12 @@
+/**
+ * @file methods/cluster/DBSCAN.hpp
+ *
+ * Easy include for DBSCAN algorithm
+ */
+
+// The guard name must differ from the one used by DBSCAN/DBSCAN.hpp
+// (SLOWMOKIT_DBSCAN_HPP): with a shared name this file would define the macro
+// first and the class header included below would be skipped entirely.
+#ifndef SLOWMOKIT_CLUSTER_DBSCAN_EASY_INCLUDE_HPP
+#define SLOWMOKIT_CLUSTER_DBSCAN_EASY_INCLUDE_HPP
+
+#include "DBSCAN/DBSCAN.hpp"
+
+#endif //SLOWMOKIT_CLUSTER_DBSCAN_EASY_INCLUDE_HPP
diff --git a/src/slowmokit/methods/cluster/DBSCAN/DBSCAN.cpp b/src/slowmokit/methods/cluster/DBSCAN/DBSCAN.cpp
@@ -0,0 +1,88 @@
+/**
+ * @file methods/cluster/DBSCAN/DBSCAN.cpp
+ *
+ * Implementation of the DBSCAN class
+ */
+
+#include "DBSCAN.hpp"
+
+/**
+ * Builds a DBSCAN model from a neighbourhood radius and a density threshold.
+ * @param eps largest distance at which two points are treated as neighbours
+ * @param minSamples neighbour count required for a point to become a core point
+ * @throws std::invalid_argument when eps or minSamples is negative
+ */
+template<class T>
+DBSCAN<T>::DBSCAN(long double eps, int minSamples) : eps(eps), minSamples(minSamples) {
+    // Inside the body the names refer to the constructor parameters.
+    const bool epsIsNegative = eps < 0;
+    const bool minSamplesIsNegative = minSamples < 0;
+    if(epsIsNegative || minSamplesIsNegative) {
+        throw std::invalid_argument("Values can't be negative");
+    }
+}
+
+/**
+ * Computes the euclidean distance between two feature vectors.
+ *
+ * Every coordinate is widened to long double before the subtraction, so an
+ * unsigned element type cannot wrap around and a narrow integer type cannot
+ * overflow while forming the difference.
+ * @param p1 the first feature vector
+ * @param p2 the second feature vector
+ * @return the euclidean distance between the two vectors
+ * @throws std::invalid_argument when the feature vectors are unequal in size
+ */
+template<class T>
+long double DBSCAN<T>::euclideanDistance(std::vector<T> p1, std::vector<T> p2) {
+    if(p1.size() != p2.size()) {
+        throw std::invalid_argument("Feature vectors are unequal in size");
+    }
+    long double squaredSum = 0.0;
+    // Index with the container's own size type to avoid narrowing size() to int.
+    for(typename std::vector<T>::size_type i = 0; i < p1.size(); i++) {
+        const long double diff = static_cast<long double>(p1[i]) - static_cast<long double>(p2[i]);
+        squaredSum += diff * diff;
+    }
+    return sqrtl(squaredSum);
+}
+
+/**
+ * Assigns `label` to point `i` and to every still-unlabelled point that can be
+ * reached from it by walking through core points.
+ *
+ * The expansion uses an explicit work stack instead of recursion, so the depth
+ * of a cluster is limited by heap memory rather than by the call stack. The
+ * resulting labels do not depend on the visiting order: a point is labelled
+ * exactly when it is unlabelled and reachable from `i` through core points.
+ * @param i index of the point the expansion starts from
+ * @param core flag per point, non-zero when the point is a core point
+ * @param neighbours neighbour indices per point; only read for core points
+ * @param label cluster label to assign; read only, never modified here
+ */
+template<class T>
+void DBSCAN<T>::cluster(int i, std::vector<int> &core, std::vector<std::vector<int>> &neighbours, int &label) {
+    std::vector<int> pending(1, i);
+    while(!pending.empty()) {
+        const int current = pending.back();
+        pending.pop_back();
+        // A point may be queued several times; only its first visit counts.
+        if(labels[current] != -1) {
+            continue;
+        }
+        labels[current] = label;
+        // Non-core (border) points join the cluster but do not extend it.
+        if(core[current] == 0) {
+            continue;
+        }
+        for(int next : neighbours[current]) {
+            if(labels[next] == -1) {
+                pending.push_back(next);
+            }
+        }
+    }
+}
+
+/**
+ * Fits the model: finds the neighbours of every point, marks core points and
+ * grows one cluster from each core point that is still unlabelled.
+ *
+ * Any labels from a previous fit are discarded. Points that end up in no
+ * cluster keep the label -1. A point is never counted as its own neighbour.
+ * @param x list of feature vectors to be clustered
+ * @throws std::invalid_argument (from euclideanDistance) when two compared
+ *         feature vectors differ in size
+ */
+template<class T>
+void DBSCAN<T>::fit(std::vector<std::vector<T>> x) {
+    const int n = static_cast<int>(x.size());
+
+    std::vector<int> core(n, 0);
+    std::vector<std::vector<int>> neighbours(n);
+
+    // Every point starts out as noise until a cluster claims it.
+    labels.assign(n, -1);
+
+    // The distance is symmetric, so each unordered pair is evaluated once and
+    // recorded for both endpoints. Because i and j both ascend, every
+    // neighbour list still ends up in ascending index order.
+    for(int i = 0; i < n; i++) {
+        for(int j = i + 1; j < n; j++) {
+            if(euclideanDistance(x[i], x[j]) <= eps) {
+                neighbours[i].push_back(j);
+                neighbours[j].push_back(i);
+            }
+        }
+    }
+
+    // Only core points keep their neighbour list; it is what cluster() expands.
+    for(int i = 0; i < n; i++) {
+        if(static_cast<int>(neighbours[i].size()) >= minSamples) {
+            core[i] = 1;
+        } else {
+            neighbours[i].clear();
+        }
+    }
+
+    // Cluster labels are handed out in increasing order, starting at 0.
+    int clusters = 0;
+    for(int i = 0; i < n; i++) {
+        if(core[i] == 0 || labels[i] != -1) {
+            continue;
+        }
+        cluster(i, core, neighbours, clusters);
+        clusters++;
+    }
+}
+
+/**
+ * Runs fit() on the given training set and hands back the resulting labels.
+ * @param x list of feature vectors
+ * @return copy of the label assigned to each data point
+ */
+template<class T>
+std::vector<int> DBSCAN<T>::fitPredict(std::vector<std::vector<T>> x) {
+    this->fit(x);
+    return this->labels;
+}
+
+/**
+ * Accessor for the labels produced by the most recent fit.
+ * @return copy of the stored label vector
+ */
+template<class T>
+std::vector<int> DBSCAN<T>::getLabels() {
+    std::vector<int> assigned(this->labels);
+    return assigned;
+}
diff --git a/src/slowmokit/methods/cluster/DBSCAN/DBSCAN.hpp b/src/slowmokit/methods/cluster/DBSCAN/DBSCAN.hpp
@@ -0,0 +1,82 @@
+/**
+ * @file methods/cluster/DBSCAN/DBSCAN.hpp
+ *
+ * The header file for DBSCAN
+ */
+#ifndef SLOWMOKIT_DBSCAN_HPP
+#define SLOWMOKIT_DBSCAN_HPP
+
+#include "core.hpp"
+/**
+ * Class carrying implementation of the DBSCAN (density based) clustering algorithm.
+ * Points are grouped by how many other points lie within `eps` of them; points that
+ * end up in no cluster are reported with the label -1.
+ * @tparam T type of the data to be clustered
+ */
+template<class T>
+class DBSCAN
+{
+private:
+
+    /**
+     * Largest distance at which two points are considered to be in each other's vicinity, default value is 0.5
+     */
+    long double eps;
+
+    /**
+     * Minimum number of other points (the point itself is not counted) that should lie in the vicinity of a point for it to be considered a core point, default value is 5
+     */
+    int minSamples;
+
+    /**
+     * Labels assigned to each data point after fitting, the values range from 0 to clusters - 1, outliers are assigned -1. Empty until fit is called.
+     */
+    std::vector<int> labels;
+
+    /**
+     * Evaluates the euclidean distance between two feature vectors
+     * @param p1 the first feature vector
+     * @param p2 the second feature vector
+     * @return the euclidean distance between the two vectors
+     * @throws invalid_argument exception when the feature vectors are unequal in size
+     */
+    long double euclideanDistance(std::vector<T> p1, std::vector<T> p2);
+
+    /**
+     * Helper function that assigns a cluster label to a point and, when the point is a core point, spreads the same label to its unlabelled neighbours
+     * @param i index of the point that is to be assigned a cluster
+     * @param core vector of flags, a non-zero entry marks the point at that index as a core point
+     * @param neighbours 2D vector carrying neighbours of each of the core points
+     * @param label label of the cluster to be assigned to this point (only read by this helper)
+     */
+    void cluster(int i, std::vector<int> &core, std::vector<std::vector<int>> &neighbours, int &label);
+
+public:
+
+    /**
+     * Constructor for creating an instance of the DBSCAN class
+     * @param eps measure of how close a point should be to be considered in the vicinity of another point, default is 0.5
+     * @param minSamples minimum number of points that should lie in the vicinity of a point for it to be considered a core point, default is 5
+     * @throws invalid_argument exception when eps or minSamples is less than 0
+     */
+    DBSCAN(long double eps = 0.5, int minSamples = 5);
+
+    /**
+     * Fits and clusters the given training set, replacing the labels of any previous fit
+     * @param x list of feature vectors to be clustered
+     * @throws invalid_argument exception when two compared feature vectors are unequal in size
+     */
+    void fit(std::vector<std::vector<T>> x);
+
+    /**
+     * Fits and clusters the given training set and returns the labels assigned to each data point
+     * @param x list of feature vectors
+     * @return vector of labels assigned to each data point
+     */
+    std::vector<int> fitPredict(std::vector<std::vector<T>> x);
+
+    /**
+     * Returns the labels assigned to each data point of the training set fitted into the model
+     * @return copy of the vector of labels assigned to each data point
+     */
+    std::vector<int> getLabels();
+};
+
+#endif //SLOWMOKIT_DBSCAN_HPP