Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -60,4 +60,7 @@ add_library(slowmokit
src/slowmokit/methods/metrics/recall.hpp
src/slowmokit/methods/metrics/recall.cpp
src/slowmokit/methods/metrics/mean_squared_error.hpp
src/slowmokit/methods/metrics/mean_squared_error.cpp)
src/slowmokit/methods/metrics/mean_squared_error.cpp
src/slowmokit/methods/cluster/DBSCAN/DBSCAN.cpp
src/slowmokit/methods/cluster/DBSCAN/DBSCAN.hpp
src/slowmokit/methods/cluster/DBSCAN.hpp)
52 changes: 52 additions & 0 deletions docs/methods/cluster/DBSCAN.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# DBSCAN

DBSCAN stands for **D**ensity **B**ased **S**patial **C**lustering of **A**pplications with **N**oise

The model clusters the given training set based on the density of the data points, i.e. a point is assigned to a cluster according to how close it is to its neighbouring points. This model is capable of finding arbitrarily shaped clusters and identifying outliers.

## Parameters

| Name | Definition | Defaults | Type |
|-------------| ------------------------------------------------------------------------------------------- |----------|---------------|
| `eps` | Measure of how close a point should be to be considered in the vicinity of another point | 0.5 | `long double` |
| `minSamples` | Minimum number of points that should lie in the vicinity of a point for it to be considered a core point | 5 | `int` |

## Attributes

| Name | Definition | Shape |
|--------|------------------------------------------------------------------------------------|-----------------------------------|
| `labels` | Labels assigned to each data point of the training set fitted into the model | Number of data points in the training set |

## Methods

| Name | Definition | Return value |
|--------------------------------------|----------------------------------------|---------------|
| `fit(std::vector<std::vector<T>> x)` | Fits and clusters the given training set | `void` |
| `fitPredict(std::vector<std::vector<T>> x)` | Fits and clusters the given training set and returns the labels assigned to each data point | `std::vector<int>` |
| `getLabels()` | Returns the labels assigned to each data point of the training set fitted into the model | `std::vector<int>` |

## Example

```cpp
DBSCAN<double> db(0.6, 4);
std::vector<std::vector<double>> x = {
{1, 2},
{3, 4},
{2.5, 4},
{1.5, 2.5},
{3, 5},
{2.8, 4.5},
{2.5, 4.5},
{1.2, 2.5},
{1, 3},
{1, 5},
{1, 2.5},
{5, 6},
{4, 3}
};
std::vector<int> labels = db.fitPredict(x);
std::cout << "X Y Cluster\n";
for(int i = 0; i < x.size(); i++) {
std::cout << x[i][0] << " " << x[i][1] << " " << labels[i] << "\n";
}
```
26 changes: 26 additions & 0 deletions examples/cluster/DBSCAN.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
//#include "../../src/slowmokit/methods/cluster/DBSCAN/DBSCAN.cpp"
//
//int main() {
// DBSCAN<double> db(0.6, 4);
// std::vector<std::vector<double>> x = {
// {1, 2},
// {3, 4},
// {2.5, 4},
// {1.5, 2.5},
// {3, 5},
// {2.8, 4.5},
// {2.5, 4.5},
// {1.2, 2.5},
// {1, 3},
// {1, 5},
// {1, 2.5},
// {5, 6},
// {4, 3}
// };
// std::vector<int> labels = db.fitPredict(x);
// std::cout << "X Y Cluster\n";
// for(int i = 0; i < x.size(); i++) {
// std::cout << x[i][0] << " " << x[i][1] << " " << labels[i] << "\n";
// }
// return 0;
//}
1 change: 1 addition & 0 deletions src/slowmokit.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,6 @@
#include "slowmokit/methods/neighbors/bernoulli_nb.hpp"
#include "slowmokit/methods/neighbors/gaussian_nb.hpp"
#include "slowmokit/methods/neighbors/knn.hpp"
#include "slowmokit/methods/cluster/DBSCAN.hpp"

#endif // SLOWMOKIT_HPP
12 changes: 12 additions & 0 deletions src/slowmokit/methods/cluster/DBSCAN.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
/**
* @file methods/cluster/DBSCAN.hpp
*
* Easy include for DBSCAN algorithm
*/

#ifndef SLOWMOKIT_DBSCAN_HPP
#define SLOWMOKIT_DBSCAN_HPP

#include "DBSCAN/DBSCAN.hpp"

#endif //SLOWMOKIT_DBSCAN_HPP
88 changes: 88 additions & 0 deletions src/slowmokit/methods/cluster/DBSCAN/DBSCAN.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
/**
* @file methods/cluster/DBSCAN/DBSCAN.cpp
*
* Implementation of the DBSCAN class
*/

#include "DBSCAN.hpp"

/**
 * Creates a DBSCAN model from a neighbourhood radius and a density threshold.
 *
 * @param eps maximum distance at which two points count as neighbours
 * @param minSamples number of neighbours a point needs to be a core point
 * @throws std::invalid_argument when either argument is negative
 */
template<class T>
DBSCAN<T>::DBSCAN(long double eps, int minSamples)
    : eps(eps), minSamples(minSamples)
{
  // Inside the body the names refer to the parameters, which shadow the members.
  const bool hasNegativeArgument = (eps < 0) || (minSamples < 0);
  if (hasNegativeArgument) {
    throw std::invalid_argument("Values can't be negative");
  }
}

/**
 * Computes the euclidean distance between two feature vectors.
 *
 * Each coordinate is widened to long double BEFORE the subtraction. Subtracting
 * in T first would wrap around for unsigned T (e.g. 1u - 2u) and could overflow
 * for narrow signed integers, producing a wildly wrong distance.
 *
 * @param p1 the first feature vector
 * @param p2 the second feature vector
 * @return the euclidean distance between p1 and p2
 * @throws std::invalid_argument when the vectors differ in length
 */
template<class T>
long double DBSCAN<T>::euclideanDistance(std::vector<T> p1, std::vector<T> p2) {
  if (p1.size() != p2.size()) {
    throw std::invalid_argument("Feature vectors are unequal in size");
  }

  long double sumOfSquares = 0.0L;
  // Index with the container's own size type to avoid a narrowing size_t -> int conversion.
  for (typename std::vector<T>::size_type d = 0; d < p1.size(); d++) {
    const long double delta =
        static_cast<long double>(p1[d]) - static_cast<long double>(p2[d]);
    sumOfSquares += delta * delta;
  }
  return sqrtl(sumOfSquares);
}

/**
 * Assigns `label` to point `i` and to every still-unlabelled point reachable
 * from it by repeatedly expanding core points.
 *
 * The expansion uses an explicit work stack instead of recursion, so the call
 * depth no longer grows with the size of the cluster. A point is labelled
 * before it is pushed, so each point enters the stack at most once.
 *
 * @param i index of the point the expansion starts from
 * @param core per-point flag, non-zero when the point is a core point
 * @param neighbours per-point neighbour indices (only read for core points)
 * @param label cluster label written to every point reached
 */
template<class T>
void DBSCAN<T>::cluster(int i, std::vector<int> &core, std::vector<std::vector<int>> &neighbours, int &label) {
  // Already owned by a cluster: nothing to do.
  if (labels[i] != -1) {
    return;
  }

  labels[i] = label;
  std::vector<int> pending(1, i);

  while (!pending.empty()) {
    const int current = pending.back();
    pending.pop_back();

    // Non-core (border) points keep their label but are not expanded further.
    if (core[current] == 0) {
      continue;
    }
    for (int next : neighbours[current]) {
      if (labels[next] == -1) {
        labels[next] = label;
        pending.push_back(next);
      }
    }
  }
}

/**
 * Fits the model: finds core points, then grows one cluster per unlabelled
 * core point. Results are stored in `labels`; points never reached by any
 * cluster keep the value -1.
 *
 * Each unordered pair of points is measured once and recorded on both sides,
 * instead of measuring (i, j) and (j, i) separately. Neighbour lists are built
 * in place, so no per-point temporary list is copied.
 *
 * @param x list of feature vectors to be clustered
 * @throws std::invalid_argument (from euclideanDistance) when two feature
 *         vectors differ in length
 */
template<class T>
void DBSCAN<T>::fit(std::vector<std::vector<T>> x) {
  const int n = static_cast<int>(x.size());

  // Reset state from any previous fit; -1 means "not assigned to a cluster".
  labels = std::vector<int>(n, -1);

  // neighbours[p] ends up sorted ascending: indices below p are appended while
  // the outer loop is at those indices, indices above p while it is at p.
  std::vector<std::vector<int>> neighbours(n, std::vector<int>());
  for (int i = 0; i < n; i++) {
    for (int j = i + 1; j < n; j++) {
      if (euclideanDistance(x[i], x[j]) <= eps) {
        neighbours[i].push_back(j);
        neighbours[j].push_back(i);
      }
    }
  }

  // A point is a core point when it has at least minSamples neighbours; the
  // point itself is never counted. Lists of non-core points are emptied so
  // that only core points carry neighbours.
  std::vector<int> core(n, 0);
  for (int i = 0; i < n; i++) {
    const int samples = static_cast<int>(neighbours[i].size());
    if (samples >= minSamples) {
      core[i] = 1;
    } else {
      neighbours[i].clear();
    }
  }

  // Every core point that is still unlabelled seeds a new cluster.
  int clusters = 0;
  for (int i = 0; i < n; i++) {
    if (core[i] == 0 || labels[i] != -1) {
      continue;
    }
    cluster(i, core, neighbours, clusters);
    clusters++;
  }
}

/**
 * Convenience wrapper: runs fit() on the given data and hands back a copy of
 * the resulting labels.
 *
 * @param x list of feature vectors
 * @return label of every data point, in input order
 */
template<class T>
std::vector<int> DBSCAN<T>::fitPredict(std::vector<std::vector<T>> x) {
  this->fit(x);
  return this->labels;
}

/**
 * Accessor for the labels produced by the most recent fit.
 *
 * @return a copy of the stored label vector
 */
template<class T>
std::vector<int> DBSCAN<T>::getLabels() {
  return this->labels;
}
82 changes: 82 additions & 0 deletions src/slowmokit/methods/cluster/DBSCAN/DBSCAN.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
/**
* @file methods/cluster/DBSCAN/DBSCAN.hpp
*
* The header file for DBSCAN
*/
// NOTE: this guard must differ from the one in the easy-include wrapper
// (methods/cluster/DBSCAN.hpp, which uses SLOWMOKIT_DBSCAN_HPP). With identical
// guards the wrapper defines the macro before including this file, and the
// whole class declaration below is skipped.
#ifndef SLOWMOKIT_DBSCAN_DBSCAN_HPP
#define SLOWMOKIT_DBSCAN_DBSCAN_HPP

#include "core.hpp"

/**
 * Class carrying implementation of the DBSCAN clustering algorithm
 * @tparam T type of the data to be clustered
 */
template<class T>
class DBSCAN
{
 private:

  /**
   * Measure of how close a point should be to be considered in the vicinity of another point, default value is 0.5
   */
  long double eps;

  /**
   * Minimum number of points that should lie in the vicinity of a point for it to be considered a core point, default value is 5
   */
  int minSamples;

  /**
   * Labels assigned to each data point after fitting, the values range from 0 to clusters - 1, outliers are assigned -1
   */
  std::vector<int> labels;

  /**
   * Evaluates the euclidean distance between two feature vectors
   * @param p1 the first feature vector
   * @param p2 the second feature vector
   * @return the euclidean distance between the two vectors
   * @throws invalid_argument exception when the feature vectors are unequal in size
   */
  long double euclideanDistance(std::vector<T> p1, std::vector<T> p2);

  /**
   * Helper function for clustering the points using DBSCAN
   * @param i index of the point that is to be assigned a cluster
   * @param core vector of flags (non-zero means core point) indicating whether a point is a core point or not
   * @param neighbours 2D vector carrying neighbours of each of the core points
   * @param label label of the cluster to be assigned to this point
   */
  void cluster(int i, std::vector<int> &core, std::vector<std::vector<int>> &neighbours, int &label);

 public:

  /**
   * Constructor for creating an instance of the DBSCAN class
   * @param eps measure of how close a point should be to be considered in the vicinity of another point, default is 0.5
   * @param minSamples minimum number of points that should lie in the vicinity of a point for it to be considered a core point, default is 5
   * @throws invalid_argument exception when eps or minSamples is less than 0
   */
  DBSCAN(long double eps = 0.5, int minSamples = 5);

  /**
   * Fits and clusters the given training set
   * @param x list of feature vectors to be clustered
   */
  void fit(std::vector<std::vector<T>> x);

  /**
   * Fits and clusters the given training set and returns the labels assigned to each data point
   * @param x list of feature vectors
   * @return vector of labels assigned to each data point
   */
  std::vector<int> fitPredict(std::vector<std::vector<T>> x);

  /**
   * Returns the labels assigned to each data point of the training set fitted into the model
   * @return vector of labels assigned to each data point
   */
  std::vector<int> getLabels();
};

#endif //SLOWMOKIT_DBSCAN_DBSCAN_HPP