Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
5f2e901
Checkpoint: Switching to DM UI project.
Oct 13, 2025
994e99f
Checkpoint: Switching to DM project.
Oct 14, 2025
ea6430f
Checkpoing: Switching to DM project.
Oct 16, 2025
cf0dbb5
Finish implementing major features for the cluster driver.
Oct 27, 2025
a861fb4
Upgrade memory handling in the cluster driver.
Oct 28, 2025
b4634f3
Begin adding query files to search for duplicates.
Oct 30, 2025
63a4dc2
Add warning for providing an invalid parameter.
Nov 7, 2025
22e55a3
Merge branch 'master' into dups
Nov 7, 2025
4b656a4
Improve exp_functions() to use central schema verification.
Nov 13, 2025
fa28afa
Add ClusterDriverRequirements (forgot to commit them before).
Nov 14, 2025
81a1d2f
Clean up unintended usage of glyph.h
Nov 14, 2025
e624d40
Attempt to reduce issues from ambiguously signed chars.
Nov 14, 2025
b0e000b
All tests now pass.
Nov 17, 2025
0874365
Re-apply reduced weight for duplicate pairs (temporarily turned off l…
Nov 17, 2025
01d918a
Clean up.
Nov 17, 2025
42a65f1
Update licences.
Nov 17, 2025
b281037
Clean up.
Nov 17, 2025
ee0bca7
Add "show_less" option to the cache method (skips printing uncomputed…
Nov 19, 2025
0c9eb2c
Update cluster library to use dynamic memory for any data over a coup…
Nov 19, 2025
394764e
Remove necessary requests for the driver name in objQueryFetch().
Nov 19, 2025
9b8cc19
Fix bugs that caused regressions after the updates to the cluster lib…
Nov 20, 2025
17156b7
Fix an invalid free (nmFree used instead of nmSysFree()).
Nov 20, 2025
648e30a
Merge branch 'master' into dups
Nov 20, 2025
29640a1
Minor improvements and clean up.
Nov 20, 2025
0fa62d3
Correct minor mistakes.
Nov 20, 2025
d3b571c
Merge branch 'master' into dups
Nov 22, 2025
06bae81
Implement a more extendable schema verification system.
Nov 21, 2025
13fd4b7
Replace old schema verification with the new system.
Nov 21, 2025
e83c15f
Expand the new schema verification system with extra data validation …
Nov 21, 2025
070cfe3
Clean up, bug fixes, and naming convention updates.
Nov 21, 2025
8795aaf
Add tests for log and power functions.
Nov 22, 2025
2e948d8
Add exp_fn_i_get_number().
Nov 22, 2025
4c347be
Add exp_fn_i_do_math() to bring the power of schema verification to l…
Nov 22, 2025
d177522
Minor clean up.
Lightning11wins Nov 25, 2025
7b49a5b
Address Greg's comments
Lightning11wins Dec 11, 2025
e9c10a5
Merge branch 'exp-schema' into dups
Lightning11wins Dec 11, 2025
b6abca7
Finish exp_functions.c work.
Lightning11wins Dec 11, 2025
8c86b5f
Organize docs.
Lightning11wins Dec 12, 2025
63fa5ba
Fix wrong stAddValue() info caused by reading old code.
Lightning11wins Dec 12, 2025
d0d4f54
Clean up stale TOODs.
Lightning11wins Dec 12, 2025
3b86627
Fix more styling mistakes.
Lightning11wins Dec 12, 2025
6b83c67
Fix indentation mistakes (thanks Centrallix Indent extension).
Lightning11wins Dec 15, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,4 @@ perf.data.old
.idea/
.vscode/
centrallix-os/tmp/*
centrallix-os/datasets/
2 changes: 1 addition & 1 deletion centrallix-doc/Widgets/widgets.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3731,7 +3731,7 @@ myTabControl "widget/tab"

<childproperty name="title" type="string">The title of the column to be displayed in the header row.</childproperty>

<childproperty name="type" type="string">The type of the column: "text", "check", or "image". "text" is a normal column, and displays the textual value of the data element. "check" displays a checkmark if the data is non-zero (integers) or for strings if the value is non-empty and not "N" or "No". "image" displays the image referred to by the pathname contained in the data value.</childproperty>
<childproperty name="type" type="string">The type of the column: "text", "check", "image", or "progress". "text" is a normal column, and displays the textual value of the data element. "check" displays a checkmark if the data is non-zero (integers) or for strings if the value is non-empty and not "N" or "No". "image" displays the image referred to by the pathname contained in the data value. "progress" displays a progress bar, with additional fields such as bar_color, bar_textcolor, and bar_padding.</childproperty>

<childproperty name="width" type="integer">width of the column.</childproperty>

Expand Down
4 changes: 2 additions & 2 deletions centrallix-lib/Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -63,10 +63,10 @@ CFLAGS=@CFLAGS@ @DEFS@ -Iinclude -DCXLIB_INTERNAL -DNM_USE_SYSMALLOC -Wall $(PRO
MTCFLAGS=@CFLAGS@ @DEFS@ -Iinclude -DCXLIB_INTERNAL -DNM_USE_SYSMALLOC -Wall $(PROFILE) $(COVERAGE) -g -O0
TCFLAGS=$(patsubst -DNDEBUG,,$(CFLAGS))

XSTATICFILES=mtask.o mtlexer.o memstr.o xarray.o xhash.o xstring.o mtsession.o newmalloc.o xhashqueue.o bdqs_transport.o xhandle.o xringqueue.o cxsec.o smmalloc.o qprintf.o strtcpy.o util.o
XSTATICFILES=mtask.o mtlexer.o memstr.o xarray.o xhash.o xstring.o mtsession.o newmalloc.o xhashqueue.o bdqs_transport.o xhandle.o xringqueue.o cxsec.o smmalloc.o clusters.o qprintf.o strtcpy.o util.o
STATICFILES=$(patsubst %,src/%,$(XSTATICFILES))

XDYNAMICFILES=mtask.lo mtlexer.lo memstr.lo xarray.lo xhash.lo xstring.lo mtsession.lo newmalloc.lo xhashqueue.lo bdqs_transport.lo xhandle.lo xringqueue.lo cxsec.lo smmalloc.lo qprintf.lo strtcpy.lo util.lo
XDYNAMICFILES=mtask.lo mtlexer.lo memstr.lo xarray.lo xhash.lo xstring.lo mtsession.lo newmalloc.lo xhashqueue.lo bdqs_transport.lo xhandle.lo xringqueue.lo cxsec.lo smmalloc.lo clusters.lo qprintf.lo strtcpy.lo util.lo
DYNAMICFILES=$(patsubst %,src/%,$(XDYNAMICFILES))

INCLUDEFILES:=$(wildcard include/*.h)
Expand Down
146 changes: 146 additions & 0 deletions centrallix-lib/include/clusters.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
#ifndef CLUSTERS_H
#define CLUSTERS_H

/************************************************************************/
/* Centrallix Application Server System */
/* Centrallix Core */
/* */
/* Copyright (C) 1998-2012 LightSys Technology Services, Inc. */
/* */
/* This program is free software; you can redistribute it and/or modify */
/* it under the terms of the GNU General Public License as published by */
/* the Free Software Foundation; either version 2 of the License, or */
/* (at your option) any later version. */
/* */
/* This program is distributed in the hope that it will be useful, */
/* but WITHOUT ANY WARRANTY; without even the implied warranty of */
/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
/* GNU General Public License for more details. */
/* */
/* You should have received a copy of the GNU General Public License */
/* along with this program; if not, write to the Free Software */
/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA */
/* 02111-1307 USA */
/* */
/* A copy of the GNU General Public License has been included in this */
/* distribution in the file "COPYING". */
/* */
/* Module: clusters.c, clusters.h */
/* Author: Israel Fuller */
/* Creation: September 29, 2025 */
/* Description: Clustering library used to cluster and search data with */
/* cosine similarity and Levenshtein similarity (aka. edit */
/* distance). Used by the "clustering driver". */
/* For more information on how to use this library, see */
/* string-similarity.md in the centrallix-sysdoc folder. */
/************************************************************************/

#include <stdlib.h>
#include <stdbool.h>

#ifdef CXLIB_INTERNAL
#include "xarray.h"
#else
#include "cxlib/xarray.h"
#endif

/*** This value defines the number of dimensions used for a sparse
*** vector. The higher the number, the fewer collisions will be
*** encountered when using these vectors for cosine comparisons.
*** This is also called the vector table size, if viewing the
*** vector as a hash table of character pairs.
***
*** 2147483629 is a prime number just below the signed int max
*** (2147483647). Using this value ensures that the longest run of 0s
*** will not cause an int underflow with the current encoding scheme.
***
*** Unfortunately, we can't use a number this large yet because the
*** kmeans algorithm creates densely allocated centroids with
*** `CA_NUM_DIMS` dimensions, so a large number causes it to fail.
*** Thus, we use 251 as the largest prime number less than 256,
*** giving us a decent balance between collision reduction and
*** kmeans centroid performance/memory overhead.
***/
#define CA_NUM_DIMS 251

/// LINK ../../centrallix-sysdoc/string_comparison.md#cosine_charsets
/*** The character used to create a pair with the first and last characters
 *** of a string. The expansion is fully parenthesized so that it behaves as
 *** a single operand wherever it is used (e.g. `sizeof CA_BOUNDARY_CHAR`).
 ***/
#define CA_BOUNDARY_CHAR ((unsigned char)('a' - 1))

/** Types. **/
typedef int* pVector; /* Sparse vector. */
typedef double* pCentroid; /* Dense centroid. */

/*** Size in bytes of one dense centroid (CA_NUM_DIMS doubles).
 ***
 *** The expansion is parenthesized so that operators with equal or higher
 *** precedence than `*` (such as `%` and `/`) apply to the whole size rather
 *** than only to CA_NUM_DIMS.
 ***/
#define pCentroidSize (CA_NUM_DIMS * sizeof(double))

/*** Duplicate information.
 ***
 *** Holds two opaque key pointers and one similarity value.
 ***/
typedef struct
    {
    void* key1;         /* First key. */
    void* key2;         /* Second key. */
    double similarity;  /* Similarity value stored alongside the two keys. */
    }
    Dup;

/** Pointer to a Dup record. **/
typedef Dup* pDup;

/*** Registers all defined types for debugging.
 ***
 *** Calls nmRegister() once for each of: pVector, pCentroid, a centroid of
 *** pCentroidSize bytes, and Dup.
 ***
 *** The calls are wrapped in do { } while (0) so that the macro behaves as
 *** a single statement; e.g. `if (debug) ca_init();` now performs either
 *** all four registrations or none of them. Invoke it as a statement with
 *** a trailing semicolon: `ca_init();`
 ***/
#define ca_init() \
    do \
        { \
        nmRegister(sizeof(pVector), "pVector"); \
        nmRegister(sizeof(pCentroid), "pCentroid"); \
        nmRegister(pCentroidSize, "Centroid"); \
        nmRegister(sizeof(Dup), "Dup"); \
        } \
    while (0)

/*** Edit distance function.
 ***
 *** NOTE(review): only the prototype is visible in this header. Per the file
 *** header this library provides Levenshtein similarity (aka. edit distance),
 *** so this presumably computes the edit distance between str1 and str2;
 *** confirm against the implementation.
 ***
 *** @param str1 The first string.
 *** @param str2 The second string.
 *** @param str1_length Presumably the length of str1 - TODO confirm whether
 *** 	a special value (e.g. 0) is accepted to request that it be computed.
 *** @param str2_length Presumably the length of str2 - TODO confirm as above.
 *** @returns An int; error/sentinel values are not visible from this header.
 ***/
int ca_edit_dist(const char* str1, const char* str2, const size_t str1_length, const size_t str2_length);

/*** Vector functions.
 ***
 *** These operate on pVector values (an int* described in this header as a
 *** "Sparse vector").
 ***
 *** NOTE(review): only prototypes are visible here. Presumably
 *** ca_build_vector() allocates a vector from str and ca_free_vector()
 *** releases it, ca_sparse_len() reports the vector's encoded length, and
 *** ca_print_vector() prints it for debugging - confirm ownership and
 *** failure behavior against the implementation.
 ***/
pVector ca_build_vector(const char* str);
unsigned int ca_sparse_len(const pVector vector);
void ca_print_vector(const pVector vector);
void ca_free_vector(pVector sparse_vector);

/*** Kmeans function.
 ***
 *** NOTE(review): the parameter descriptions below are inferred from names
 *** and types only; confirm each against the implementation.
 ***
 *** @param vectors Array of sparse vectors (pVector), presumably holding
 *** 	num_vectors entries.
 *** @param num_vectors Number of vectors.
 *** @param num_clusters Number of clusters.
 *** @param max_iter Presumably the maximum number of iterations to run.
 *** @param min_improvement Presumably a threshold used to stop iterating
 *** 	early - TODO confirm.
 *** @param labels Output array, presumably one cluster label per vector.
 *** @param vector_sims Presumably an output array with one similarity per
 *** 	vector - TODO confirm whether NULL is accepted.
 *** @returns An int; its meaning is not visible from this header.
 ***/
int ca_kmeans(
pVector* vectors,
const unsigned int num_vectors,
const unsigned int num_clusters,
const unsigned int max_iter,
const double min_improvement,
unsigned int* labels,
double* vector_sims);

/*** Vector helper macros.
 ***
 *** ca_is_empty(vector): true when the first element of the vector equals
 *** -CA_NUM_DIMS. The argument is parenthesized so that pointer expressions
 *** such as `ca_is_empty(vectors + i)` index the intended element.
 ***
 *** ca_has_no_pairs(vector): true when the first three elements are exactly
 *** -172, 11, and -78. The argument is evaluated once. Uses the GNU
 *** statement-expression and __typeof__ extensions.
 *** NOTE(review): 172 + 1 + 78 == 251, so these literals appear to be
 *** hard-coded for CA_NUM_DIMS == 251 and would presumably need updating if
 *** CA_NUM_DIMS changes - confirm against the vector encoding.
 ***/
#define ca_is_empty(vector) ((vector)[0] == -CA_NUM_DIMS)
#define ca_has_no_pairs(vector) \
    ({ \
    __typeof__ (vector) _v = (vector); \
    _v[0] == -172 && _v[1] == 11 && _v[2] == -78; \
    })

/*** Comparison functions (see ca_search()).
 ***
 *** ca_cos_compare() and ca_lev_compare() both have the signature
 *** `double (void*, void*)`, which matches the `similarity` callback
 *** parameter of the search functions declared in this header.
 ***
 *** NOTE(review): no function named ca_search() is declared in this header;
 *** the reference above presumably means ca_most_similar(),
 *** ca_sliding_search(), and ca_complete_search() - confirm.
 *** NOTE(review): presumably ca_cos_compare() expects pVector arguments and
 *** ca_lev_compare() expects C strings (per the parameter names) - confirm
 *** against the implementation, including the range of the returned value.
 ***/
double ca_cos_compare(void* v1, void* v2);
double ca_lev_compare(void* str1, void* str2);
bool ca_eql(pVector v1, pVector v2);

/*** Similarity search functions.
 ***
 *** Each function takes a `similarity` callback of the form
 *** `double (void*, void*)` which is applied to opaque data pointers.
 ***
 *** NOTE(review): the `const` qualifier on the callback's return type
 *** (`const double (*similarity)(...)`) has no effect on a returned value;
 *** consider dropping it here and in the definitions together.
 *** NOTE(review): only prototypes are visible here, so the following is
 *** inferred from names and must be confirmed against the implementation:
 ***  - ca_most_similar() presumably returns the entry of `data` most similar
 ***    to `target`, subject to `threshold`.
 ***  - ca_sliding_search() presumably compares entries within a window of
 ***    `window_size` neighbours; ca_complete_search() presumably compares
 ***    all pairs.
 ***  - `maybe_keys` is presumably optional (may be NULL), and `dups` is
 ***    presumably an optional caller-supplied array that is also returned.
 ***  - Failure return values are not visible from this header.
 ***/
void* ca_most_similar(
void* target,
void** data,
const unsigned int num_data,
const double (*similarity)(void*, void*),
const double threshold);
pXArray ca_sliding_search(
void** data,
const unsigned int num_data,
const unsigned int window_size,
const double (*similarity)(void*, void*),
const double dupe_threshold,
void** maybe_keys,
pXArray dups);
pXArray ca_complete_search(
void** data,
const unsigned int num_data,
const double (*similarity)(void*, void*),
const double dupe_threshold,
void** maybe_keys,
pXArray dups);

#endif /* End of .h file. */
79 changes: 79 additions & 0 deletions centrallix-lib/include/glyph.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
#ifndef GLYPH_H
#define GLYPH_H

/************************************************************************/
/* Centrallix Application Server System */
/* Centrallix Core */
/* */
/* Copyright (C) 1998-2012 LightSys Technology Services, Inc. */
/* */
/* This program is free software; you can redistribute it and/or modify */
/* it under the terms of the GNU General Public License as published by */
/* the Free Software Foundation; either version 2 of the License, or */
/* (at your option) any later version. */
/* */
/* This program is distributed in the hope that it will be useful, */
/* but WITHOUT ANY WARRANTY; without even the implied warranty of */
/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
/* GNU General Public License for more details. */
/* */
/* You should have received a copy of the GNU General Public License */
/* along with this program; if not, write to the Free Software */
/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA */
/* 02111-1307 USA */
/* */
/* A copy of the GNU General Public License has been included in this */
/* distribution in the file "COPYING". */
/* */
/* Module: glyph.h */
/* Author: Israel Fuller */
/* Creation: October 27, 2025 */
/* Description: A simple debug visualizer to make pretty patterns in */
/* developer's terminal which can be surprisingly useful */
/* for debugging algorithms. */
/************************************************************************/

#include <stdlib.h>

/** Uncomment to activate glyphs. **/
/** Should not be enabled in production code on the master branch. */
// #define ENABLE_GLYPHS

#ifdef ENABLE_GLYPHS
/** The enabled macros use printf(), fflush(), and bool. **/
#include <stdio.h>
#include <stdbool.h>

/*** Print a string to stdout.
 ***
 *** Expands to a single expression (no trailing semicolon), so it can be
 *** used anywhere a statement or expression is expected.
 ***
 *** @param s The string to print.
 ***/
#define glyph_print(s) printf("%s", (s))

/*** Initialize a simple debug visualizer to make pretty patterns in the
 *** developer's terminal. Great for when you need to run a long task and
 *** want a super simple way to make sure it's still working.
 ***
 *** @attention - Relies on storing data in variables in scope, so calling
 *** 	glyph() requires a call to glyph_init() previously in the same scope.
 *** @attention - Expands to declarations, so it must appear where
 *** 	declarations are allowed. The final declaration keeps its own
 *** 	semicolon, so a semicolon after the invocation is optional.
 ***
 *** @param name The symbol name of the visualizer.
 *** @param str The string printed for the visualization.
 *** @param interval The number of invocations of glyph() required to print.
 *** 	Must be nonzero (it is used as a modulus).
 *** @param flush Whether to flush on output.
 ***/
#define glyph_init(name, str, interval, flush) \
    const char* vis_##name##_str = (str); \
    const unsigned int vis_##name##_interval = (interval); \
    const bool vis_##name##_flush = (flush); \
    unsigned int vis_##name##_i = 0u;

/*** Invoke a visualizer.
 ***
 *** Increments the visualizer's counter and, on every interval-th call,
 *** prints its string (flushing stdout if requested). Wrapped in
 *** do { } while (0) so that it acts as one statement, which keeps
 *** `if (cond) glyph(name); else ...` well-formed.
 ***
 *** @param name The name of the visualizer to invoke.
 ***/
#define glyph(name) \
    do \
        { \
        if (++vis_##name##_i % vis_##name##_interval == 0) \
            { \
            glyph_print(vis_##name##_str); \
            if (vis_##name##_flush) fflush(stdout); \
            } \
        } \
    while (0)
#else
/** Glyphs are disabled: every macro expands to nothing. **/
#define glyph_print(s)
#define glyph_init(name, str, interval, flush)
#define glyph(name)
#endif

#endif /* End of .h file. */
Loading