diff --git a/.gitignore b/.gitignore index cbfe20f1..bddd6b09 100644 --- a/.gitignore +++ b/.gitignore @@ -62,3 +62,4 @@ perf.data.old .idea/ .vscode/ centrallix-os/tmp/* +centrallix-os/datasets/ diff --git a/centrallix-doc/Widgets/widgets.xml b/centrallix-doc/Widgets/widgets.xml index b6b50afd..f38f178d 100644 --- a/centrallix-doc/Widgets/widgets.xml +++ b/centrallix-doc/Widgets/widgets.xml @@ -3731,7 +3731,7 @@ myTabControl "widget/tab" The title of the column to be displayed in the header row. - The type of the column: "text", "check", or "image". "text" is a normal column, and displays the textual value of the data element. "check" displays a checkmark if the data is non-zero (integers) or for strings if the value is non-empty and not "N" or "No". "image" displays the image referred to by the pathname contained in the data value. + The type of the column: "text", "check", "image", or "progress". "text" is a normal column, and displays the textual value of the data element. "check" displays a checkmark if the data is non-zero (integers) or for strings if the value is non-empty and not "N" or "No". "image" displays the image referred to by the pathname contained in the data value. "progress" displays a progress bar, with additional fields such as bar_color, bar_textcolor, and bar_padding. width of the column. 
diff --git a/centrallix-lib/Makefile.in b/centrallix-lib/Makefile.in index a7197622..91b670e8 100644 --- a/centrallix-lib/Makefile.in +++ b/centrallix-lib/Makefile.in @@ -59,14 +59,14 @@ LIBS = @LIBS@ ## PROFILE=@PROFILE@ COVERAGE=@COVERAGE@ -CFLAGS=@CFLAGS@ @DEFS@ -Iinclude -DCXLIB_INTERNAL -DNM_USE_SYSMALLOC -Wall $(PROFILE) $(COVERAGE) -g +CFLAGS=@CFLAGS@ @DEFS@ -Iinclude -DCXLIB_INTERNAL -DNM_USE_SYSMALLOC -Wall $(PROFILE) $(COVERAGE) -g -lm MTCFLAGS=@CFLAGS@ @DEFS@ -Iinclude -DCXLIB_INTERNAL -DNM_USE_SYSMALLOC -Wall $(PROFILE) $(COVERAGE) -g -O0 TCFLAGS=$(patsubst -DNDEBUG,,$(CFLAGS)) -XSTATICFILES=mtask.o mtlexer.o memstr.o xarray.o xhash.o xstring.o mtsession.o newmalloc.o xhashqueue.o bdqs_transport.o xhandle.o xringqueue.o cxsec.o smmalloc.o qprintf.o strtcpy.o util.o +XSTATICFILES=mtask.o mtlexer.o memstr.o xarray.o xhash.o xstring.o mtsession.o newmalloc.o xhashqueue.o bdqs_transport.o xhandle.o xringqueue.o cxsec.o smmalloc.o clusters.o qprintf.o strtcpy.o util.o STATICFILES=$(patsubst %,src/%,$(XSTATICFILES)) -XDYNAMICFILES=mtask.lo mtlexer.lo memstr.lo xarray.lo xhash.lo xstring.lo mtsession.lo newmalloc.lo xhashqueue.lo bdqs_transport.lo xhandle.lo xringqueue.lo cxsec.lo smmalloc.lo qprintf.lo strtcpy.lo util.lo +XDYNAMICFILES=mtask.lo mtlexer.lo memstr.lo xarray.lo xhash.lo xstring.lo mtsession.lo newmalloc.lo xhashqueue.lo bdqs_transport.lo xhandle.lo xringqueue.lo cxsec.lo smmalloc.lo clusters.lo qprintf.lo strtcpy.lo util.lo DYNAMICFILES=$(patsubst %,src/%,$(XDYNAMICFILES)) INCLUDEFILES:=$(wildcard include/*.h) diff --git a/centrallix-lib/include/clusters.h b/centrallix-lib/include/clusters.h new file mode 100644 index 00000000..06c5075f --- /dev/null +++ b/centrallix-lib/include/clusters.h @@ -0,0 +1,137 @@ +#ifndef CLUSTERS_H +#define CLUSTERS_H + +/************************************************************************/ +/* Centrallix Application Server System */ +/* Centrallix Core */ +/* */ +/* Copyright (C) 1998-2012 LightSys Technology 
Services, Inc. */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA */ +/* 02111-1307 USA */ +/* */ +/* A copy of the GNU General Public License has been included in this */ +/* distribution in the file "COPYING". */ +/* */ +/* Module: lib_cluster.c, lib_cluster.h */ +/* Author: Israel Fuller */ +/* Creation: September 29, 2025 */ +/* Description: Clustering library used to cluster and search data with */ +/* cosine similarity and Levenshtein similarity (aka. edit */ +/* distance). Used by the "clustering driver". */ +/* For more information on how to use this library, see */ +/* string-similarity.md in the centrallix-sysdoc folder. */ +/************************************************************************/ + +#include +#include + +#ifdef CXLIB_INTERNAL +#include "xarray.h" +#else +#include "cxlib/xarray.h" +#endif + +/*** 2147483629 is the signed int max, and is also a prime number. + *** Using this value ensures that the longest run of 0s will not + *** cause an int underflow with the current encoding scheme. + *** + *** Unfortunately, we can't use a number this large yet because + *** kmeans algorithm creates densely allocated centroids with + *** `CA_NUM_DIMS` dimensions, so a large number causes it to fail. + ***/ +#define CA_NUM_DIMS 251 //2147483629 /* aka. 
The vector table size. */
+
+/// LINK ../../centrallix-sysdoc/string_comparison.md#cosine_charsets
+/** The character used to create a pair with the first and last characters of a string. **/
+#define CA_BOUNDARY_CHAR ((unsigned char)('a' - 1))
+
+/** Types. **/
+typedef int* pVector; /* Sparse vector. */
+typedef double* pCentroid; /* Dense centroid. */
+/** Size in bytes of one dense centroid (parenthesized so it composes safely in larger expressions). **/
+#define pCentroidSize (CA_NUM_DIMS * sizeof(double))
+
+/** Duplicate information. **/
+typedef struct
+    {
+    void* key1;
+    void* key2;
+    double similarity;
+    }
+    Dup, *pDup;
+
+/*** Registering all defined types for debugging.
+ ***
+ *** Wrapped in do/while(0) so that `ca_init();` behaves as one statement,
+ *** even as the unbraced body of an `if` or a loop.
+ ***/
+#define ca_init() \
+    do \
+        { \
+        nmRegister(sizeof(pVector), "pVector"); \
+        nmRegister(sizeof(pCentroid), "pCentroid"); \
+        nmRegister(pCentroidSize, "Centroid"); \
+        nmRegister(sizeof(Dup), "Dup"); \
+        } \
+    while (0)
+
+/** Edit distance function. **/
+int ca_edit_dist(const char* str1, const char* str2, const size_t str1_length, const size_t str2_length);
+
+/** Vector functions. **/
+pVector ca_build_vector(const char* str);
+unsigned int ca_sparse_len(const pVector vector);
+void ca_print_vector(const pVector vector);
+void ca_free_vector(pVector sparse_vector);
+
+/** Kmeans function. **/
+int ca_kmeans(
+    pVector* vectors,
+    const unsigned int num_vectors,
+    const unsigned int num_clusters,
+    const unsigned int max_iter,
+    const double min_improvement,
+    unsigned int* labels,
+    double* vector_sims);
+
+/** Vector helper macros. **/
+
+/** True when the vector is a single run of CA_NUM_DIMS zeros. **/
+#define ca_is_empty(vector) ((vector)[0] == -CA_NUM_DIMS)
+
+/*** True when the vector holds only the boundary/boundary character pair.
+ ***
+ *** The literals are the sparse encoding of that one pair: a run of 172 zeros
+ *** (the pair hashes to dimension 172), the value 11, then a run of 78 zeros.
+ *** They are only valid while CA_NUM_DIMS is 251 and CA_BOUNDARY_CHAR is
+ *** 'a' - 1; recompute them if either changes.
+ ***/
+#define ca_has_no_pairs(vector) \
+    ({ \
+    __typeof__ (vector) _v = (vector); \
+    _v[0] == -172 && _v[1] == 11 && _v[2] == -78; \
+    })
+
+/** Comparison functions (see ca_search()). **/
+double ca_cos_compare(void* v1, void* v2);
+double ca_lev_compare(void* str1, void* str2);
+bool ca_eql(pVector v1, pVector v2);
+
+/** Similarity search functions. 
**/
+/*** Prototype only; the definition is not visible from this header.
+ *** `similarity` is a callback taking two opaque pointers and returning a double.
+ *** NOTE(review): presumably returns the element of `data` most similar to
+ *** `target`, subject to `threshold` - confirm against clusters.c.
+ ***/
+void* ca_most_similar(
+    void* target,
+    void** data,
+    const unsigned int num_data,
+    const double (*similarity)(void*, void*),
+    const double threshold);
+/*** Prototype only; the definition is not visible from this header.
+ *** NOTE(review): presumably compares each element of `data` only with
+ *** neighbours inside `window_size` and records pairs in `dups` - confirm
+ *** the semantics of `maybe_keys` and the returned pXArray in clusters.c.
+ ***/
+pXArray ca_sliding_search(
+    void** data,
+    const unsigned int num_data,
+    const unsigned int window_size,
+    const double (*similarity)(void*, void*),
+    const double dupe_threshold,
+    void** maybe_keys,
+    pXArray dups);
+/*** Prototype only; same parameters as ca_sliding_search() minus `window_size`.
+ *** NOTE(review): presumably compares every pair of elements - confirm in clusters.c.
+ ***/
+pXArray ca_complete_search(
+    void** data,
+    const unsigned int num_data,
+    const double (*similarity)(void*, void*),
+    const double dupe_threshold,
+    void** maybe_keys,
+    pXArray dups);
+
+#endif /* End of .h file. */
diff --git a/centrallix-lib/include/glyph.h b/centrallix-lib/include/glyph.h
new file mode 100644
index 00000000..cfafd394
--- /dev/null
+++ b/centrallix-lib/include/glyph.h
@@ -0,0 +1,78 @@
+#ifndef GLYPH_H
+#define GLYPH_H
+
+/************************************************************************/
+/* Centrallix Application Server System */
+/* Centrallix Core */
+/* */
+/* Copyright (C) 1998-2012 LightSys Technology Services, Inc. */
+/* */
+/* This program is free software; you can redistribute it and/or modify */
+/* it under the terms of the GNU General Public License as published by */
+/* the Free Software Foundation; either version 2 of the License, or */
+/* (at your option) any later version. */
+/* */
+/* This program is distributed in the hope that it will be useful, */
+/* but WITHOUT ANY WARRANTY; without even the implied warranty of */
+/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
+/* GNU General Public License for more details. */
+/* */
+/* You should have received a copy of the GNU General Public License */
+/* along with this program; if not, write to the Free Software */
+/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA */
+/* 02111-1307 USA */
+/* */
+/* A copy of the GNU General Public License has been included in this */
+/* distribution in the file "COPYING". 
*/
+/* */
+/* Module: glyph.h */
+/* Author: Israel Fuller */
+/* Creation: October 27, 2025 */
+/* Description: A simple debug visualizer to make pretty patterns in */
+/* developer's terminal which can be surprisingly useful */
+/* for debugging algorithms. */
+/************************************************************************/
+
+#include
+
+/** Uncomment to activate glyphs. **/
+/** Should not be enabled in production code on the master branch. */
+// #define ENABLE_GLYPHS
+
+#ifdef ENABLE_GLYPHS
+/*** Print a string to stdout.
+ ***
+ *** NOTE(review): the expansion already ends in ';', so `glyph_print(x);`
+ *** produces an extra empty statement and cannot be the unbraced body of an
+ *** `if` that is followed by `else`. Confirm how callers terminate it before
+ *** removing the semicolon.
+ ***/
+#define glyph_print(s) printf("%s", s);
+/*** Initialize a simple debug visualizer to make pretty patterns in the
+ *** developer's terminal. Great for when you need to run a long task and
+ *** want a super simple way to make sure it's still working.
+ ***
+ *** @attention - Relies on storing data in variables in scope, so calling
+ *** glyph() requires a call to glyph_init() previously in the same scope.
+ *** The expansion is a sequence of declarations (vis_<name>_str,
+ *** vis_<name>_interval, vis_<name>_flush, vis_<name>_i), so it must appear
+ *** where declarations are allowed.
+ ***
+ *** @param name The symbol name of the visualizer.
+ *** @param str The string printed for the visualization.
+ *** @param interval The number of invocations of glyph() required to print.
+ *** @param flush Whether to flush on output.
+ ***/
+#define glyph_init(name, str, interval, flush) \
+    const char* vis_##name##_str = str; \
+    const unsigned int vis_##name##_interval = interval; \
+    const bool vis_##name##_flush = flush; \
+    unsigned int vis_##name##_i = 0u;
+
+/*** Invoke a visualizer.
+ ***
+ *** Increments the per-visualizer counter and, on every `interval`-th call,
+ *** prints the configured string and optionally flushes stdout.
+ ***
+ *** @attention - Expands to a bare `if` block (no do/while(0) wrapper), so do
+ *** not use it as the unbraced body of an `if` that has an `else`.
+ ***
+ *** @param name The name of the visualizer to invoke.
+ ***/
+#define glyph(name) \
+    if (++vis_##name##_i % vis_##name##_interval == 0) \
+        { \
+        glyph_print(vis_##name##_str); \
+        if (vis_##name##_flush) fflush(stdout); \
+        }
+#else
+/** Glyphs disabled: every macro expands to nothing. **/
+#define glyph_print(str)
+#define glyph_init(name, str, interval, flush)
+#define glyph(name)
+#endif
+
+#endif /* End of .h file. 
*/ diff --git a/centrallix-lib/include/util.h b/centrallix-lib/include/util.h index df4ba0d5..c2e36596 100644 --- a/centrallix-lib/include/util.h +++ b/centrallix-lib/include/util.h @@ -2,33 +2,187 @@ #define UTILITY_H /************************************************************************/ -/* Centrallix Application Server System */ -/* Centrallix Base Library */ -/* */ -/* Copyright (C) 1998-2011 LightSys Technology Services, Inc. */ -/* */ -/* You may use these files and this library under the terms of the */ -/* GNU Lesser General Public License, Version 2.1, contained in the */ -/* included file "COPYING". */ -/* */ -/* Module: (util.c,.h) */ -/* Author: Micah Shennum */ -/* Date: May 26, 2011 */ -/* Description: Collection of utilities */ +/* Centrallix Application Server System */ +/* Centrallix Base Library */ +/* */ +/* Copyright (C) 1998-2011 LightSys Technology Services, Inc. */ +/* */ +/* You may use these files and this library under the terms of the */ +/* GNU Lesser General Public License, Version 2.1, contained in the */ +/* included file "COPYING". */ +/* */ +/* Module: util.c, util.h */ +/* Author: Micah Shennum and Israel Fuller */ +/* Date: May 26, 2011 */ +/* Description: Collection of utilities including: */ +/* - Utilities for parsing numbers. */ +/* - The timer utility for benchmarking code. */ +/* - snprint_bytes() for formatting a byte count. */ +/* - snprint_llu() for formatting large numbers. */ +/* - fprint_mem() for printing memory stats. */ +/* - min() and max() for handling numbers. */ +/* - The check functions for reliably printing debug data. 
*/
 /************************************************************************/
 
 #ifdef __cplusplus
 extern "C" {
 #endif
-
 int strtoi(const char *nptr, char **endptr, int base);
 unsigned int strtoui(const char *nptr, char **endptr, int base);
+
+    /*** snprint_bytes() allows one to pick between CS units, where the kibibyte
+     *** (KiB) is 1024 bytes, and metric units where the kilobyte (KB) is 1000 bytes.
+     *** Fun Fact: Windows uses kibibytes, but displays them as KB.
+     ***/
+    #define UTIL_USE_METRIC false
+    /*** Formatting helpers (definitions are not in this header).
+     *** NOTE(review): presumably each writes into `buf` and returns it - confirm in util.c.
+     ***/
+    char* snprint_bytes(char* buf, const size_t buf_size, unsigned int bytes);
+    char* snprint_llu(char* buf, size_t buflen, unsigned long long value);
+    void fprint_mem(FILE* out);
+
+    /*** Timer state used by the timer_*() functions below.
+     *** NOTE(review): `start` presumably holds the last start timestamp and
+     *** `total` the accumulated elapsed time; units are not visible here - confirm in util.c.
+     ***/
+    typedef struct
+        {
+        double start, total;
+        }
+        Timer, *pTimer;
+
+    /*** Timer API (definitions are not in this header).
+     *** NOTE(review): the init/de_init and new/free pairs presumably operate on
+     *** caller-owned and heap-allocated timers respectively - confirm in util.c.
+     ***/
+    pTimer timer_init(pTimer timer);
+    pTimer timer_new(void);
+    pTimer timer_start(pTimer timer);
+    pTimer timer_stop(pTimer timer);
+    double timer_get(pTimer timer);
+    pTimer timer_reset(pTimer timer);
+    void timer_de_init(pTimer timer);
+    void timer_free(pTimer timer);
+
+    double round_to(double value, int decimals);
 
 #ifdef __cplusplus
 }
 #endif
 
-#endif /* UTILITY_H */
+#ifndef __cplusplus
+#include
+
+/*** TODO: Greg - Can we assume this code will always be compiled with GCC?
+ *** If not, then the __typeof__, __LINE__, and __FILE__ syntaxes might be a
+ *** portability concern.
+ ***/
+
+/*** @brief Returns the smaller of two values.
+ ***
+ *** @param a The first value.
+ *** @param b The second value.
+ *** @return The smaller of the two values.
+ ***
+ *** @note This macro uses GCC extensions to ensure type safety.
+ *** @note Each argument is evaluated exactly once (copied into _a / _b), and
+ ***       `a` is returned when the two compare equal.
+ ***/
+#define min(a, b) \
+    ({ \
+    __typeof__ (a) _a = (a); \
+    __typeof__ (b) _b = (b); \
+    (_a <= _b) ? _a : _b; \
+    })
+
+/*** @brief Returns the larger of two values.
+ ***
+ *** @param a The first value.
+ *** @param b The second value.
+ *** @return The larger of the two values.
+ ***
+ *** @note This macro uses GCC extensions to ensure type safety. 
+ ***/
+#define max(a, b) \
+    ({ \
+    __typeof__ (a) _a = (a); \
+    __typeof__ (b) _b = (b); \
+    (_a >= _b) ? _a : _b; \
+    })
+
+/*** File name macro, expanding functionality like __FILE__ and __LINE__.
+ *** Yields the part of __FILE__ after the last '/', or __FILE__ itself when
+ *** it contains no '/'. The strrchr() call runs at each use.
+ ***/
+#define __FILENAME__ \
+    ({ \
+    const char* last_directory = strrchr(__FILE__, '/'); \
+    ((last_directory != NULL) ? last_directory + 1 : __FILE__); \
+    })
+
+/** Error Handling. **/
+/*** Diagnostic printer used by the check macros below (defined outside this header).
+ *** The macros pass the stringified expression as `function_name`.
+ *** NOTE(review): presumably also reports errno - confirm in util.c.
+ ***/
+void print_err(int code, const char* function_name, const char* file_name, const int line_number);
+/*** Ensures that developer diagnostics are printed if the result of the
+ *** passed function call is not zero. Not intended for user errors.
+ ***
+ *** The expression is evaluated exactly once, after errno has been cleared.
+ ***
+ *** @param result The result of the function we're checking.
+ *** @returns Whether the passed function succeeded.
+ ***/
+#define check(result) \
+    ({ \
+    errno = 0; /* Reset errno to prevent confusion. */ \
+    __typeof__ (result) _r = (result); \
+    const bool success = (_r == 0); \
+    if (!success) print_err(_r, #result, __FILE__, __LINE__); \
+    success; \
+    })
+
+/*** Ensures that developer diagnostics are printed if the result of the
+ *** passed function call is negative. Not intended for user errors.
+ ***
+ *** @param result The result of the function we're checking.
+ *** @returns Whether the passed function succeeded.
+ ***/
+#define check_neg(result) \
+    ({ \
+    errno = 0; /* Reset errno to prevent confusion. */ \
+    __typeof__ (result) _r = (result); \
+    const bool success = (_r >= 0); \
+    if (!success) print_err(_r, #result, __FILE__, __LINE__); \
+    success; \
+    })
+
+/*** Ensures that developer diagnostics are printed if the result of the
+ *** passed function call is -1. Not intended for user errors.
+ ***
+ *** @param result The result of the function we're checking.
+ *** @returns Whether the passed function succeeded.
+ ***/
+#define check_weak(result) \
+    ({ \
+    errno = 0; /* Reset errno to prevent confusion. */ \
+    __typeof__ (result) _r = (result); \
+    const bool success = (_r != -1); \
+    if (!success) print_err(_r, #result, __FILE__, __LINE__); \
+    success; \
+    })
+
+/*** Ensures that developer diagnostics are printed if the result of the
+ *** passed function call is a NAN double. Not intended for user errors.
+ *** Unlike check(), the value itself is returned and print_err() receives
+ *** 0 as its code.
+ ***
+ *** @param result The result of the function we're checking.
+ *** @returns result
+ ***/
+#define check_double(result) \
+    ({ \
+    errno = 0; /* Reset errno to prevent confusion. */ \
+    __typeof__ (result) _r = (result); \
+    if (isnan(_r)) print_err(0, #result, __FILE__, __LINE__); \
+    _r; \
+    })
+
+/*** Ensures that developer diagnostics are printed if the result of the
+ *** passed function call is a NULL pointer. Not intended for user errors.
+ *** The pointer is returned unchanged, so callers must still test it for NULL.
+ ***
+ *** @param result The result of the function we're checking.
+ *** @returns result
+ ***/
+#define check_ptr(result) \
+    ({ \
+    errno = 0; /* Reset errno to prevent confusion. */ \
+    __typeof__ (result) _r = (result); \
+    if (_r == NULL) print_err(0, #result, __FILE__, __LINE__); \
+    _r; \
+    })
+
+#endif /* __cplusplus */
+
+#endif /* UTILITY_H */
diff --git a/centrallix-lib/include/xhash.h b/centrallix-lib/include/xhash.h
index 1b5d8459..65b90057 100644
--- a/centrallix-lib/include/xhash.h
+++ b/centrallix-lib/include/xhash.h
@@ -1,7 +1,6 @@
 #ifndef _XHASH_H
 #define _XHASH_H
 
-
 /************************************************************************/
 /* Centrallix Application Server System */
 /* Centrallix Base Library */
@@ -55,6 +54,7 @@ int xhAdd(pXHashTable this, char* key, char* data);
 int xhRemove(pXHashTable this, char* key);
 char* xhLookup(pXHashTable this, char* key);
 int xhClear(pXHashTable this, int (*free_fn)(), void* free_arg);
+int xhForEach(pXHashTable this, int (*callback_fn)(pXHashEntry, void*), void* each_arg);
+int xhClearKeySafe(pXHashTable this, void (*free_fn)(pXHashEntry, void*), void* free_arg);
 
 #endif /* _XHASH_H */
-
diff --git a/centrallix-lib/src/clusters.c
b/centrallix-lib/src/clusters.c new file mode 100644 index 00000000..62191c89 --- /dev/null +++ b/centrallix-lib/src/clusters.c @@ -0,0 +1,1080 @@ +/************************************************************************/ +/* Centrallix Application Server System */ +/* Centrallix Core */ +/* */ +/* Copyright (C) 1998-2012 LightSys Technology Services, Inc. */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA */ +/* 02111-1307 USA */ +/* */ +/* A copy of the GNU General Public License has been included in this */ +/* distribution in the file "COPYING". */ +/* */ +/* Module: lib_cluster.c, lib_cluster.h */ +/* Author: Israel Fuller */ +/* Creation: September 29, 2025 */ +/* Description: Clustering library used to cluster and search data with */ +/* cosine similarity and Levenshtein similarity (aka. edit */ +/* distance). Used by the "clustering driver". */ +/* For more information on how to use this library, see */ +/* string-similarity.md in the centrallix-sysdoc folder. */ +/************************************************************************/ + +/** This file has additional documentation in string_similarity.md. 
**/
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "clusters.h"
+#include "newmalloc.h"
+#include "util.h"
+#include "xarray.h"
+
+/*** Gets the hash representing a pair of ASCII characters, each passed as an
+ *** unsigned char. The result is reduced modulo CA_NUM_DIMS, so it is always
+ *** a valid dimension index in the range [0, CA_NUM_DIMS).
+ *** Thank you to professor John Delano for this hashing algorithm.
+ ***
+ *** @param c1 The first character in the pair.
+ *** @param c2 The second character in the pair.
+ *** @returns The resulting hash.
+ ***/
+static unsigned int hash_char_pair(const unsigned char c1, const unsigned char c2)
+    {
+    const double sum = (c1 * c1 * c1) + (c2 * c2 * c2);
+    const double scale = ((double)c1 + 1.0) / ((double)c2 + 1.0);
+    const unsigned int hash = (unsigned int)round(sum * scale) - 1u;
+    return hash % CA_NUM_DIMS;
+    }
+
+/*** An internal struct for temporarily storing character pairs while building
+ *** sparse vectors.
+ ***
+ *** @param c1 The first character in the character pair.
+ *** @param c2 The second character in the character pair.
+ *** @param hash The hash for the two characters, calculated by calling the
+ *** 	hash_char_pair() function (above).
+ ***/
+typedef struct
+    {
+    unsigned char c1, c2;
+    unsigned int hash;
+    }
+    CharPair, *pCharPair;
+
+/*** Internal function to compare two character pairs to allow us to sort them
+ *** by hash (ascending). Suitable as a qsort() comparator.
+ ***
+ *** The result is computed from two comparisons rather than by subtracting
+ *** the unsigned hashes, so it never depends on unsigned wraparound or on how
+ *** an out-of-range value converts to int.
+ ***
+ *** @param p1 The first pCharPair.
+ *** @param p2 The second pCharPair.
+ *** @returns An int > 0 if p1's hash is larger.
+ ***          An int < 0 if p2's hash is larger.
+ ***          0 if p1 and p2 have identical hashes.
+ ***/
+static int charpair_cmp(const void *p1, const void *p2)
+    {
+    const CharPair *a = p1, *b = p2;
+    return (a->hash > b->hash) - (a->hash < b->hash);
+    }
+
+/*** Builds a vector using a string.
+ ***
+ *** Vectors are based on the frequencies of character pairs in the string.
+ *** Space characters and punctuation characters (see code for list) are ignored,
+ *** and all characters are converted to lowercase. 
Character 96, which is just
+ *** before 'a' in the ASCII table (and maps to '`') is used to make pairs on the
+ *** start and end of strings. The only supported characters for the passed char*
+ *** are spaces, punctuation, uppercase and lowercase letters, and numbers.
+ ***
+ *** This results in the following modified ASCII table:
+ *** ```csv
+ *** #,   char, #,   char, #,   char
+ *** 97,  a,    109, m,    121, y
+ *** 98,  b,    110, n,    122, z
+ *** 99,  c,    111, o,    123, 0
+ *** 100, d,    112, p,    124, 1
+ *** 101, e,    113, q,    125, 2
+ *** 102, f,    114, r,    126, 3
+ *** 103, g,    115, s,    127, 4
+ *** 104, h,    116, t,    128, 5
+ *** 105, i,    117, u,    129, 6
+ *** 106, j,    118, v,    130, 7
+ *** 107, k,    119, w,    131, 8
+ *** 108, l,    120, x,    132, 9
+ *** ```
+ *** Thus, any number from 96 (the start/end character) to 132 ('9') is a valid
+ *** input to hash_char_pair().
+ ***
+ *** After hashing each character pair, we add some number from 1 to 13 to the
+ *** corresponding dimension. However, for most names, this results in a lot of
+ *** zeros and a FEW positive numbers. Thus, the vector is stored sparsely: a
+ *** negative number replaces a run of that many zeros. Consider the following
+ *** example:
+ ***
+ *** Dense pVector: `[1,0,0,0,3,0]`
+ ***
+ *** Sparse pVector: `[1,-3,3,-1]`
+ ***
+ *** Using these sparse vectors greatly reduces the required memory and gives
+ *** approximately an x5 boost to performance when traversing vectors, at the
+ *** cost of more algorithmically complex code.
+ ***
+ *** @param str The string to be divided into pairs and hashed to make the vector.
+ *** @returns The sparse vector built using the hashed character pairs, or NULL
+ *** 	if str is NULL or an allocation fails. Release it with ca_free_vector().
+ ***/
+pVector ca_build_vector(const char* str)
+    {
+    /** Guard against a segfault. **/
+    if (str == NULL) return NULL;
+
+    /** Allocate memory: every input character plus the two boundary characters. **/
+    unsigned int num_chars = 0u;
+    unsigned char* chars = check_ptr(nmSysMalloc((strlen(str) + 2u) * sizeof(unsigned char)));
+    if (chars == NULL) goto err;
+
+    /** Begin adding char pairs (in order). **/
+    chars[num_chars++] = CA_BOUNDARY_CHAR; /* Starting boundary character. */
+    for (const char* char_ptr = str; *char_ptr != '\0'; char_ptr++)
+        {
+        char maybe_char = *char_ptr;
+        if (maybe_char < 0) fprintf(stderr, "Warning: Unexpected negative char '%c' in string: \"%s\"\n", maybe_char, str);
+        unsigned char c = (unsigned char)maybe_char;
+
+        /*** Ignore insignificant characters: spaces (space, \n, \v, \f, \r, \t)
+         *** and punctuation. The boundary character is itself punctuation,
+         *** but it is always kept when it appears in the string.
+         ***/
+        if (c != CA_BOUNDARY_CHAR && (isspace(c) || ispunct(c))) continue;
+
+        /** Shift numbers to the end of the lowercase letters. **/
+        if ('0' <= c && c <= '9') c += 75u;
+
+        /** Store the character. **/
+        chars[num_chars++] = tolower(c);
+        }
+    chars[num_chars++] = CA_BOUNDARY_CHAR; /* Ending boundary character. */
+
+    /** Compute hash values for char pairs. **/
+    CharPair* char_pairs = check_ptr(nmSysMalloc(num_chars * sizeof(CharPair)));
+    if (char_pairs == NULL) goto err_free_chars;
+    const unsigned int num_pairs = num_chars - 1u;
+    for (unsigned int i = 0u; i < num_pairs; i++)
+        {
+        /** Store characters. **/
+        char_pairs[i].c1 = chars[i];
+        char_pairs[i].c2 = chars[i + 1];
+
+        /** Hash the character pair into an index (dimension). **/
+        /** Note that the passed values should always be between 96 (boundary) and 132 ('9'). **/
+        char_pairs[i].hash = hash_char_pair(chars[i], chars[i + 1]);
+        }
+
+    /** Free unused memory. **/
+    nmSysFree(chars);
+    chars = NULL;
+
+    /** Sort char_pairs by hash value. **/
+    qsort(char_pairs, num_pairs, sizeof(CharPair), charpair_cmp);
+
+    /*** Allocate space for the sparse vector. Worst case: every pair has a
+     *** unique hash and needs a zero run plus a value, plus one trailing run.
+     ***/
+    pVector sparse_vector = check_ptr(nmSysMalloc((num_pairs * 2u + 1u) * sizeof(int)));
+    if (sparse_vector == NULL) goto err_free_char_pairs;
+
+    /** Build the sparse vector. **/
+    unsigned int cur = 0u, dim = 0u;
+    for (unsigned int i = 0u; i < num_pairs;)
+        {
+        unsigned int hash = char_pairs[i].hash;
+
+        /** Proceed through the pairs until we find a unique hash. **/
+        /** Dividing value by 2 each time reduces the impact of repeated pairs. **/
+        int value = 0;
+        for (; i < num_pairs && char_pairs[i].hash == hash; i++)
+            {
+            value /= 2; /* Reduce impact of repeated pairs. */
+            value += ((unsigned int)char_pairs[i].c1 + (unsigned int)char_pairs[i].c2) % 13u + 1u;
+            }
+
+        /** Skip zeros to reach the dimension index specified by the hash. **/
+        unsigned int num_zeros = hash - dim;
+        if (num_zeros > 0u)
+            {
+            /** Convert before negating: negating the unsigned count would wrap. **/
+            sparse_vector[cur++] = -(int)num_zeros;
+            dim = hash;
+            }
+
+        /** Add the value to the sparse vector. **/
+        sparse_vector[cur++] = value;
+        dim++;
+        }
+
+    /** Pad the remaining dimensions with a final run of zeros. **/
+    if (dim != CA_NUM_DIMS) sparse_vector[cur++] = -(int)(CA_NUM_DIMS - dim);
+
+    /** Free unused memory. **/
+    nmSysFree(char_pairs);
+    char_pairs = NULL;
+
+    /** Trim extra space wasted by identical hashes. **/
+    pVector trimmed_sparse_vector = check_ptr(nmSysRealloc(sparse_vector, cur * sizeof(int)));
+    if (trimmed_sparse_vector == NULL) goto err_free_sparse_vector;
+    sparse_vector = NULL; /* Mark memory freed by nmSysRealloc() no longer valid. */
+
+    /** Return the result. **/
+    return trimmed_sparse_vector;
+
+    /** Error cleanup: each label falls through to release everything allocated earlier. **/
+    err_free_sparse_vector:
+    if (sparse_vector != NULL) nmSysFree(sparse_vector);
+
+    err_free_char_pairs:
+    if (char_pairs != NULL) nmSysFree(char_pairs);
+
+    err_free_chars:
+    if (chars != NULL) nmSysFree(chars);
+
+    err:
+    return NULL;
+    }
+
+/*** Free memory allocated to store a sparse vector.
+ ***
+ *** Passing NULL is a no-op, so the NULL that ca_build_vector() returns on
+ *** failure can be handed straight back to this function.
+ ***
+ *** @param sparse_vector The sparse vector being freed (may be NULL).
+ ***/
+void ca_free_vector(pVector sparse_vector)
+    {
+    if (sparse_vector == NULL) return;
+    nmSysFree(sparse_vector);
+    }
+
+/*** Compute the length of a sparsely allocated vector. 
+ ***
+ *** Walks the tokens until they account for all CA_NUM_DIMS dimensions.
+ ***
+ *** @param vector The sparse vector to measure.
+ *** @returns The number of int tokens stored in the vector.
+ ***/
+unsigned int ca_sparse_len(const pVector vector)
+    {
+    unsigned int len = 0u, covered = 0u;
+    while (covered < CA_NUM_DIMS)
+        {
+        const int token = vector[len];
+        len++;
+
+        /** A negative token stands for -token zeros; any other token fills one dimension. **/
+        covered += (token < 0) ? (unsigned)(-token) : 1u;
+        }
+    return len;
+    }
+
+/*** Print the raw tokens of a sparsely allocated vector to stdout (for
+ *** debugging). No trailing newline is written.
+ ***
+ *** @param vector The sparse vector to print.
+ ***/
+void ca_print_vector(const pVector vector)
+    {
+    const unsigned int count = ca_sparse_len(vector);
+    unsigned int idx = 0u;
+
+    /** The first token opens the list; the rest are comma separated. **/
+    printf("Vector: [%d", vector[idx]);
+    while (++idx < count)
+        {
+        printf(", %d", vector[idx]);
+        }
+    printf("]");
+    }
+
+/*** Compute the magnitude (Euclidean norm) of a sparsely allocated vector.
+ ***
+ *** @param vector The sparse vector.
+ *** @returns The square root of the sum of the squared values.
+ ***/
+static double magnitude_sparse(const pVector vector)
+    {
+    unsigned int sum_of_squares = 0u, covered = 0u;
+    const int* cursor = vector;
+    while (covered < CA_NUM_DIMS)
+        {
+        const int token = *cursor++;
+        if (token >= 0)
+            {
+            /** A stored value: add its square and advance one dimension. **/
+            sum_of_squares += (unsigned)(token * token);
+            covered++;
+            }
+        else
+            {
+            /** A run of -token zeros contributes nothing; just skip it. **/
+            covered += (unsigned)(-token);
+            }
+        }
+    return sqrt((double)sum_of_squares);
+    }
+
+/*** Compute the magnitude (Euclidean norm) of a densely allocated centroid.
+ ***
+ *** @param centroid The centroid, holding CA_NUM_DIMS doubles.
+ *** @returns The square root of the sum of the squared components.
+ ***/
+static double magnitude_dense(const pCentroid centroid)
+    {
+    double sum_of_squares = 0.0;
+    const double* const end = centroid + CA_NUM_DIMS;
+    for (const double* component = centroid; component != end; component++)
+        {
+        sum_of_squares += (*component) * (*component);
+        }
+    return sqrt(sum_of_squares);
+    }
+
+/*** Parse a token from a sparsely allocated vector and write the param_value and
+ *** number of remaining values to the passed locations. 
+ ***
+ *** @param token The sparse vector token being parsed.
+ *** @param remaining The location to save the number of dimensions the token covers.
+ *** @param param_value The location to save the value of the token (0 for a run of zeros).
+ ***/
+static void parse_vector_token(const int token, unsigned int* remaining, unsigned int* param_value)
+    {
+    if (token < 0)
+        {
+        /** This run contains -token zeros. **/
+        *remaining = (unsigned)(-token);
+        *param_value = 0u;
+        }
+    else
+        {
+        /** This run contains one param_value. **/
+        *remaining = 1u;
+        *param_value = (unsigned)(token);
+        }
+    }
+
+/*** Calculate the similarity on sparsely allocated vectors. Comparing
+ *** any string to an empty string should always return 0.5 (untested).
+ ***
+ *** @param v1 Sparse vector #1.
+ *** @param v2 Sparse vector #2.
+ *** @returns Similarity between 0 and 1 where
+ ***     1 indicates identical and
+ ***     0 indicates completely different.
+ ***/
+static double sparse_similarity(const pVector v1, const pVector v2)
+    {
+    /** Calculate dot product. **/
+    unsigned int vec1_remaining = 0u, vec2_remaining = 0u;
+    unsigned int dim = 0u, i1 = 0u, i2 = 0u, dot_product = 0u;
+    while (dim < CA_NUM_DIMS)
+        {
+        /*** Values are reset every iteration: a value token covers exactly
+         *** one dimension, so it is always consumed in the iteration that
+         *** parsed it, and a partially consumed zero run contributes 0.
+         ***/
+        unsigned int val1 = 0u, val2 = 0u;
+        if (vec1_remaining == 0u) parse_vector_token(v1[i1++], &vec1_remaining, &val1);
+        if (vec2_remaining == 0u) parse_vector_token(v2[i2++], &vec2_remaining, &val2);
+
+        /*** Accumulate the dot_product. If either vector is 0 here,
+         *** the total is 0 and this statement does nothing.
+         ***/
+        dot_product += val1 * val2;
+
+        /** Consume overlap from both runs. **/
+        unsigned int overlap = min(vec1_remaining, vec2_remaining);
+        vec1_remaining -= overlap;
+        vec2_remaining -= overlap;
+        dim += overlap;
+        }
+
+    /** Optional optimization to speed up nonsimilar vectors. **/
+    if (dot_product == 0u) return 0.0;
+
+    /** Return the similarity score. **/
+    return (double)dot_product / (magnitude_sparse(v1) * magnitude_sparse(v2));
+    }
+
+/*** Calculate the difference on sparsely allocated vectors. Comparing
+ *** any string to an empty string should always return 0.5 (untested).
+ ***
+ *** @param v1 Sparse vector #1.
+ *** @param v2 Sparse vector #2.
+ *** @returns Difference between 0 and 1 where
+ ***     1 indicates completely different and
+ ***     0 indicates identical.
+ ***/
+#define sparse_dif(v1, v2) (1.0 - sparse_similarity(v1, v2))
+
+/*** Calculate the similarity between a sparsely allocated vector
+ *** and a densely allocated centroid. Comparing any string to an
+ *** empty string should always return 0.5 (untested).
+ ***
+ *** A zero dot product (no shared dimensions, or an all-zero centroid)
+ *** returns 0.0 instead of dividing, which would otherwise produce NaN
+ *** for a zero-magnitude centroid. This matches sparse_similarity().
+ ***
+ *** @param v1 Sparse vector #1.
+ *** @param c2 Dense centroid #2.
+ *** @returns Similarity between 0 and 1 where
+ ***     1 indicates identical and
+ ***     0 indicates completely different.
+ ***/
+static double sparse_similarity_to_centroid(const pVector v1, const pCentroid c2)
+    {
+    /** Calculate dot product. **/
+    double dot_product = 0.0;
+    for (unsigned int i = 0u, dim = 0u; dim < CA_NUM_DIMS;)
+        {
+        const int val = v1[i++];
+
+        /** Negative val represents -val 0s in the array, so skip that many values. **/
+        if (val < 0) dim += (unsigned)(-val);
+
+        /** We have a value, so multiply it by the matching centroid component. **/
+        else dot_product += (double)val * c2[dim++];
+        }
+
+    /** Avoid 0/0 when the centroid (or the overlap) is all zeros. **/
+    if (dot_product == 0.0) return 0.0;
+
+    /** Return the similarity score. **/
+    return dot_product / (magnitude_sparse(v1) * magnitude_dense(c2));
+    }
+
+/*** Calculate the difference between a sparsely allocated vector
+ *** and a densely allocated centroid. Comparing any string to an
+ *** empty string should always return 0.5 (untested).
+ ***
+ *** @param v1 Sparse vector #1.
+ *** @param c2 Dense centroid #2.
+ *** @returns Difference between 0 and 1 where
+ ***     1 indicates completely different and
+ ***     0 indicates identical. 
+ ***/ +#define sparse_dif_to_centroid(v1, c2) (1.0 - sparse_similarity_to_centroid(v1, c2)) + +/*** Computes the Levenshtein distance between two strings, also counting a swap of two adjacent characters as a single edit. + *** + *** @param str1 The first string. + *** @param str2 The second string. + *** @param str1_length The length of the first string. + *** @param str2_length The length of the second string. + *** @returns The edit distance between the two strings, or a negative value on error. + *** + *** @attention - `Tip`: Pass 0 for the length of either string to infer it + *** using the null terminating character. Conversely, character arrays + *** with no null terminator are allowed if an explicit length is specified. + *** + *** @attention - `Complexity`: O(nm), where n and m are the lengths of str1 + *** and str2 (respectively). + ***/ +int ca_edit_dist(const char* str1, const char* str2, const size_t str1_length, const size_t str2_length) + { + int result = -1; + + /** Detect string lengths, if necessary. **/ + const size_t str1_len = (str1_length == 0u) ? strlen(str1) : str1_length; + const size_t str2_len = (str2_length == 0u) ? strlen(str2) : str2_length; + + /** Optimization: Handle identical string pointers. **/ + if (str1 == str2) + return (str1_len > str2_len) + ? (str1_len - str2_len) + : (str2_len - str1_len); + + /*** lev_matrix: + *** For all i and j, lev_matrix[i][j] will hold the edit distance between + *** the first i characters of str1 and the first j characters of str2. + *** + *** As they say, no dynamic programming algorithm is complete without a + *** matrix that you fill out and it has the answer in the final location. 
+ ***/ + unsigned int** lev_matrix = check_ptr(nmSysMalloc((str1_len + 1) * sizeof(unsigned int*))); + if (lev_matrix == NULL) goto end; + for (unsigned int i = 0u; i < str1_len + 1u; i++) + { + lev_matrix[i] = check_ptr(nmSysMalloc((str2_len + 1) * sizeof(unsigned int))); + if (lev_matrix[i] == NULL) goto end; + } + + /*** Base case #0: + *** Transforming an empty string into an empty string has 0 cost. + ***/ + lev_matrix[0][0] = 0u; + + /*** Base case #1: + *** Any source prefix can be transformed into an empty string by + *** dropping each character. + ***/ + for (unsigned int i = 1u; i <= str1_len; i++) + lev_matrix[i][0] = i; + + /*** Base case #2: + *** An empty string can be transformed into any target prefix by + *** inserting each character. + ***/ + for (unsigned int j = 1u; j <= str2_len; j++) + lev_matrix[0][j] = j; + + /** General Case. **/ + for (unsigned int i = 1u; i <= str1_len; i++) + { + for (unsigned int j = 1u; j <= str2_len; j++) + { + /** If the characters are equal, no change is needed. **/ + if (str1[i - 1] == str2[j - 1]) + lev_matrix[i][j] = lev_matrix[i - 1][j - 1]; + + /*** We need to make a change, so use the operation with the + *** lowest cost out of delete, insert, replace, or swap. + ***/ + else + { + unsigned int cost_delete = lev_matrix[i - 1][j] + 1u; + unsigned int cost_insert = lev_matrix[i][j - 1] + 1u; + unsigned int cost_replace = lev_matrix[i-1][j-1] + 1u; + + /** If a swap is possible, calculate the cost. **/ + bool can_swap = ( + i > 1 && j > 1 && + str1[i - 1] == str2[j - 2] && + str1[i - 2] == str2[j - 1] + ); + unsigned int cost_swap = (can_swap) ? lev_matrix[i - 2][j - 2] + 1 : UINT_MAX; + + /** Assign the best operation. **/ + lev_matrix[i][j] = min(min(min(cost_delete, cost_insert), cost_replace), cost_swap); + } + } + } + + /** Store result. 
**/ + unsigned int unsigned_result = lev_matrix[str1_len][str2_len]; + if (unsigned_result > INT_MAX) + { + fprintf(stderr, + "Warning: Integer overflow detected in ca_edit_dist(\"%s\", \"%s\", %lu, %lu) = %u > %d\n", + str1, str2, str1_length, str2_length, unsigned_result, INT_MAX + ); + } + result = (int)unsigned_result; + + /** Cleanup. **/ + end: + if (lev_matrix != NULL) + { + for (unsigned int i = 0u; i < str1_len + 1u; i++) + { + if (lev_matrix[i] == NULL) break; + else nmSysFree(lev_matrix[i]); + } + nmSysFree(lev_matrix); + } + + /** Done. **/ + return result; + } + +/*** Compares two strings using their cosine similarity, returning a value + *** between `0.0` (completely different) and `1.0` (identical). If exactly + *** one string is NULL or empty, this function returns `0.0`; if both are, `1.0`. + *** + *** @attention - This function takes `void*` instead of `pVector` so that it + *** can be used as the similarity function in the ca_search() function + *** family without needing a messy typecast to avoid the compiler warning. + *** + *** @param v1 A `pVector` to the first string to compare. + *** @param v2 A `pVector` to the second string to compare. + *** @returns The cosine similarity between the two strings. + ***/ +double ca_cos_compare(void* v1, void* v2) + { + if (v1 == v2) return 1.0; + + /** Input validation checks. **/ + const pVector vec1 = v1, vec2 = v2; + const bool v1_empty = (vec1 == NULL || ca_is_empty(vec1) || ca_has_no_pairs(vec1)); + const bool v2_empty = (vec2 == NULL || ca_is_empty(vec2) || ca_has_no_pairs(vec2)); + if (v1_empty && v2_empty) return 1.0; + if (v1_empty && !v2_empty) return 0.0; + if (!v1_empty && v2_empty) return 0.0; + + /** Apply rounding to avoid annoying floating point issues before returning. **/ + return round(sparse_similarity(vec1, vec2) * 1000000) / 1000000; + } + +/*** Compares two strings using their Levenshtein edit distance to compute a + *** similarity between `0.0` (completely different) and `1.0` (identical). 
+ *** If both strings are empty, this function returns `1.0` (identical). If + *** either OR BOTH strings are NULL, this function returns `0.0`. + *** + *** @attention - This function takes `void*` instead of `char*` so that it + *** can be used as the similarity function in the ca_search() function + *** family without needing a messy typecast to avoid the compiler warning. + *** + *** @param str1 A `char*` to the first string to compare. + *** @param str2 A `char*` to the second string to compare. + *** @returns The levenshtein similarity between the two strings, or NAN on failure. + ***/ +double ca_lev_compare(void* str1, void* str2) + { + /** Input validation checks. **/ + if (str1 == NULL || str2 == NULL) return 0.0; + if (str1 == str2) return 1.0; + + /** Handle string length. **/ + const size_t len1 = strlen(str1); + const size_t len2 = strlen(str2); + if (len1 == 0lu && len2 == 0lu) return 1.0; + if (len1 != 0lu && len2 == 0lu) return 0.0; + if (len1 == 0lu && len2 != 0lu) return 0.0; + + /** Compute levenshtein edit distance. **/ + const int edit_dist = ca_edit_dist((const char*)str1, (const char*)str2, len1, len2); + if (!check_neg(edit_dist)) return NAN; + + /** Normalize edit distance into a similarity measure. **/ + const double normalized_similarity = 1.0 - (double)edit_dist / (double)max(len1, len2); + + /** Apply rounding to avoid annoying floating point issues before returning. **/ + return round(normalized_similarity * 1000000) / 1000000; + } + +/*** Check if two sparse vectors are identical. + *** + *** @param v1 The first vector. + *** @param v2 The second vector. + *** @returns true if they are equal, + *** false if any element is different. + ***/ +bool ca_eql(pVector v1, pVector v2) + { + const unsigned int len = ca_sparse_len(v1); + for (unsigned int i = 0u; i < len; i++) + if (v1[i] != v2[i]) return false; + return true; + } + +/*** Calculate the average size of all clusters in a set of vectors. 
+ *** + *** @param vectors The vectors of the dataset (allocated sparsely). + *** @param num_vectors The number of vectors in the dataset. + *** @param labels The clusters to which vectors are assigned. + *** @param centroids The locations of the centroids (allocated densely). + *** @param num_clusters The number of centroids (k). + *** @returns The average cluster size. + ***/ +static double get_cluster_size( + pVector* vectors, + const unsigned int num_vectors, + unsigned int* labels, + pCentroid* centroids, + const unsigned int num_clusters) + { + double result = NAN; + + /** Allocate space to store clusters as averages are computed. **/ + /*** We use nmMalloc() here because this function is usually called + *** repeatedly with the same number of clusters in the k-means loop. + *** Also, it is likely that k-means may be invoked multiple times with + *** the same k value, leading to additional caching benefits. + ***/ + double* cluster_sums = check_ptr(nmMalloc(num_clusters * sizeof(double))); + unsigned int* cluster_counts = check_ptr(nmMalloc(num_clusters * sizeof(unsigned int))); + if (cluster_sums == NULL) goto end; + if (cluster_counts == NULL) goto end; + for (unsigned int i = 0u; i < num_clusters; i++) + { + cluster_sums[i] = 0.0; + cluster_counts[i] = 0u; + } + + /** Sum the difference from each vector to its cluster centroid. **/ + for (unsigned int i = 0u; i < num_vectors; i++) + { + const unsigned int label = labels[i]; + cluster_sums[label] += sparse_dif_to_centroid(vectors[i], centroids[label]); + cluster_counts[label]++; + } + + /** Add up the average cluster size. **/ + double cluster_total = 0.0; + unsigned int num_valid_clusters = 0u; + for (unsigned int label = 0u; label < num_clusters; label++) + { + const unsigned int cluster_count = cluster_counts[label]; + if (cluster_count == 0u) continue; + + cluster_total += cluster_sums[label] / cluster_count; + num_valid_clusters++; + } + + /** Calculate average sizes. 
**/ + result = cluster_total / num_valid_clusters; + + end: + /** Clean up. **/ + if (cluster_sums != NULL) nmFree(cluster_sums, num_clusters * sizeof(double)); + if (cluster_counts != NULL) nmFree(cluster_counts, num_clusters * sizeof(unsigned int)); + + return result; + } + +/*** Executes the k-means clustering algorithm. Selects num_clusters random + *** vectors as initial centroids. Then points are assigned to the nearest + *** centroid, after which centroids are moved to the center of their points. + *** + *** @param vectors The vectors to cluster. + *** @param num_vectors The number of vectors to cluster. + *** @param num_clusters The number of clusters to generate. + *** @param max_iter The max number of iterations. + *** @param min_improvement The minimum amount of improvement that must be met + *** each clustering iteration. If there is less improvement, the algorithm + *** will stop. Pass any value less than -1 to fully disable this feature. + *** @param labels Stores the final cluster identities of the vectors after + *** clustering is completed. Each value will be `0 <= n < num_clusters`. + *** @param vector_sims An array of num_vectors elements, allocated by the + *** caller, where index i stores the similarity of vector i to its assigned + *** cluster. Passing NULL skips evaluation of these values. + *** + *** @attention - Assumes: num_vectors is the length of vectors. + *** @attention - Assumes: num_vectors is the length of labels. + *** + *** @attention - Issue: At larger numbers of clustering iterations, some + *** clusters have a size of negative infinity. In this implementation, + *** the bug is mitigated by setting a small number of max iterations, + *** such as 16 instead of 100. + *** @attention - Issue: Clusters do not appear to improve much after the first + *** iteration, which puts the efficacy of the algorithm into question. This + *** may be due to the uneven density of a typical dataset. 
However, the + *** clusters still offer useful information. + *** + *** Complexity: + *** + *** - `O(kd + k + i*(k + n*(k+d) + kd))` + *** + *** - `O(kd + k + ik + ink + ind + ikd)` + *** + *** - `O(nk + nd)` + ***/ +int ca_kmeans( + pVector* vectors, + const unsigned int num_vectors, + const unsigned int num_clusters, + const unsigned int max_iter, + const double min_improvement, + unsigned int* labels, + double* vector_sims) + { + /** Setup stuff. **/ + bool successful = false; + unsigned int cluster_counts[num_clusters]; + memset(labels, 0u, num_vectors * sizeof(unsigned int)); + + /** Allocate space to store centroids and new_centroids. **/ + /** Dynamic allocation is required because these densely allocated arrays might be up to 500KB! **/ + const size_t centroids_size = num_clusters * sizeof(pCentroid); + pCentroid* centroids = check_ptr(nmMalloc(centroids_size)); + pCentroid* new_centroids = check_ptr(nmMalloc(centroids_size)); + if (centroids == NULL) goto end; + if (new_centroids == NULL) goto end; + memset(centroids, 0, centroids_size); + memset(new_centroids, 0, centroids_size); + for (unsigned int i = 0u; i < num_clusters; i++) + { + centroids[i] = check_ptr(nmMalloc(pCentroidSize)); + new_centroids[i] = check_ptr(nmMalloc(pCentroidSize)); + if (centroids[i] == NULL) goto end; + if (new_centroids[i] == NULL) goto end; + memset(centroids[i], 0, pCentroidSize); + memset(new_centroids[i], 0, pCentroidSize); + } + + /** Select random vectors to use as the initial centroids. **/ + srand(time(NULL)); + for (unsigned int i = 0u; i < num_clusters; i++) + { + /** Pick a random vector. **/ + const pVector vector = vectors[rand() % num_vectors]; + + /** Sparse copy the vector to expand it into a densely allocated centroid. 
**/ + pCentroid centroid = centroids[i]; + for (unsigned int i = 0u, dim = 0u; dim < CA_NUM_DIMS;) + { + const int token = vector[i++]; + if (token > 0) centroid[dim++] = (double)token; + else for (unsigned int j = 0u; j < (unsigned)-token; j++) centroid[dim++] = 0.0; + } + } + + /** Main kmeans loop. **/ + double old_average_cluster_size = 1.0; + for (unsigned int iter = 0u; iter < max_iter; iter++) + { + bool changed = false; + + /** Reset new centroids. **/ + for (unsigned int i = 0u; i < num_clusters; i++) + { + cluster_counts[i] = 0u; + for (unsigned int dim = 0; dim < CA_NUM_DIMS; dim++) + new_centroids[i][dim] = 0.0; + } + + /** Assign each point to the nearest centroid. **/ + for (unsigned int i = 0u; i < num_vectors; i++) + { + const pVector vector = vectors[i]; + double min_dist = DBL_MAX; + unsigned int best_centroid_label = 0u; + + // Find nearest centroid. + for (unsigned int j = 0u; j < num_clusters; j++) + { + const double dist = sparse_dif_to_centroid(vector, centroids[j]); + if (dist < min_dist) + { + min_dist = dist; + best_centroid_label = j; + } + } + + /** Update label to new centroid, if necessary. **/ + if (labels[i] != best_centroid_label) + { + labels[i] = best_centroid_label; + changed = true; + } + + /** Accumulate values for new centroid calculation. **/ + pCentroid best_centroid = new_centroids[best_centroid_label]; + for (unsigned int i = 0u, dim = 0u; dim < CA_NUM_DIMS;) + { + const int val = vector[i++]; + if (val < 0) dim += (unsigned)(-val); + else best_centroid[dim++] += (double)val; + } + cluster_counts[best_centroid_label]++; + } + + /** Stop if centroids didn't change. **/ + if (!changed) break; + + /** Update centroids. 
**/ + for (unsigned int i = 0u; i < num_clusters; i++) + { + if (cluster_counts[i] == 0u) continue; + pCentroid centroid = centroids[i]; + const pCentroid new_centroid = new_centroids[i]; + const unsigned int cluster_count = cluster_counts[i]; + for (unsigned int dim = 0u; dim < CA_NUM_DIMS; dim++) + centroid[dim] = new_centroid[dim] / cluster_count; + } + + /** Is there enough improvement? **/ + if (min_improvement < -1) continue; /** Skip check if it will never end the loop. **/ + const double average_cluster_size = check_double(get_cluster_size(vectors, num_vectors, labels, centroids, num_clusters)); + if (isnan(average_cluster_size)) goto end; + const double improvement = old_average_cluster_size - average_cluster_size; + if (improvement < min_improvement) break; + old_average_cluster_size = average_cluster_size; + } + + /** Compute vector similarities, if requested. **/ + if (vector_sims != NULL) + { + for (unsigned int i = 0u; i < num_vectors; i++) + vector_sims[i] = sparse_similarity_to_centroid(vectors[i], centroids[labels[i]]); + } + + /** Success. **/ + successful = true; + + /** Clean up. **/ + end: + if (centroids != NULL) + { + for (unsigned int i = 0u; i < num_clusters; i++) + { + if (centroids[i] != NULL) nmFree(centroids[i], pCentroidSize); + else break; + } + nmFree(centroids, num_clusters * sizeof(pCentroid)); + } + if (new_centroids != NULL) + { + for (unsigned int i = 0u; i < num_clusters; i++) + { + if (new_centroids[i] != NULL) nmFree(new_centroids[i], pCentroidSize); + else break; + } + nmFree(new_centroids, num_clusters * sizeof(pCentroid)); + } + return (successful) ? 0 : -1; + } + +/*** Finds the data that is the most similar to the target and returns + *** it if the similarity meets the threshold. + *** + *** @param target The target data to compare to the rest of the data. + *** @param data The rest of the data, compared against the target to + *** find the data that is the most similar. 
+ *** @param num_data The number of elements in data. Specify 0 to detect + *** length on a null terminated array of data. + *** @param similarity A function which takes two data items of the type + *** of the data param and returns their similarity. + *** @param threshold The minimum similarity threshold. If the most similar + *** data does not meet this threshold, the function returns NULL. + *** @returns A pointer to the most similar piece of data found in the data + *** array, or NULL if the most similar data did not meet the threshold. + ***/ +void* ca_most_similar( + void* target, + void** data, + const unsigned int num_data, + const double (*similarity)(void*, void*), + const double threshold) + { + /** Error cases. **/ + if (target == NULL + || data == NULL + || similarity == NULL + || num_data == 0u + || (threshold < 0.0 || 1.0 < threshold) + || isnan(threshold) + ) return NULL; + + /** Search for the most similar string. **/ + void* most_similar = NULL; + double best_sim = -INFINITY; + for (unsigned int i = 0u; (num_data == 0u) ? (data[i] != NULL) : (i < num_data); i++) + { + const double sim = similarity(target, data[i]); + if (isnan(sim)) continue; /* Skip this comparison. */ + if (sim > best_sim && sim >= threshold) + { + most_similar = data[i]; + best_sim = sim; + } + } + return most_similar; + } + + +/*** Runs a sliding search over the provided data, comparing each element to + *** the following `window_size` elements, invoking the passed comparison + *** function just under `window_size * num_data` times. If any comparison + *** yields a similarity greater than the threshold, it is stored in the + *** xArray returned by this function. + *** + *** @param data The data to be searched. + *** @param num_data The number of data items in data. + *** @param window_size The size of the sliding window used for the search. + *** @param similarity A function which takes two data items of the type of + *** the data param and returns their similarity. 
+ *** @param threshold The minimum threshold required for a duplicate to be + *** included in the returned xArray. + *** @param maybe_keys A pointer to an array of keys, with one key per data. + *** These will be used to fill in the key1 and key2 attributes for each + *** struct. If this variable is null, these values are also left null. + *** @param maybe_dups A pointer to an xArray to which found dups are added. + *** Pass NULL to allocate a new one. + *** @returns An xArray holding all of the duplicates found, or NULL if an + *** error occurs. + ***/ +pXArray ca_sliding_search( + void** data, + const unsigned int num_data, + const unsigned int window_size, + const double (*similarity)(void*, void*), + const double threshold, + void** maybe_keys, + pXArray maybe_dups) + { + /** Error cases. **/ + if (data == NULL + || num_data == 0 + || window_size == 0 + || similarity == NULL + || (threshold < 0.0 || 1.0 < threshold) + || isnan(threshold) + ) return NULL; + + /** Allocate space for dups (if necessary). **/ + pXArray dups = maybe_dups; + if (dups == NULL) + { + /** Guess that we will need space for num_data * 2 dups. **/ + const int guess_size = num_data * 2; + dups = check_ptr(xaNew(guess_size)); + if (dups == NULL) goto err; + } + const int num_starting_dups = dups->nItems; + + /** Search for dups. **/ + for (unsigned int i = 0u; i < num_data; i++) + { + const unsigned int window_start = i + 1u; + const unsigned int window_end = min(i + window_size, num_data); + for (unsigned int j = window_start; j < window_end; j++) + { + const double sim = check_double(similarity(data[i], data[j])); + if (isnan(sim) || sim < 0.0 || 1.0 < sim) + { + fprintf(stderr, "Invalid similarity %g %lf.\n", sim, sim); + goto err_free_dups; + } + if (sim > threshold) /* Dup found! 
*/ + { + Dup* dup = (Dup*)check_ptr(nmMalloc(sizeof(Dup))); + if (dup == NULL) goto err_free_dups; + if (maybe_keys != NULL) + { + dup->key1 = maybe_keys[i]; + dup->key2 = maybe_keys[j]; + } + dup->similarity = sim; + if (!check_neg(xaAddItem(dups, (void*)dup))) goto err_free_dups; + } + } + } + + /** Success. **/ + return dups; + + /** Error cleanup. **/ + err_free_dups: + /** Free the dups that we added to the XArray. **/ + while (dups->nItems > num_starting_dups) + nmFree(dups->Items[--dups->nItems], sizeof(Dup)); + if (maybe_dups == NULL) check(xaDeInit(dups)); /* Failure ignored. */ + + err: + return NULL; + } + +/*** Runs a complete search over the provided data, comparing each element to + *** each other element once, invoking the passed comparison function `num_data * (num_data - 1) / 2` + *** times. If any comparison yields a similarity greater than the threshold, + *** it is stored in the xArray returned by this function. + *** + *** @param data The data to be searched. + *** @param num_data The number of data items in data. + *** @param similarity A function which takes two data items of the type of + *** the data param and returns their similarity. + *** @param threshold The minimum threshold required for a duplicate to be + *** included in the returned xArray. + *** @param maybe_keys A pointer to an array of keys, with one key per data. + *** These will be used to fill in the key1 and key2 attributes for each + *** struct. If this variable is null, these values are also left null. + *** @param maybe_dups A pointer to an xArray to which found dups are added. + *** Pass NULL to allocate a new one. + *** @returns An xArray holding all of the duplicates found. If maybe_dups is + *** not NULL, this will be that xArray, to allow for chaining. 
+ ***/ +pXArray ca_complete_search( + void** data, + const unsigned int num_data, + const double (*similarity)(void*, void*), + const double threshold, + void** maybe_keys, + pXArray maybe_dups) + { + return ca_sliding_search(data, num_data, num_data, similarity, threshold, maybe_keys, maybe_dups); + } + +/** Scope cleanup. **/ +#undef sparse_dif +#undef sparse_dif_to_centroid diff --git a/centrallix-lib/src/mtlexer.c b/centrallix-lib/src/mtlexer.c index e92ea49f..39a69cc1 100644 --- a/centrallix-lib/src/mtlexer.c +++ b/centrallix-lib/src/mtlexer.c @@ -7,6 +7,7 @@ #include #include #include + #include "newmalloc.h" #include "mtask.h" #include "mtlexer.h" @@ -907,7 +908,9 @@ mlxNextToken(pLxSession this) } else { - mssError(1,"MLX","Unexpected character encountered"); + char buf[4]; + snprintf(buf, sizeof(buf), "%c", ch); // mssError() does not support %c. + mssError(1, "MLX", "Unexpected character encountered: '%s'", buf); this->TokType = MLX_TOK_ERROR; break; } @@ -1305,4 +1308,3 @@ mlxSetOffset(pLxSession this, unsigned long new_offset) return 0; } - diff --git a/centrallix-lib/src/util.c b/centrallix-lib/src/util.c index 629b59c7..e27820b4 100644 --- a/centrallix-lib/src/util.c +++ b/centrallix-lib/src/util.c @@ -1,24 +1,36 @@ /************************************************************************/ -/* Centrallix Application Server System */ -/* Centrallix Base Library */ -/* */ -/* Copyright (C) 1998-2011 LightSys Technology Services, Inc. */ -/* */ -/* You may use these files and this library under the terms of the */ -/* GNU Lesser General Public License, Version 2.1, contained in the */ -/* included file "COPYING". */ -/* */ -/* Module: (util.c,.h) */ -/* Author: Micah Shennum */ -/* Date: May 26, 2011 */ -/* Description: Collection of utilities */ +/* Centrallix Application Server System */ +/* Centrallix Base Library */ +/* */ +/* Copyright (C) 1998-2011 LightSys Technology Services, Inc. 
*/ +/* */ +/* You may use these files and this library under the terms of the */ +/* GNU Lesser General Public License, Version 2.1, contained in the */ +/* included file "COPYING". */ +/* */ +/* Module: util.c, util.h */ +/* Author: Micah Shennum and Israel Fuller */ +/* Date: May 26, 2011 */ +/* Description: Collection of utilities including: */ +/* - Utilities for parsing numbers. */ +/* - The timer utility for benchmarking code. */ +/* - snprint_bytes() for formatting a byte count. */ +/* - snprint_llu() for formatting large numbers. */ +/* - fprint_mem() for printing memory stats. */ +/* - min() and max() for handling numbers. */ +/* - The check functions for reliably printing debug data. */ /************************************************************************/ +#include +#include +#include +#include #include - #include -#include -#include +#include +#include + +#include "newmalloc.h" #include "util.h" /** @@ -77,3 +89,178 @@ unsigned int strtoui(const char *nptr, char **endptr, int base){ //return as tmp; return (unsigned int)tmp; } + +#define nUnits 6u +static char* units_cs[nUnits] = {"bytes", "KiB", "MiB", "GiB"}; +static char* units_metric[nUnits] = {"bytes", "KB", "MB", "GB"}; + +/*** Displays a size in bytes using the largest unit where the result would be + *** at least 1.0. Note that units larger than GB and GiB are not supported + *** because the largest possible unsigned int is 4,294,967,295, which is + *** exactly 4 GiB (or approximately 4.29 GB). + *** + *** @param buf The buffer to which new text will be written, using snprintf(). + *** @param buf_size The amount of space in the buffer, passed to snprintf(). + *** It is recommended to have at least 12 characters available. + *** @param bytes The number of bytes, which will be formatted and written + *** to the buffer.. + *** @returns buf, for chaining. + ***/ +char* snprint_bytes(char* buf, const size_t buf_size, unsigned int bytes) + { + char** units = (UTIL_USE_METRIC) ? 
units_metric : units_cs; + const double unit_size = (UTIL_USE_METRIC) ? 1000.0 : 1024.0; + + /** Search for the largest unit where the value would be at least 1. **/ + const double size = (double)bytes; + for (unsigned char i = nUnits; i >= 1u; i--) + { + const double denominator = pow(unit_size, i); + if (size >= denominator) + { + const double converted_size = size / denominator; + if (converted_size >= 100.0) + snprintf(buf, buf_size, "%.5g %s", converted_size, units[i]); + else if (converted_size >= 10.0) + snprintf(buf, buf_size, "%.4g %s", converted_size, units[i]); + else /* if (converted_size >= 1.0) - Always true. */ + snprintf(buf, buf_size, "%.3g %s", converted_size, units[i]); + return buf; + } + } + + /** None of the larger units work, so we just use bytes. **/ + snprintf(buf, buf_size, "%u %s", bytes, units[0]); + + return buf; + } +#undef nUints + +char* snprint_llu(char* buf, size_t buflen, unsigned long long value) + { + if (buflen == 0) return NULL; + if (value == 0) + { + if (buflen > 1) { buf[0] = '0'; buf[1] = '\0'; } + else buf[0] = '\0'; + return buf; + } + + char tmp[32]; + unsigned int ti = 0; + while (value > 0 && ti < sizeof(tmp) - 1) + { + if (ti % 4 == 3) tmp[ti++] = ','; + tmp[ti++] = '0' + (value % 10); + value /= 10; + } + tmp[ti] = '\0'; + + unsigned int outlen = min(ti, buflen - 1u); + for (unsigned int i = 0u; i < outlen; i++) buf[i] = tmp[ti - i - 1]; + buf[outlen] = '\0'; + return buf; + } + +void fprint_mem(FILE* out) + { + FILE* fp = fopen("/proc/self/statm", "r"); + if (fp == NULL) { perror("fopen()"); return; } + + long size, resident, share, text, lib, data, dt; + if (fscanf(fp, "%ld %ld %ld %ld %ld %ld %ld", + &size, &resident, &share, &text, &lib, &data, &dt) != 7) + { + fprintf(stderr, "Failed to read memory info\n"); + fclose(fp); + return; + } + fclose(fp); + + long page_size = sysconf(_SC_PAGESIZE); // in bytes + long resident_bytes = resident * page_size; + + const size_t buf_siz = 16u; + char buf[buf_siz]; + 
snprint_bytes(buf, buf_siz, (unsigned int)resident_bytes); + + fprintf(out, "Memory used: %ld bytes (%s)\n", resident_bytes, buf); + fprintf(out, "Share %ldb, Text %ldb, Lib %ldb, Data %ldb\n", share, text, lib, data); + } + +static double get_time(void) + { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (double)ts.tv_sec + (double)ts.tv_nsec / 1.0e9f; + } + +pTimer timer_init(pTimer timer) + { + if (timer == NULL) return NULL; + timer->start = NAN; + timer->total = 0.0; + return timer; + } + +pTimer timer_new(void) + { + return timer_init(nmMalloc(sizeof(Timer))); + } + +pTimer timer_start(pTimer timer) + { + if (!timer) return timer; + timer->start = get_time(); + return timer; + } + +pTimer timer_stop(pTimer timer) + { + if (!timer) return timer; + timer->total += get_time() - timer->start; + timer->start = NAN; /* Stop the timer. */ + return timer; + } + +double timer_get(pTimer timer) + { + if (timer == NULL) return NAN; + return (isnan(timer->start)) + ? timer->total /* Timer is stopped. */ + : timer->total + (get_time() - timer->start); /* Timer is running. */ + } + +pTimer timer_reset(pTimer timer) + { + return timer_init(timer); + } + +void timer_de_init(pTimer timer) {} + +void timer_free(pTimer timer) + { + timer_de_init(timer); + nmFree(timer, sizeof(Timer)); + } + +double round_to(double value, int decimals) + { + const double mul = pow(10, decimals); + return round(value * mul) / mul; + } + +/*** Function for failing on error, assuming the error came from a library or + *** system function call, so that the error buffer is set to a valid value. + ***/ +void print_err(int code, const char* function_name, const char* file_name, const int line_number) + { + /** Create a descriptive error message. **/ + char error_buf[BUFSIZ]; + snprintf(error_buf, sizeof(error_buf), "%s:%d: %s failed", file_name, line_number, function_name); + + /** Print it with as much info as we can reasonably find. 
**/ + if (errno != 0) perror(error_buf); + else if (code != 0) fprintf(stderr, "%s (error code %d).\n", error_buf, code); + else fprintf(stderr, "%s.\n", error_buf); + } diff --git a/centrallix-lib/src/xhash.c b/centrallix-lib/src/xhash.c index afeb432b..46ef3a6f 100644 --- a/centrallix-lib/src/xhash.c +++ b/centrallix-lib/src/xhash.c @@ -290,4 +290,74 @@ xhClear(pXHashTable this, int (*free_fn)(), void* free_arg) return 0; } +/*** Executes an operation on each entry of the hash table. + *** + *** @param this The affected hash table. + *** @param callback_fn A callback function to be called on each hash table + *** entry. It takes 2 parameters: the current hash table entry and a void* + *** argument specified using each_arg. If any invocation of the callback + *** function returns a value other than 0, xhForEach() will immediately + *** fail, returning that value as the error code. + *** @param each_arg An additional argument which will be passed to each + *** invocation of the callback function. + *** @returns 0 if the function executes successfully. + *** 1 if the callback function is NULL. + *** n (where n != 0) if the callback function returns n. + ***/ +int +xhForEach(pXHashTable this, int (*callback_fn)(pXHashEntry, void*), void* each_arg) + { + if (callback_fn == NULL) return 1; + + for (int row = 0; row < this->nRows; row++) + { + pXHashEntry entry = (pXHashEntry)(this->Rows.Items[row]); + while (entry != NULL) + { + pXHashEntry next = entry->Next; + const int ret = callback_fn(entry, each_arg); + if (ret != 0) return ret; + entry = next; + } + } + + return 0; + } + +static int +xhiFreeEntry(pXHashEntry entry, void* arg) + { + /*** The passed void* actually points to a void* array with 2 elements. + *** The first element is a function pointer to the free function, which + *** we invoke using the provided entry and the free_arg, specified as the + *** second element of the array. 
+ *** + *** Interestingly, you can write this code in one line like this: + *** ((void (*)(pXHashEntry, void*))((void**)arg)[0])(entry, ((void**)arg)[1]); + *** But I value code readability, so fortunately, I can't be THAT cleaver... + ***/ + void** args = (void**)arg; + void (*free_fn)(pXHashEntry, void*) = args[0]; + free_fn(entry, args[1]); + + /** Free the entry. **/ + nmFree(entry, sizeof(XHashEntry)); + + return 0; + } +int +xhClearKeySafe(pXHashTable this, void (*free_fn)(pXHashEntry, void*), void* free_arg) + { + /** Free each row. **/ + void* args[2] = {free_fn, free_arg}; + const int ret = xhForEach(this, xhiFreeEntry, args); + + /** Mark all rows as empty. **/ + for (int i = 0; i < this->nRows; i++) + this->Rows.Items[i] = NULL; + this->nItems = 0; + + /** We are successful only if the free function didn't fail. **/ + return ret; + } diff --git a/centrallix-lib/src/xstring.c b/centrallix-lib/src/xstring.c index 51df377c..2743ae66 100644 --- a/centrallix-lib/src/xstring.c +++ b/centrallix-lib/src/xstring.c @@ -169,6 +169,13 @@ xsConcatenate(pXString this, char* text, int len) ASSERTMAGIC(this, MGK_XSTRING); CXSEC_VERIFY(*this); + + /** Guard invalid length. **/ + if (len < -1) + { + CXSEC_EXIT(XS_FN_KEY); + return -1; + } /** Determine length. **/ if (len == -1) len = strlen(text); @@ -605,6 +612,14 @@ xsFind(pXString this,char* find,int findlen, int offset) CXSEC_ENTRY(XS_FN_KEY); ASSERTMAGIC(this, MGK_XSTRING); CXSEC_VERIFY(*this); + + /** Guard against undefined behavior. **/ + if (find == NULL) + { + CXSEC_EXIT(XS_FN_KEY); + return -1; + } + if(findlen==-1) findlen=strlen(find); for(;offsetLength;offset++) { @@ -636,6 +651,14 @@ xsFindRev(pXString this,char* find,int findlen, int offset) CXSEC_ENTRY(XS_FN_KEY); ASSERTMAGIC(this, MGK_XSTRING); CXSEC_VERIFY(*this); + + /** Guard against undefined behavior. 
**/ + if (find == NULL) + { + CXSEC_EXIT(XS_FN_KEY); + return -1; + } + if(findlen==-1) findlen=strlen(find); offset=this->Length-offset-1; for(;offset>=0;offset--) @@ -970,4 +993,3 @@ xsConcatQPrintf(pXString this, char* fmt, ...) CXSEC_EXIT(XS_FN_KEY); return rval; } - diff --git a/centrallix-lib/tests/t_driver.c b/centrallix-lib/tests/t_driver.c index 20c0091c..823b0298 100644 --- a/centrallix-lib/tests/t_driver.c +++ b/centrallix-lib/tests/t_driver.c @@ -59,16 +59,38 @@ start(void* v) signal(SIGSEGV, segv_handler); signal(SIGABRT, abort_handler); signal(SIGALRM, alarm_handler); - alarm(10); + + /*** Set a timer before Lockup is triggered, using a significantly + *** larger value if Valgrind appears to be enabled. + ***/ + #ifdef NO_BLK_CACHE + alarm(90); /* Valgrind detected. */ + #else + alarm(5); /* Normal timeout. */ + #endif + + times(&t); start = t.tms_utime + t.tms_stime + t.tms_cutime + t.tms_cstime; rval = test(&tname); times(&t); end = t.tms_utime + t.tms_stime + t.tms_cutime + t.tms_cstime; + if (rval < 0) printf("%-62.62s FAIL\n", tname); else - printf("%-62.62s PASS %lld\n", tname, rval*100/(long long)(end - start)); + { + long long duration = end - start; + if (duration == 0) + { + printf("%-62.62s PASS ???\n", tname); + printf("Warning: Test ran too fast! Ops/sec could not be measured. 
Please run tests in a loop or use loop_tests() from test_utils.h.\n"); + return; + } + long long ops_per_second = rval * 100 / duration; /* Multiply first: (100 / duration) truncates, and is 0 once duration exceeds 100. */ + if (ops_per_second > 0) printf("%-62.62s PASS %lld\n", tname, ops_per_second); + else printf("%-62.62s PASS %.4lf\n", tname, rval * (100.0 / duration)); + } return; } @@ -79,4 +101,3 @@ main(int argc, char* argv[]) mtInitialize(0, start); return 0; } - diff --git a/centrallix-lib/tests/test_clusters_00.c b/centrallix-lib/tests/test_clusters_00.c new file mode 100644 index 00000000..382d7998 --- /dev/null +++ b/centrallix-lib/tests/test_clusters_00.c @@ -0,0 +1,96 @@ +/************************************************************************/ +/* Centrallix Application Server System */ +/* Centrallix Base Library */ +/* */ +/* Copyright (C) 2005 LightSys Technology Services, Inc. */ +/* */ +/* You may use these files and this library under the terms of the */ +/* GNU Lesser General Public License, Version 2.1, contained in the */ +/* included file "COPYING". */ +/* */ +/* Module: test_clusters_00.c */ +/* Author: Israel Fuller */ +/* Creation: November 25th, 2025 */ +/* Description: Test the ca_edit_dist() function from clusters.h. */ +/************************************************************************/ + +#include +#include +#include +#include +#include + +/** Test dependencies. **/ +#include "test_utils.h" +#include "util.h" + +/** Tested module. **/ +#include "clusters.h" + +static bool do_tests(void) + { + bool success = true; + + /** Kitten tests. **/ + success &= EXPECT_EQL(ca_edit_dist("kitten", "kitten", 0, 0), 0, "%d"); /* 0 edits. */ + success &= EXPECT_EQL(ca_edit_dist("kitten", "skitten", 0, 0), 1, "%d"); /* 1 insert. */ + success &= EXPECT_EQL(ca_edit_dist("kitten", "itten", 0, 0), 1, "%d"); /* 1 delete. */ + success &= EXPECT_EQL(ca_edit_dist("kitten", "mitten", 0, 0), 1, "%d"); /* 1 replace. */ + success &= EXPECT_EQL(ca_edit_dist("kitten", "smitten", 0, 0), 2, "%d"); /* 1 insert and 1 replace. 
*/ + success &= EXPECT_EQL(ca_edit_dist("kitten", "iktten", 0, 0), 1, "%d"); /* 1 transpose. */ + success &= EXPECT_EQL(ca_edit_dist("kitten", "kittens", 0, 0), 1, "%d"); /* 1 insert (end). */ + success &= EXPECT_EQL(ca_edit_dist("kitten", "kitte", 0, 0), 1, "%d"); /* 1 delete (end). */ + success &= EXPECT_EQL(ca_edit_dist("kitten", "kittem", 0, 0), 1, "%d"); /* 1 replace (end). */ + success &= EXPECT_EQL(ca_edit_dist("kitten", "kittne", 0, 0), 1, "%d"); /* 1 transpose (end). */ + + /** Alternate words. **/ + success &= EXPECT_EQL(ca_edit_dist("lawn", "flown", 0, 0), 2, "%d"); /* 1 insert and one replace. */ + success &= EXPECT_EQL(ca_edit_dist("hello", "hello!", 0, 0), 1, "%d"); /* 1 insert (end). */ + success &= EXPECT_EQL(ca_edit_dist("zert", "zerf", 0, 0), 1, "%d"); /* 1 replace (end). */ + success &= EXPECT_EQL(ca_edit_dist("llearr", "lear", 0, 0), 2, "%d"); /* 2 deletes (start & end). */ + + /** Long strings for testing edge cases. **/ + char* str1 = "This is a very long string!! I do not expect this function to need to process a string longer than this, because this string is a full 254 characters. That is pretty long. The object system limits strings to this size so we cannot make a longer string..."; + const size_t str_size = (strlen(str1) + 1) * sizeof(char); + char* str2 = memcpy(malloc(str_size), str1, str_size); + char* str3 = "This is quite a lengthy string. I do not expect the function to compute any longer string since this one is a full 254 characters. That is plenty, even if someone adds many contact details to their record!! Thus, this test should cover most cases we see."; + + /** Test edge cases. **/ + success &= EXPECT_EQL(ca_edit_dist("", "", 0, 0), 0, "%d"); /* Identical, empty string: 0 edits. */ + success &= EXPECT_EQL(ca_edit_dist(str1, str1, 0, 0), 0, "%d"); /* Identical, very long strings. */ + success &= EXPECT_EQL(ca_edit_dist(str1, str2, 0, 0), 0, "%d"); /* Identical, very long strings (different pointers). 
*/ + success &= EXPECT_EQL(ca_edit_dist(str2, str3, 0, 0), 133, "%d"); /* 133 edits. */ + + /** Empty string comparisons. **/ + success &= EXPECT_EQL(ca_edit_dist(str1, "", 0, 0), (int)strlen(str1), "%d"); + success &= EXPECT_EQL(ca_edit_dist(str2, "", 0, 0), (int)strlen(str2), "%d"); + success &= EXPECT_EQL(ca_edit_dist(str3, "", 0, 0), (int)strlen(str3), "%d"); + + /** Specifying lengths with overflows. **/ + success &= EXPECT_EQL(ca_edit_dist("A string with edits", "A string without edits", 13, 13), 0, "%d"); + success &= EXPECT_EQL(ca_edit_dist("A string with edits", "A string without edits", 13, 0), 9, "%d"); + success &= EXPECT_EQL(ca_edit_dist("A string with edits", "A string without edits", 0, 13), 6, "%d"); + success &= EXPECT_EQL(ca_edit_dist("A string with edits", "A string without edits", 0, 0), 3, "%d"); + success &= EXPECT_EQL(ca_edit_dist("A string with edits", "A string without edits", 19, 0), 3, "%d"); + success &= EXPECT_EQL(ca_edit_dist("A string with edits", "A string without edits", 0, 19), 6, "%d"); + + /** Test for errors in identical pointer optimizations with specified lengths. 
**/ + char identical_string[] = "Identical String!"; + success &= EXPECT_EQL(ca_edit_dist(identical_string, identical_string, 0, 0), 0, "%d"); + success &= EXPECT_EQL(ca_edit_dist(identical_string, identical_string, 0, 17), 0, "%d"); + success &= EXPECT_EQL(ca_edit_dist(identical_string, identical_string, 17, 0), 0, "%d"); + success &= EXPECT_EQL(ca_edit_dist(identical_string, identical_string, 0, 16), 1, "%d"); + success &= EXPECT_EQL(ca_edit_dist(identical_string, identical_string, 0, 8), 9, "%d"); + success &= EXPECT_EQL(ca_edit_dist(identical_string, identical_string, 5, 6), 1, "%d"); + success &= EXPECT_EQL(ca_edit_dist(identical_string, identical_string, 16, 13), 3, "%d"); + success &= EXPECT_EQL(ca_edit_dist(identical_string, identical_string, 0, 1), 16, "%d"); + success &= EXPECT_EQL(ca_edit_dist(identical_string, identical_string, 1, 0), 16, "%d"); + + return success; + } + +long long test(char** tname) + { + *tname = "cluster-00 ca_edit_dist()"; + return loop_tests(do_tests); + } diff --git a/centrallix-lib/tests/test_clusters_01.c b/centrallix-lib/tests/test_clusters_01.c new file mode 100644 index 00000000..bea41a4e --- /dev/null +++ b/centrallix-lib/tests/test_clusters_01.c @@ -0,0 +1,51 @@ +/************************************************************************/ +/* Centrallix Application Server System */ +/* Centrallix Base Library */ +/* */ +/* Copyright (C) 2005 LightSys Technology Services, Inc. */ +/* */ +/* You may use these files and this library under the terms of the */ +/* GNU Lesser General Public License, Version 2.1, contained in the */ +/* included file "COPYING". */ +/* */ +/* Module: test_clusters_01.c */ +/* Author: Israel Fuller */ +/* Creation: November 25th, 2025 */ +/* Description: Test the ca_edit_dist() function from clusters.h. */ +/************************************************************************/ + +#include +#include +#include +#include +#include + +/** Test dependencies. 
**/ +#include "test_utils.h" +#include "util.h" + +/** Tested module. **/ +#include "clusters.h" + +static bool do_tests(void) + { + bool success = true; + + /** Many, many replace edits. **/ + const static unsigned short num_edits = 254;//15827; + char AAA[num_edits + 1], BBB[num_edits + 1]; + memset(AAA, 'A', num_edits); + memset(BBB, 'B', num_edits); + AAA[num_edits] = BBB[num_edits] = '\0'; + success &= EXPECT_EQL(ca_edit_dist(AAA, "", 0, 0), num_edits, "%d"); + success &= EXPECT_EQL(ca_edit_dist("", BBB, 0, 0), num_edits, "%d"); + success &= EXPECT_EQL(ca_edit_dist(AAA, BBB, 0, 0), num_edits, "%d"); + + return success; + } + +long long test(char** tname) + { + *tname = "cluster-01 ca_edit_dist(): Stress test"; + return loop_tests(do_tests); + } diff --git a/centrallix-lib/tests/test_clusters_02.c b/centrallix-lib/tests/test_clusters_02.c new file mode 100644 index 00000000..7912770f --- /dev/null +++ b/centrallix-lib/tests/test_clusters_02.c @@ -0,0 +1,96 @@ +/************************************************************************/ +/* Centrallix Application Server System */ +/* Centrallix Base Library */ +/* */ +/* Copyright (C) 2005 LightSys Technology Services, Inc. */ +/* */ +/* You may use these files and this library under the terms of the */ +/* GNU Lesser General Public License, Version 2.1, contained in the */ +/* included file "COPYING". */ +/* */ +/* Module: test_clusters_02.c */ +/* Author: Israel Fuller */ +/* Creation: November 25th, 2025 */ +/* Description: Test the ca_build_vector() function from clusters.h. */ +/************************************************************************/ + +#include +#include +#include +#include +#include + +/** Test dependencies. **/ +#include "test_utils.h" +#include "util.h" + +/** Tested module. **/ +#include "clusters.h" + +/** Useful testing macro. 
**/ +#define EXPECT_VEC_EQL(v1, v2) \ + ({ \ + pVector _v1 = (v1); \ + pVector _v2 = (v2); \ + int success = ca_eql(_v1, _v2); \ + if (!success) \ + { \ + printf(" > Expected %s to equal %s at %s:%d, but got:\n", #v1, #v2, __FILE__, __LINE__); \ + printf(" > 1. "); ca_print_vector(_v1); printf("\n"); \ + printf(" > 2. "); ca_print_vector(_v2); printf("\n"); \ + fflush(stdout); \ + } \ + success; \ + }) + +static bool do_tests(void) + { + bool success = true; + + /** Make an array to STORE() pointers to vectors so we can free them. **/ + const unsigned int max_index = 16u; + unsigned int index = 0u; + pVector free_list[max_index]; + #define STORE(v) (free_list[index++] = (v)) + + /** Edge case: Null string. **/ + success &= EXPECT_EQL(ca_build_vector(NULL), NULL, "%p"); + + /** Edge case: Empty string. **/ + success &= EXPECT_VEC_EQL(STORE(ca_build_vector("")), ((int[]){-172, 11, -78})); + + /** Single letter cases. **/ + success &= EXPECT_VEC_EQL(STORE(ca_build_vector("a")), ((int[]){-204, 12, -25, 12, -20})); + success &= EXPECT_VEC_EQL(STORE(ca_build_vector("b")), ((int[]){-151, 13, -11, 13, -87})); + success &= EXPECT_VEC_EQL(STORE(ca_build_vector("v")), ((int[]){-221, 7, -19, 7, -9})); + + /** Multi-letter cases. **/ + success &= EXPECT_VEC_EQL(STORE(ca_build_vector("def")), ((int[]){-79, 4, -51, 2, -4, 7, -64, 9, -49})); + + /** Clean up using the free list. **/ + if (index >= max_index) + { + printf(" > MEMORY ERROR!!\n"); + printf(" > Allocated %u vectors, overflowing the free list of size %u.\n", index + 1, max_index); + printf(" > Increase the size of the free list (aka. max_index) to %u or more.\n", index + 1); + return false; + } + while (index > 0u) + { + pVector cur_vector = free_list[--index]; + if (cur_vector == NULL) continue; + else ca_free_vector(cur_vector); + } + + return success; + } + +long long test(char** tname) + { + *tname = "cluster-02 ca_build_vector()"; + return loop_tests(do_tests); + } + +/** Clean up scope. 
**/ +#undef STORE +#undef EXPECT_VEC_EQL diff --git a/centrallix-lib/tests/test_clusters_03.c b/centrallix-lib/tests/test_clusters_03.c new file mode 100644 index 00000000..d77cc44c --- /dev/null +++ b/centrallix-lib/tests/test_clusters_03.c @@ -0,0 +1,88 @@ +/************************************************************************/ +/* Centrallix Application Server System */ +/* Centrallix Base Library */ +/* */ +/* Copyright (C) 2005 LightSys Technology Services, Inc. */ +/* */ +/* You may use these files and this library under the terms of the */ +/* GNU Lesser General Public License, Version 2.1, contained in the */ +/* included file "COPYING". */ +/* */ +/* Module: test_clusters_03.c */ +/* Author: Israel Fuller */ +/* Creation: November 26th, 2025 */ +/* Description: Test the ca_cos_compare() function from clusters.h. */ +/************************************************************************/ + +#include +#include +#include +#include +#include + +/** Test dependencies. **/ +#include "test_utils.h" +#include "util.h" + +/** Tested module. **/ +#include "clusters.h" + + +static bool do_tests(void) + { + bool success = true; + + /** Make an array to STORE() pointers to vectors so we can free them. **/ + const unsigned int max_index = 32u; + unsigned int index = 0u; + pVector free_list[max_index]; + #define STORE(v) (free_list[index++] = (v)) + + /** ca_cos_compare() shortcut macro. **/ + #define cos_cmp(str1, str2) ca_cos_compare(STORE(ca_build_vector(str1)), STORE(ca_build_vector(str2))) + + /** Basic tests of cosine similarity. **/ + success &= EXPECT_RANGE(cos_cmp("hello", "hello"), 0.999, 1.0, "%g"); + success &= EXPECT_RANGE(cos_cmp("hello", "zephora"), 0.0, 0.001, "%g"); + success &= EXPECT_RANGE(cos_cmp("hello", "hello world"), 0.6, 0.7, "%g"); + success &= EXPECT_RANGE(cos_cmp("hello there", "hellow there"), 0.9, 1.0, "%g"); + + /** Tests on fabricated contact information. 
*/ + /*** All email addresses and phone numbers are imaginary and were + *** fabricated for the purposes of this test. + ***/ + success &= EXPECT_RANGE(cos_cmp("Cynthia Adams; cynthiaadams@gmail.com; 720-769-1293", "Timothy Adams; thetbear@gmail.com; 720-891-1470"), 0.49, 0.54, "%g"); + success &= EXPECT_RANGE(cos_cmp("Timothy Adams; thetbear@gmail.com; 720-891-1470", "Lance Freson; lancetheturtle@gmail.com; 720-111-8189"), 0.45, 0.50, "%g"); + success &= EXPECT_RANGE(cos_cmp("Lance Freson; lancetheturtle@gmail.com; 720-111-8189", "Gregory Freson; greatgregory@gmail.com; 720-198-5791"), 0.425, 0.475, "%g"); + success &= EXPECT_RANGE(cos_cmp("Gregory Freson; greatgregory@gmail.com; 720-198-5791", "Gregory Freson; greatgregory@gmail.co; 720-198-5791"), 0.94, 0.99, "%g"); + success &= EXPECT_RANGE(cos_cmp("Nathan Mayor; nmmayor@yahoo.com; +1-800-192-9128", "Mindy Mayor; nmmayor@yahoo.com; 720-981-9149"), 0.575, 0.625, "%g"); + success &= EXPECT_RANGE(cos_cmp("This is an identical case", "This is an identical case"), 0.975, 1.00, "%g"); + success &= EXPECT_RANGE(cos_cmp("Samuel", "Alex"), 0.00, 0.025, "%g"); + + /** Clean up scope. **/ + #undef STORE + #undef cos_cmp + + /** Clean up using the free list. **/ + if (index >= max_index) + { + printf(" > MEMORY ERROR!!\n"); + printf(" > Allocated %u vectors, overflowing the free list of size %u.\n", index + 1, max_index); + printf(" > Increase the size of the free list (aka. 
max_index) to %u or more.\n", index + 1); + return false; + } + while (index > 0u) + { + pVector cur_vector = free_list[--index]; + if (cur_vector == NULL) continue; + else ca_free_vector(cur_vector); + } + + return success; + } + +long long test(char** tname) + { + *tname = "cluster-03 ca_cos_compare()"; + return loop_tests(do_tests); + } diff --git a/centrallix-lib/tests/test_clusters_04.c b/centrallix-lib/tests/test_clusters_04.c new file mode 100644 index 00000000..afc2d2d4 --- /dev/null +++ b/centrallix-lib/tests/test_clusters_04.c @@ -0,0 +1,70 @@ +/************************************************************************/ +/* Centrallix Application Server System */ +/* Centrallix Base Library */ +/* */ +/* Copyright (C) 2005 LightSys Technology Services, Inc. */ +/* */ +/* You may use these files and this library under the terms of the */ +/* GNU Lesser General Public License, Version 2.1, contained in the */ +/* included file "COPYING". */ +/* */ +/* Module: test_clusters_04.c */ +/* Author: Israel Fuller */ +/* Creation: November 26th, 2025 */ +/* Description: Test the ca_lev_compare() function from clusters.h. */ +/************************************************************************/ + +#include +#include +#include +#include +#include + +/** Test dependencies. **/ +#include "test_utils.h" +#include "util.h" + +/** Tested module. **/ +#include "clusters.h" + +static bool do_tests(void) + { + bool success = true; + + /** Basic tests of Levenshtein edit distance similarity. 
**/ + success &= EXPECT_RANGE(ca_lev_compare("hello", "hello"), 0.99, 1.0, "%g"); + success &= EXPECT_RANGE(ca_lev_compare("hello", "hello!"), 0.8, 1.0, "%g"); + success &= EXPECT_RANGE(ca_lev_compare("hello", "asdfkh"), 0.0, 0.1, "%g"); + success &= EXPECT_RANGE(ca_lev_compare("hello", "aaaaaaaaaaaaaaaaa"), 0.0, 0.1, "%g"); + success &= EXPECT_RANGE(ca_lev_compare("hello", "nope"), 0.0, 0.2, "%g"); + success &= EXPECT_RANGE(ca_lev_compare("hello", "noepo"), 0.15, 0.25, "%g"); + success &= EXPECT_RANGE(ca_lev_compare("below", "hello!"), 0.4, 0.6, "%g"); + success &= EXPECT_RANGE(ca_lev_compare("kitten", "smitten"), 0.65, 0.85, "%g"); + success &= EXPECT_RANGE(ca_lev_compare("hello", "bobbobbobbob"), 0.0, 0.1, "%g"); + success &= EXPECT_RANGE(ca_lev_compare("hello", ""), 0.0, 0.05, "%g"); + success &= EXPECT_RANGE(ca_lev_compare("", ""), 0.99, 1.0, "%g"); + success &= EXPECT_RANGE(ca_lev_compare("blooooop", "blob"), 0.3, 0.5, "%g"); + success &= EXPECT_RANGE(ca_lev_compare("", "!"), 0.0, 0.01, "%g"); + success &= EXPECT_RANGE(ca_lev_compare("h", "h"), 0.99, 1.0, "%g"); + success &= EXPECT_RANGE(ca_lev_compare("hi", "hi"), 0.99, 1.0, "%g"); + + /** Kitten tests with specific edit operations. 
**/ + success &= EXPECT_RANGE(ca_lev_compare("kitten", "kitten"), 0.99, 1.0, "%g"); + success &= EXPECT_RANGE(ca_lev_compare("kitten", "skitten"), 0.8, 0.9, "%g"); + success &= EXPECT_RANGE(ca_lev_compare("kitten", "itten"), 0.8, 0.9, "%g"); + success &= EXPECT_RANGE(ca_lev_compare("kitten", "mitten"), 0.8, 0.9, "%g"); + success &= EXPECT_RANGE(ca_lev_compare("kitten", "smitten"), 0.7, 0.8, "%g"); + success &= EXPECT_RANGE(ca_lev_compare("kitten", "iktten"), 0.8, 0.9, "%g"); + success &= EXPECT_RANGE(ca_lev_compare("kitten", "kittens"), 0.8, 0.9, "%g"); + success &= EXPECT_RANGE(ca_lev_compare("kitten", "kitte"), 0.8, 0.9, "%g"); + success &= EXPECT_RANGE(ca_lev_compare("kitten", "kittem"), 0.8, 0.9, "%g"); + success &= EXPECT_RANGE(ca_lev_compare("kitten", "kittne"), 0.8, 0.9, "%g"); + + return success; + } + +long long test(char** tname) + { + *tname = "cluster-04 ca_lev_compare()"; + return loop_tests(do_tests); + } diff --git a/centrallix-lib/tests/test_clusters_05.c b/centrallix-lib/tests/test_clusters_05.c new file mode 100644 index 00000000..70b617c1 --- /dev/null +++ b/centrallix-lib/tests/test_clusters_05.c @@ -0,0 +1,183 @@ +/************************************************************************/ +/* Centrallix Application Server System */ +/* Centrallix Base Library */ +/* */ +/* Copyright (C) 2005 LightSys Technology Services, Inc. */ +/* */ +/* You may use these files and this library under the terms of the */ +/* GNU Lesser General Public License, Version 2.1, contained in the */ +/* included file "COPYING". */ +/* */ +/* Module: test_clusters_05.c */ +/* Author: Israel Fuller */ +/* Creation: November 26th, 2025 */ +/* Description: Test the ca_most_similar() function from clusters.h. */ +/************************************************************************/ + +#include +#include +#include +#include +#include + +/** Test dependencies. **/ +#include "test_utils.h" +#include "util.h" +#include "xhash.h" + +/** Tested module. 
**/ +#include "clusters.h" + +static const unsigned int key_length = 64u; +static pXHashTable mock_sims = NULL; +static bool* success_ptr = NULL; +static double get_mock_sim(void* v1, void* v2) + { + char key[key_length]; + char* str1 = v1; + char* str2 = v2; + + /** Try key1, key2. **/ + snprintf(key, sizeof(key), "%s|%s", str1, str2); + double* sim = (double*)xhLookup(mock_sims, key); + if (sim != NULL) goto found; + + /** Try key2, key1. **/ + snprintf(key, sizeof(key), "%s|%s", str2, str1); + sim = (double*)xhLookup(mock_sims, key); + if (sim != NULL) goto found; + + /** Key not found. **/ + fprintf(stderr, " > get_mock_sim(\"%s\", \"%s\"): No sim provided!\n", str1, str2); + *success_ptr = false; + return NAN; + + found: + /** Key found. **/ + return *sim; + } + +static int do_nothing() { return 0; } + +static bool do_tests(void) + { + bool success = true; + + /** Check error cases. **/ + success &= EXPECT_STR_EQL(ca_most_similar(NULL, (void*[]){"str_abc", "str1"}, 2, ca_lev_compare, 0.0), NULL); + success &= EXPECT_STR_EQL(ca_most_similar("str", NULL, 2, ca_lev_compare, 0.0), NULL); + success &= EXPECT_STR_EQL(ca_most_similar("str", (void*[]){"str_abc", "str1"}, 0, ca_lev_compare, 0.0), NULL); + success &= EXPECT_STR_EQL(ca_most_similar("str", (void*[]){"str_abc", "str1"}, 2, NULL, 0.0), NULL); + success &= EXPECT_STR_EQL(ca_most_similar("str", (void*[]){"str_abc", "str1"}, 2, ca_lev_compare, 1.1), NULL); + success &= EXPECT_STR_EQL(ca_most_similar("str", (void*[]){"str_abc", "str1"}, 2, ca_lev_compare, -0.1), NULL); + success &= EXPECT_STR_EQL(ca_most_similar("str", (void*[]){"str_abc", "str1"}, 2, ca_lev_compare, INFINITY), NULL); + success &= EXPECT_STR_EQL(ca_most_similar("str", (void*[]){"str_abc", "str1"}, 2, ca_lev_compare, -INFINITY), NULL); + success &= EXPECT_STR_EQL(ca_most_similar("str", (void*[]){"str_abc", "str1"}, 2, ca_lev_compare, NAN), NULL); + + /** Simple test cases. 
**/ + success &= EXPECT_STR_EQL(ca_most_similar("str1", (void*[]){"str_abc", "str1"}, 2, ca_lev_compare, 0.0), "str1"); + success &= EXPECT_STR_EQL(ca_most_similar("str", (void*[]){"str_abc", "str1"}, 2, ca_lev_compare, 0.0), "str1"); + success &= EXPECT_STR_EQL(ca_most_similar("kitten", (void*[]){"str_abc", "str1"}, 2, ca_lev_compare, 0.0), "str1"); + success &= EXPECT_STR_EQL(ca_most_similar("str1", (void*[]){"str2", "str", "eight"}, 3, ca_lev_compare, 0.0), "str2"); + + /** Many, identically similar options. */ + success &= EXPECT_STR_EQL(ca_most_similar("kitten", + (void*[]){"skitten", "itten", "mitten", "iktten", "kittens", "kitte", "kittem", "kittne"}, 8, + ca_lev_compare, 0.0), "skitten"); + + /** Pointer-perfect handling. **/ + char* target = "string"; + success &= EXPECT_EQL((char*)ca_most_similar(target, (void*[]){"str", target}, 2, ca_lev_compare, 0.0), target, "%s"); + + /** List overflow. **/ + success &= EXPECT_STR_EQL(ca_most_similar("target", (void*[]){"str1", "targets", "target", "walmart"}, 2, ca_lev_compare, 0.0), "targets"); + + /** Threshold exceeded. **/ + success &= EXPECT_STR_EQL(ca_most_similar("blob", (void*[]){"blooooop", "targets", "string"}, 3, ca_lev_compare, 0.0), "blooooop"); + success &= EXPECT_STR_EQL(ca_most_similar("blob", (void*[]){"blooooop", "targets", "string"}, 3, ca_lev_compare, 0.5), NULL); + success &= EXPECT_STR_EQL(ca_most_similar("hello", (void*[]){"bane", "noepo", "stars"}, 3, ca_lev_compare, 0.0), "noepo"); + success &= EXPECT_STR_EQL(ca_most_similar("hello", (void*[]){"bane", "noepo", "stars"}, 3, ca_lev_compare, 0.25), NULL); + success &= EXPECT_STR_EQL(ca_most_similar("kitten", (void*[]){"skitten", "fit"}, 2, ca_lev_compare, 0.0), "skitten"); + success &= EXPECT_STR_EQL(ca_most_similar("kitten", (void*[]){"skitten", "fit"}, 2, ca_lev_compare, 0.9), NULL); + + /** Make an array to STORE() pointers to vectors so we can free them. 
**/ + const unsigned int max_index = 32u; + unsigned int index = 0u; + pVector free_list[max_index]; + #define STORE(v) (free_list[index++] = (v)) + #define str(s) STORE(ca_build_vector(s)) + + /** Alternative similarity function. **/ + pVector hello = str("hello"), fellow = str("fellow"), felon = str("felon"); + pVector held = str("held"), zephora = str("zephora"), hexza = str("hexza"); + pVector hello_there = str("hello there"), hello_world = str("hello world"); + pVector hellow_there = str("hellow there"); + success &= EXPECT_STR_EQL(ca_most_similar(hello, (void*[]){fellow, felon, hello, held}, 4, ca_cos_compare, 0.0), hello); + success &= EXPECT_STR_EQL(ca_most_similar(str("hello"), (void*[]){fellow, felon, hello, held}, 4, ca_cos_compare, 0.0), hello); + success &= EXPECT_STR_EQL(ca_most_similar(hello, (void*[]){zephora, hello_world, hexza}, 3, ca_cos_compare, 0.0), hello_world); + success &= EXPECT_STR_EQL(ca_most_similar(hello, (void*[]){zephora, hello_world}, 1, ca_cos_compare, 0.0), zephora); + success &= EXPECT_STR_EQL(ca_most_similar(hello, (void*[]){zephora}, 1, ca_cos_compare, 0.0), zephora); + success &= EXPECT_STR_EQL(ca_most_similar(hello_there, (void*[]){hello_world, zephora, hellow_there, hexza}, 4, ca_cos_compare, 0.0), hellow_there); + success &= EXPECT_STR_EQL(ca_most_similar(hello_there, (void*[]){hello_world, zephora, hellow_there}, 2, ca_cos_compare, 0.0), hello_world); + success &= EXPECT_STR_EQL(ca_most_similar(hello_there, (void*[]){hello_world, zephora, hellow_there}, 2, ca_cos_compare, 0.8), NULL); + + /** Special characters (ignored by the similarity function). **/ + pVector yip = str("Yippee!!!"); + pVector str1 = str("@*#((%^!&@*-+!"), str2 = str(">>->y i!&P^^_pe$/\n?e"), str3 = str("yip"); + success &= EXPECT_STR_EQL(ca_most_similar(yip, (void*[]){str1, str2, str3}, 3, ca_cos_compare, 0.0), str2); + success &= EXPECT_STR_EQL(ca_most_similar(yip, (void*[]){str1, str2, str3}, 3, ca_cos_compare, 1.0), str2); + + /** Clean up scope. 
**/ + #undef STORE + #undef str + + /** Clean up using the free list. **/ + if (index >= max_index) + { + printf(" > MEMORY ERROR!!\n"); + printf(" > Allocated %u vectors, overflowing the free list of size %u.\n", index + 1, max_index); + printf(" > Increase the size of the free list (aka. max_index) to %u or more.\n", index + 1); + return false; + } + while (index > 0u) + { + pVector cur_vector = free_list[--index]; + if (cur_vector == NULL) continue; + else ca_free_vector(cur_vector); + } + + /** Set up the mock similarity function. **/ + XHashTable sim_table; + if (!check(xhInit(&sim_table, 64, 0))) return false; + mock_sims = &sim_table; + success_ptr = &success; + + /** Completely different strings are similar. **/ + double str1_str = 0.2, str1_str2 = 0.1, str1_eight = 0.8; + if (!check(xhAdd(&sim_table, "str1|str", (void*)&str1_str))) return false; + if (!check(xhAdd(&sim_table, "str1|str2", (void*)&str1_str2))) return false; + if (!check(xhAdd(&sim_table, "str1|eight", (void*)&str1_eight))) return false; + success &= EXPECT_STR_EQL(ca_most_similar("str1", (void*[]){"str2", "str", "eight"}, 3, get_mock_sim, 0.0), "eight"); + success &= EXPECT_STR_EQL(ca_most_similar("str1", (void*[]){"str2", "str", "eight"}, 3, get_mock_sim, 0.9), NULL); + if (!check(xhClear(&sim_table, do_nothing, NULL))) return false; + + /** Nans are skipped. **/ + double val_nan = 0.8, val_vals = NAN, val_val = 0.2; + if (!check(xhAdd(&sim_table, "val|nan", (void*)&val_nan))) return false; + if (!check(xhAdd(&sim_table, "val|vals", (void*)&val_vals))) return false; + if (!check(xhAdd(&sim_table, "val|val", (void*)&val_val))) return false; + success &= EXPECT_STR_EQL(ca_most_similar("val", (void*[]){"val", "vals", "nan"}, 3, get_mock_sim, 0.0), "nan"); + success &= EXPECT_STR_EQL(ca_most_similar("val", (void*[]){"val", "vals", "nan"}, 3, get_mock_sim, 0.9), NULL); + if (!check(xhClear(&sim_table, do_nothing, NULL))) return false; + + /** Clean up. 
**/ + if (!check(xhDeInit(&sim_table))) return false; + + return success; + } + +long long test(char** tname) + { + *tname = "cluster-05 ca_most_similar()"; + return loop_tests(do_tests); + } diff --git a/centrallix-lib/tests/test_clusters_06.c b/centrallix-lib/tests/test_clusters_06.c new file mode 100644 index 00000000..dbcade75 --- /dev/null +++ b/centrallix-lib/tests/test_clusters_06.c @@ -0,0 +1,151 @@ +/************************************************************************/ +/* Centrallix Application Server System */ +/* Centrallix Base Library */ +/* */ +/* Copyright (C) 2005 LightSys Technology Services, Inc. */ +/* */ +/* You may use these files and this library under the terms of the */ +/* GNU Lesser General Public License, Version 2.1, contained in the */ +/* included file "COPYING". */ +/* */ +/* Module: test_clusters_06.c */ +/* Author: Israel Fuller */ +/* Creation: November 26th, 2025 */ +/* Description: Test the searching functions from clusters.h. */ +/************************************************************************/ + +#include +#include +#include +#include +#include + +/** Test dependencies. **/ +#include "test_utils.h" +#include "util.h" + +/** Tested module. **/ +#include "clusters.h" + + +static int cmp_dups(const void* v1, const void* v2) + { + const Dup* dup1 = v1; + const Dup* dup2 = v2; + const int r = strcmp(dup1->key1, dup2->key1); + return (r != 0) ? r : strcmp(dup1->key2, dup2->key2); + } + +#define EXPECT_DUP(dup, k1, k2, sim_min, sim_max) \ + ({ \ + bool success = true; \ + pDup d = (dup); \ + success &= EXPECT_STR_EQL(d->key1, k1); \ + success &= EXPECT_STR_EQL(d->key2, k2); \ + success &= EXPECT_RANGE(d->similarity, sim_min, sim_max, "%g"); \ + success; \ + }) + + +static bool do_tests(void) + { + bool success = true; + + /** Allocate some test data. 
**/ + void* data[] = { + "string", + "string2", + "str", + "hello world", + "data", + "string3", + }; + void* keys[] = { "1", "2", "3", "4", "5", "6" }; + + /** Check error cases. **/ + success &= EXPECT_EQL(ca_complete_search(NULL, 6, ca_lev_compare, 0.8, NULL, NULL), NULL, "%p"); + success &= EXPECT_EQL(ca_complete_search(data, 0, ca_lev_compare, 0.8, NULL, NULL), NULL, "%p"); + success &= EXPECT_EQL(ca_complete_search(data, 6, NULL, 0.8, NULL, NULL), NULL, "%p"); + success &= EXPECT_EQL(ca_complete_search(data, 6, ca_lev_compare, 1.1, NULL, NULL), NULL, "%p"); + success &= EXPECT_EQL(ca_complete_search(data, 6, ca_lev_compare, -0.1, NULL, NULL), NULL, "%p"); + success &= EXPECT_EQL(ca_complete_search(data, 6, ca_lev_compare, INFINITY, NULL, NULL), NULL, "%p"); + success &= EXPECT_EQL(ca_complete_search(data, 6, ca_lev_compare, -INFINITY, NULL, NULL), NULL, "%p"); + success &= EXPECT_EQL(ca_complete_search(data, 6, ca_lev_compare, NAN, NULL, NULL), NULL, "%p"); + + /** Test complete search. **/ + { + XArray xdups; + if (!check(xaInit(&xdups, 4))) return false; + success &= EXPECT_EQL(ca_complete_search(data, 6, ca_lev_compare, 0.8, keys, &xdups), &xdups, "%p"); + pDup* dups = (pDup*)xdups.Items; + for (unsigned int i = 0u; i < xdups.nItems; i++) + { + pDup cur = dups[i]; + if (cur->key1 > cur->key2) + { + char* temp = cur->key1; + cur->key1 = cur->key2; + cur->key2 = temp; + } + } + qsort(dups, xdups.nItems, sizeof(pDup), cmp_dups); + success &= EXPECT_EQL(xdups.nItems, 3, "%d"); + success &= EXPECT_DUP(dups[0], "1", "2", 0.8, 1.0); + success &= EXPECT_DUP(dups[1], "1", "6", 0.8, 1.0); + success &= EXPECT_DUP(dups[2], "2", "6", 0.8, 1.0); + } + + /** Test sliding search: Large window. 
**/ + { + XArray xdups; + if (!check(xaInit(&xdups, 4))) return false; + success &= EXPECT_EQL(ca_sliding_search(data, 6, 5, ca_lev_compare, 0.8, keys, &xdups), &xdups, "%p"); + pDup* dups = (pDup*)xdups.Items; + for (unsigned int i = 0u; i < xdups.nItems; i++) + { + pDup cur = dups[i]; + if (cur->key1 > cur->key2) + { + char* temp = cur->key1; + cur->key1 = cur->key2; + cur->key2 = temp; + } + } + qsort(dups, xdups.nItems, sizeof(pDup), cmp_dups); + success &= EXPECT_EQL(xdups.nItems, 2, "%d"); + success &= EXPECT_DUP(dups[0], "1", "2", 0.8, 1.0); + // success &= EXPECT_DUP(dups[1], "1", "6", 0.8, 1.0); /* Missed. */ + success &= EXPECT_DUP(dups[1], "2", "6", 0.8, 1.0); + } + + /** Test sliding search: Small window. **/ + { + XArray xdups; + if (!check(xaInit(&xdups, 4))) return false; + success &= EXPECT_EQL(ca_sliding_search(data, 6, 2, ca_lev_compare, 0.8, keys, &xdups), &xdups, "%p"); + pDup* dups = (pDup*)xdups.Items; + for (unsigned int i = 0u; i < xdups.nItems; i++) + { + pDup cur = dups[i]; + if (cur->key1 > cur->key2) + { + char* temp = cur->key1; + cur->key1 = cur->key2; + cur->key2 = temp; + } + } + qsort(dups, xdups.nItems, sizeof(pDup), cmp_dups); + success &= EXPECT_EQL(xdups.nItems, 1, "%d"); + success &= EXPECT_DUP(dups[0], "1", "2", 0.8, 1.0); + // success &= EXPECT_DUP(dups[1], "1", "6", 0.8, 1.0); /* Missed. */ + // success &= EXPECT_DUP(dups[2], "2", "6", 0.8, 1.0); /* Missed. */ + } + + return success; + } + +long long test(char** tname) + { + *tname = "cluster-06 Searching"; + return loop_tests(do_tests); + } diff --git a/centrallix-lib/tests/test_memstr_00.c b/centrallix-lib/tests/test_memstr_00.c index 38d13f93..13022b93 100755 --- a/centrallix-lib/tests/test_memstr_00.c +++ b/centrallix-lib/tests/test_memstr_00.c @@ -16,7 +16,7 @@ test(char** tname) char * ptr; *tname = "memstr-00 correct null ptr"; - iter = 4000; + iter = 64000; for(i=0;i +#include +#include +#include + +/** Test dependencies. 
**/ +#include "test_utils.h" +#include "util.h" + +/** Tested module. **/ +#include "newmalloc.h" + +static unsigned int seed_counter = 0; +static char* err_buf; +static unsigned int err_buf_i; +static unsigned int err_buf_size; + +static int mock_error_fn(char* error_msg) + { + const size_t len = strlen(error_msg) + 1lu; + + /** Ensure enough space to store the error. **/ + while (len > err_buf_size - err_buf_i) + { + err_buf_size *= 2; + err_buf = check_ptr(realloc(err_buf, err_buf_size)); + } + + err_buf_i += snprintf( + err_buf + err_buf_i, + err_buf_size - err_buf_i, + "> %s\n", error_msg + ); + + return 0; + } + +/** Initialize memory of a given size with random data. **/ +static void* random_init(void* ptr, size_t size) + { + if (ptr == NULL) return NULL; + unsigned char* p = (unsigned char*)ptr; + for (size_t i = 0; i < size; i++) { + p[i] = (unsigned char)(rand() % 256); + } + return ptr; + } + +static bool do_tests(void) + { + bool success = true; + + /** Set a consistent, distinct seed for each test iteration. **/ + srand(seed_counter++); + + /** Initialize the mock error function. **/ + err_buf = check_ptr(malloc(err_buf_size = 256)); + err_buf_i = snprintf(err_buf, err_buf_size, "%s", ""); + nmSetErrFunction(mock_error_fn); + + /** Baseline: Should leak. **/ + success &= EXPECT_NOT_NULL(nmSysMalloc(42)); + + /** Basic string data. **/ + char* str1; + success &= EXPECT_NOT_NULL(str1 = nmSysMalloc(16)); + snprintf(str1, 16, "ThisIsSomeData!"); + char* str2; + success &= EXPECT_NOT_NULL(str2 = nmSysMalloc(32)); + snprintf(str2, 32, "ThisDataIsDifferentStringData.\n"); + success &= EXPECT_STR_EQL(str1, "ThisIsSomeData!"); + success &= EXPECT_STR_EQL(str2, "ThisDataIsDifferentStringData.\n"); + + /** 128 MB random data, varying sizes. 
**/ + #define TEST_LIMIT 16384 + void** data = check_ptr(malloc(TEST_LIMIT * sizeof(void*))); + void** test = check_ptr(malloc(TEST_LIMIT * sizeof(void*))); + for (size_t i = 1lu; i < TEST_LIMIT; i++) + { + success &= EXPECT_NOT_NULL(test[i] = nmSysMalloc(i)); + data[i] = random_init(check_ptr(malloc(i)), i); + memcpy(test[i], data[i], i); + } + for (size_t i = TEST_LIMIT - 1lu; i > 0lu; i--) + success &= EXPECT_EQL(memcmp(data[i], test[i], i), 0, "%d"); + + /** Basic string data is unharmed. **/ + success &= EXPECT_STR_EQL(str1, "ThisIsSomeData!"); + success &= EXPECT_STR_EQL(str2, "ThisDataIsDifferentStringData.\n"); + + /** Reallocate all variably sized memory to a different size. **/ + for (size_t i = TEST_LIMIT - 1lu; i > 0lu; i--) + success &= EXPECT_NOT_NULL(test[i] = nmSysRealloc(test[i], i)); + for (size_t i = 1lu; i < TEST_LIMIT; i++) + success &= EXPECT_EQL(memcmp(data[i], test[i], min(i, TEST_LIMIT - i)), 0, "%d"); + + /** Basic string data is unharmed. **/ + success &= EXPECT_STR_EQL(str1, "ThisIsSomeData!"); + success &= EXPECT_STR_EQL(str2, "ThisDataIsDifferentStringData.\n"); + + /** Testing strdup. **/ + char* str_dup1; + char* str_dup2; + success &= EXPECT_NOT_NULL(str_dup1 = nmSysStrdup(str1)); + success &= EXPECT_NOT_NULL(str_dup2 = nmSysStrdup(str2)); + success &= EXPECT_STR_EQL(str_dup1, "ThisIsSomeData!"); + success &= EXPECT_STR_EQL(str_dup2, "ThisDataIsDifferentStringData.\n"); + str_dup1[12] = '\0'; + str_dup2[2] = 'a'; + str_dup2[3] = 't'; + success &= EXPECT_STR_EQL(str_dup1, "ThisIsSomeDa"); + success &= EXPECT_STR_EQL(str_dup2, "ThatDataIsDifferentStringData.\n"); + + /** Basic string data is unharmed. **/ + success &= EXPECT_STR_EQL(str1, "ThisIsSomeData!"); + success &= EXPECT_STR_EQL(str2, "ThisDataIsDifferentStringData.\n"); + + /** Free random data, varying sizes. **/ + for (size_t i = 1lu; i < TEST_LIMIT; i++) + { + free(data[i]); + nmSysFree(test[i]); + } + + /** Basic string data is unharmed. 
**/ + success &= EXPECT_STR_EQL(str1, "ThisIsSomeData!"); + success &= EXPECT_STR_EQL(str2, "ThisDataIsDifferentStringData.\n"); + + /** Free data. **/ + nmSysFree(str1); + nmSysFree(str2); + + /** Dup string data is unharmed. **/ + success &= EXPECT_STR_EQL(str_dup1, "ThisIsSomeDa"); + success &= EXPECT_STR_EQL(str_dup2, "ThatDataIsDifferentStringData.\n"); + + /** Large singular allocation. **/ + #define _256MB 256000000lu + void* large_buf; + success &= EXPECT_NOT_NULL(large_buf = nmSysMalloc(_256MB)); + for (size_t i = _256MB - 1lu; i > 0lu; i--) + *((unsigned char*)large_buf + i) = (unsigned char)(i % 255lu); + *(unsigned char*)large_buf = 0u; + for (size_t i = 0lu; i < _256MB; i++) + success &= EXPECT_EQL(*((unsigned char*)large_buf + i), (unsigned char)(i % 255lu), "%d"); + + /** Dup string data is unharmed. **/ + success &= EXPECT_STR_EQL(str_dup1, "ThisIsSomeDa"); + success &= EXPECT_STR_EQL(str_dup2, "ThatDataIsDifferentStringData.\n"); + + /** Free dups. **/ + nmSysFree(str_dup1); + nmSysFree(str_dup2); + + /** Free large allocation. **/ + nmSysFree(large_buf); + + /** Expect no captured errors. **/ + success &= EXPECT_STR_EQL(err_buf, ""); + + return success; + } + +long long test(char** tname) + { + *tname = "newmalloc-00 nmSysMalloc(), nmSysFree(), nmSysRealloc(), & nmSysStrdup()"; + return loop_tests(do_tests); + } + +/** Scope cleanup. **/ +#undef TEST_LIMIT +#undef _256MB diff --git a/centrallix-lib/tests/test_newmalloc_01.c b/centrallix-lib/tests/test_newmalloc_01.c new file mode 100644 index 00000000..eacdd6c5 --- /dev/null +++ b/centrallix-lib/tests/test_newmalloc_01.c @@ -0,0 +1,161 @@ +/************************************************************************/ +/* Centrallix Application Server System */ +/* Centrallix Base Library */ +/* */ +/* Copyright (C) 2005 LightSys Technology Services, Inc. 
*/ +/* */ +/* You may use these files and this library under the terms of the */ +/* GNU Lesser General Public License, Version 2.1, contained in the */ +/* included file "COPYING". */ +/* */ +/* Module: test_newmalloc_01.c */ +/* Author: Israel Fuller */ +/* Creation: December 15th, 2025 */ +/* Description: Test the nmMalloc(), nmFree(), and nmClear() functions */ +/* from the NewMalloc library. */ +/************************************************************************/ + +#include +#include +#include +#include + +/** Test dependencies. **/ +#include "test_utils.h" +#include "util.h" + +/** Tested module. **/ +#include "newmalloc.h" + +static unsigned int seed_counter = 0; +static char* err_buf; +static unsigned int err_buf_i; +static unsigned int err_buf_size; + +static int mock_error_fn(char* error_msg) + { + const size_t len = strlen(error_msg) + 1lu; + + /** Ensure enough space to store the error. **/ + while (len > err_buf_size - err_buf_i) + { + err_buf_size *= 2; + err_buf = check_ptr(realloc(err_buf, err_buf_size)); + } + + err_buf_i += snprintf( + err_buf + err_buf_i, + err_buf_size - err_buf_i, + "> %s\n", error_msg + ); + + return 0; + } + +/** Initialize memory of a given size with random data. **/ +static void* random_init(void* ptr, size_t size) + { + if (ptr == NULL) return NULL; + unsigned char* p = (unsigned char*)ptr; + for (size_t i = 0; i < size; i++) { + p[i] = (unsigned char)(rand() % 256); + } + return ptr; + } + +static bool do_tests(void) + { + bool success = true; + + /** Set a consistent, distinct seed for each test iteration. **/ + srand(seed_counter++); + + /** Initialize the mock error function. **/ + err_buf = check_ptr(malloc(err_buf_size = 256)); + err_buf_i = snprintf(err_buf, err_buf_size, "%s", ""); + nmSetErrFunction(mock_error_fn); + + /** Baseline: Should leak. **/ + success &= EXPECT_NOT_NULL(nmMalloc(42)); + + /** Basic string data. 
**/ + char* str1; + success &= EXPECT_NOT_NULL(str1 = nmMalloc(16)); + snprintf(str1, 16, "ThisIsSomeData!"); + char* str2; + success &= EXPECT_NOT_NULL(str2 = nmMalloc(32)); + snprintf(str2, 32, "ThisDataIsDifferentStringData.\n"); + success &= EXPECT_STR_EQL(str1, "ThisIsSomeData!"); + success &= EXPECT_STR_EQL(str2, "ThisDataIsDifferentStringData.\n"); + + /** 128 MB random data, varying sizes. **/ + #define TEST_LIMIT 16384 + void** data = check_ptr(malloc(TEST_LIMIT * sizeof(void*))); + void** test = check_ptr(malloc(TEST_LIMIT * sizeof(void*))); + for (size_t i = 1lu; i < TEST_LIMIT; i++) + { + success &= EXPECT_NOT_NULL(test[i] = nmMalloc(i)); + data[i] = random_init(check_ptr(malloc(i)), i); + memcpy(test[i], data[i], i); + } + for (size_t i = TEST_LIMIT - 1lu; i > 0lu; i--) + success &= EXPECT_EQL(memcmp(data[i], test[i], i), 0, "%d"); + + /** Basic string data is unharmed. **/ + success &= EXPECT_STR_EQL(str1, "ThisIsSomeData!"); + success &= EXPECT_STR_EQL(str2, "ThisDataIsDifferentStringData.\n"); + + /** Large singular allocation. **/ + #define _256MB 256000000lu + void* large_buf; + success &= EXPECT_NOT_NULL(large_buf = nmMalloc(_256MB)); + for (size_t i = _256MB - 1lu; i > 0lu; i--) + *((unsigned char*)large_buf + i) = (unsigned char)(i % 255lu); + *(unsigned char*)large_buf = 0u; + for (size_t i = 0lu; i < _256MB; i++) + success &= EXPECT_EQL(*((unsigned char*)large_buf + i), (unsigned char)(i % 255lu), "%d"); + + /** Dup string data is unharmed. **/ + success &= EXPECT_STR_EQL(str1, "ThisIsSomeData!"); + success &= EXPECT_STR_EQL(str2, "ThisDataIsDifferentStringData.\n"); + + /** Free random data, varying sizes. **/ + for (size_t i = 1lu; i < TEST_LIMIT; i++) + { + free(data[i]); + nmFree(test[i], i); + } + + /** Basic string data is unharmed. **/ + success &= EXPECT_STR_EQL(str1, "ThisIsSomeData!"); + success &= EXPECT_STR_EQL(str2, "ThisDataIsDifferentStringData.\n"); + + /** Free data. 
**/ + nmFree(str1, 16); + nmFree(str2, 32); + + /** Free large allocation. **/ + nmFree(large_buf, _256MB); + + /** Clear cache. **/ + nmClear(); + + /** Debug info. **/ + printf("\n"); + nmStats(); + + /** Expect no captured errors. **/ + success &= EXPECT_STR_EQL(err_buf, ""); + + return success; + } + +long long test(char** tname) + { + *tname = "newmalloc-01 nmMalloc(), nmFree(), & nmClear()"; + return loop_tests(do_tests); + } + +/** Scope cleanup. **/ +#undef TEST_LIMIT +#undef _256MB diff --git a/centrallix-lib/tests/test_util_01.c b/centrallix-lib/tests/test_util_01.c index a4c2efae..5eafd527 100644 --- a/centrallix-lib/tests/test_util_01.c +++ b/centrallix-lib/tests/test_util_01.c @@ -8,7 +8,7 @@ /* GNU Lesser General Public License, Version 2.1, contained in the */ /* included file "COPYING". */ /* */ -/* Module: test_util_00.c */ +/* Module: test_util_01.c */ /* Author: Micah Shennum */ /* Creation: May 26th, 2011 */ /* Description: Test strtoi */ diff --git a/centrallix-lib/tests/test_util_02.c b/centrallix-lib/tests/test_util_02.c new file mode 100644 index 00000000..1ba0e4a5 --- /dev/null +++ b/centrallix-lib/tests/test_util_02.c @@ -0,0 +1,72 @@ +/************************************************************************/ +/* Centrallix Application Server System */ +/* Centrallix Base Library */ +/* */ +/* Copyright (C) 2005 LightSys Technology Services, Inc. */ +/* */ +/* You may use these files and this library under the terms of the */ +/* GNU Lesser General Public License, Version 2.1, contained in the */ +/* included file "COPYING". */ +/* */ +/* Module: test_util_02.c */ +/* Author: Israel Fuller */ +/* Creation: November 24th, 2025 */ +/* Description: Test the util.h round_to() function. */ +/************************************************************************/ + +#include +#include +#include +#include +#include + +/** Test dependencies. **/ +#include "test_utils.h" + +/** Tested module. 
**/ +#include "util.h" + +static bool do_tests(void) + { + bool success = true; + + /** Rounding to 0 decimals does not change whole numbers. **/ + success &= EXPECT_EQL(round_to(0.0, 0), 0.0, "%g"); + success &= EXPECT_EQL(round_to(123, 0), 123.0, "%g"); + success &= EXPECT_EQL(round_to(-123, 0), -123.0, "%g"); + + /** Rounding to 0 decimals rounds decimals. **/ + success &= EXPECT_EQL(round_to(1.23, 0), 1.0, "%g"); + success &= EXPECT_EQL(round_to(24.43, 0), 24.0, "%g"); + success &= EXPECT_EQL(round_to(1234567890.499, 0), 1234567890.0, "%g"); + success &= EXPECT_EQL(round_to(-1.82, 0), -2.0, "%g"); + success &= EXPECT_EQL(round_to(-987654321.499, 0), -987654321.0, "%g"); + + /** Test rounding to various numbers of decimals. **/ + success &= EXPECT_EQL(round_to(1.23, 1), 1.2, "%g"); + success &= EXPECT_EQL(round_to(1.23, 2), 1.23, "%g"); + success &= EXPECT_EQL(round_to(1.23, 3), 1.23, "%g"); + success &= EXPECT_EQL(round_to(1.23, 8), 1.23, "%g"); + success &= EXPECT_EQL(round_to(5824113.8, 8), 5824113.8, "%g"); + + /** Test rounding to negative numbers of decimals. **/ + success &= EXPECT_EQL(round_to(1.23, -1), 0.0, "%g"); + success &= EXPECT_EQL(round_to(123.4, -1), 120.0, "%g"); + success &= EXPECT_EQL(round_to(586241.7, -4), 590000.0, "%g"); + + /** Rounding infinity. 
*/ + success &= EXPECT_EQL(round_to( INFINITY, 0), INFINITY, "%g"); + success &= EXPECT_EQL(round_to(-INFINITY, 0), -INFINITY, "%g"); + success &= EXPECT_EQL(round_to( INFINITY, 16), INFINITY, "%g"); + success &= EXPECT_EQL(round_to(-INFINITY, 16), -INFINITY, "%g"); + success &= EXPECT_EQL(round_to( INFINITY, -16), INFINITY, "%g"); + success &= EXPECT_EQL(round_to(-INFINITY, -16), -INFINITY, "%g"); + + return success; + } + +long long test(char** tname) + { + *tname = "util-02 round_to()"; + return loop_tests(do_tests); + } diff --git a/centrallix-lib/tests/test_util_03.c b/centrallix-lib/tests/test_util_03.c new file mode 100644 index 00000000..777af212 --- /dev/null +++ b/centrallix-lib/tests/test_util_03.c @@ -0,0 +1,104 @@ +/************************************************************************/ +/* Centrallix Application Server System */ +/* Centrallix Base Library */ +/* */ +/* Copyright (C) 2005 LightSys Technology Services, Inc. */ +/* */ +/* You may use these files and this library under the terms of the */ +/* GNU Lesser General Public License, Version 2.1, contained in the */ +/* included file "COPYING". */ +/* */ +/* Module: test_util_03.c */ +/* Author: Israel Fuller */ +/* Creation: November 24th, 2025 */ +/* Description: Test the util.h timer1 functionality. */ +/************************************************************************/ + +#include +#include +#include +#include +#include + +/** Test dependencies. **/ +#include "test_utils.h" + +/** Tested module. **/ +#include "util.h" + +static bool do_nothing(void) + { + return true; + } + +long long test(char** tname) + { + *tname = "util-03 Timer"; + + /** Allocate a stack and a heap timer. **/ + Timer t; + Timer *timer1 = timer_init(&t); + pTimer timer2 = timer_new(); + + /** 0.1 second wait. **/ + timer_start(timer1); + timer_start(timer2); + usleep(99900); /* 0.0999 seconds (leave room for overhead). 
*/ + double t1_inter = round_to(timer_get(timer1), 3); + double t2_inter = round_to(timer_get(timer2), 3); + usleep(99900); /* 0.0999 seconds (leave room for overhead). */ + timer_stop(timer1); + timer_stop(timer2); + + /** Extract values with rounding to give margin for error. **/ + double t1_val = round_to(timer_get(timer1), 3); + double t2_val = round_to(timer_get(timer2), 3); + + /** Check for incorrect values. **/ + if (!EXPECT_EQL(t1_inter, 0.1, "%g")) goto fail; + if (!EXPECT_EQL(t2_inter, 0.1, "%g")) goto fail; + if (!EXPECT_EQL(t1_inter, t2_inter, "%g")) goto fail; + if (!EXPECT_EQL(t1_val, 0.2, "%g")) goto fail; + if (!EXPECT_EQL(t2_val, 0.2, "%g")) goto fail; + if (!EXPECT_EQL(t1_val, t2_val, "%g")) goto fail; + + /** Test that timer can resume properly. **/ + timer_start(timer1); + timer_start(timer2); + usleep(99900); /* 0.0999 seconds (leave room for overhead). */ + double t1_inter2 = round_to(timer_get(timer1), 3); + double t2_inter2 = round_to(timer_get(timer2), 3); + usleep(99900); /* 0.0999 seconds (leave room for overhead). */ + timer_stop(timer1); + timer_stop(timer2); + + /** Extract values with rounding to give margin for error. **/ + double t1_val2 = round_to(timer_get(timer1), 3); + double t2_val2 = round_to(timer_get(timer2), 3); + + /** Check for incorrect values. **/ + if (!EXPECT_EQL(t1_inter2, 0.3, "%g")) goto fail; + if (!EXPECT_EQL(t2_inter2, 0.3, "%g")) goto fail; + if (!EXPECT_EQL(t1_inter2, t2_inter2, "%g")) goto fail; + if (!EXPECT_EQL(t1_val2, 0.4, "%g")) goto fail; + if (!EXPECT_EQL(t2_val2, 0.4, "%g")) goto fail; + if (!EXPECT_EQL(t1_val2, t2_val2, "%g")) goto fail; + + /** Clean up. **/ + timer_de_init(timer1); + timer_free(timer2); + + /*** This test takes a lot of real time (calling usleep()) without + *** using very many CPU cycles. This means we need to waste some + *** CPU cycles so that the test runner doesn't crash because the + *** CPU clock time was too low. 
+ ***/ + long long i = loop_tests(do_nothing); + + /** Return success. **/ + return i; + + /** Return failure. **/ + fail: + return -1; + } diff --git a/centrallix-lib/tests/test_util_04.c b/centrallix-lib/tests/test_util_04.c new file mode 100644 index 00000000..e1b4c60b --- /dev/null +++ b/centrallix-lib/tests/test_util_04.c @@ -0,0 +1,79 @@ +/************************************************************************/ +/* Centrallix Application Server System */ +/* Centrallix Base Library */ +/* */ +/* Copyright (C) 2005 LightSys Technology Services, Inc. */ +/* */ +/* You may use these files and this library under the terms of the */ +/* GNU Lesser General Public License, Version 2.1, contained in the */ +/* included file "COPYING". */ +/* */ +/* Module: test_util_04.c */ +/* Author: Israel Fuller */ +/* Creation: November 24th, 2025 */ +/* Description: Test the util.h max() and min() functions. */ +/************************************************************************/ + +#include +#include +#include +#include +#include +#include + +/** Test dependencies. **/ +#include "test_utils.h" + +/** Tested module. **/ +#include "util.h" + +static bool do_tests(void) + { + bool success = true; + + /** Max with doubles. **/ + success &= EXPECT_EQL(max(0.0, 0.0), 0.0, "%g"); + success &= EXPECT_EQL(max(1.0, 0.0), 1.0, "%g"); + success &= EXPECT_EQL(max(-1.0, 0.0), 0.0, "%g"); + success &= EXPECT_EQL(max(-1.1, 0.1), 0.1, "%g"); + success &= EXPECT_EQL(max(0.0001, 0.00011), 0.00011, "%g"); + success &= EXPECT_EQL(max(DBL_MAX, 0.0), DBL_MAX, "%g"); + success &= EXPECT_EQL(max(DBL_MIN, 0.0), DBL_MIN, "%g"); + success &= EXPECT_EQL(max(-DBL_MIN, 0.0), 0.0, "%g"); + success &= EXPECT_EQL(max(-DBL_MAX, 0.0), 0.0, "%g"); + success &= EXPECT_EQL(max(pow(10, DBL_DIG), pow(10, FLT_DIG)), pow(10, DBL_DIG), "%g"); + + /** Min with doubles. 
**/ + success &= EXPECT_EQL(min(0.0, 0.0), 0.0, "%g"); + success &= EXPECT_EQL(min(1.0, 0.0), 0.0, "%g"); + success &= EXPECT_EQL(min(-1.0, 0.0), -1.0, "%g"); + success &= EXPECT_EQL(min(-1.1, 0.1), -1.1, "%g"); + success &= EXPECT_EQL(min(0.0001, 0.00011), 0.0001, "%g"); + success &= EXPECT_EQL(min(DBL_MAX, 0.0), 0.0, "%g"); + success &= EXPECT_EQL(min(DBL_MIN, 0.0), 0.0, "%g"); + success &= EXPECT_EQL(min(-DBL_MIN, 0.0), -DBL_MIN, "%g"); + success &= EXPECT_EQL(min(-DBL_MAX, 0.0), -DBL_MAX, "%g"); + success &= EXPECT_EQL(min(pow(10, DBL_DIG), pow(10, FLT_DIG)), pow(10, FLT_DIG), "%g"); + + /** Max with ints. **/ + success &= EXPECT_EQL(max(0, 0), 0, "%d"); + success &= EXPECT_EQL(max(1, 0), 1, "%d"); + success &= EXPECT_EQL(max(-1, 0), 0, "%d"); + success &= EXPECT_EQL(max(INT_MAX, 0), INT_MAX, "%d"); + success &= EXPECT_EQL(max(INT_MIN, 0), 0, "%d"); + + /** Min with ints. **/ + success &= EXPECT_EQL(min(0, 0), 0, "%d"); + success &= EXPECT_EQL(min(1, 0), 0, "%d"); + success &= EXPECT_EQL(min(-1, 0), -1, "%d"); + success &= EXPECT_EQL(min(INT_MAX, 0), 0, "%d"); + success &= EXPECT_EQL(min(INT_MIN, 0), INT_MIN, "%d"); + + return success; + } + +long long test(char** tname) + { + *tname = "util-04 min() & max()"; + return loop_tests(do_tests); + } diff --git a/centrallix-lib/tests/test_util_05.c b/centrallix-lib/tests/test_util_05.c new file mode 100644 index 00000000..a88aa9b6 --- /dev/null +++ b/centrallix-lib/tests/test_util_05.c @@ -0,0 +1,128 @@ +/************************************************************************/ +/* Centrallix Application Server System */ +/* Centrallix Base Library */ +/* */ +/* Copyright (C) 2005 LightSys Technology Services, Inc. */ +/* */ +/* You may use these files and this library under the terms of the */ +/* GNU Lesser General Public License, Version 2.1, contained in the */ +/* included file "COPYING". 
*/ +/* */ +/* Module: test_util_05.c */ +/* Author: Israel Fuller */ +/* Creation: November 24th, 2025 */ +/* Description: Test the .h printing functionality. */ +/************************************************************************/ + +#include +#include +#include +#include +#include +#include + +/** Test dependencies. **/ +#include "test_utils.h" + +/** Tested module. **/ +#include "util.h" + +/** Assumes success is in scope. **/ +#define TEST_SNPRINT_LLU(buf, buf_size, value, expect) \ + EXPECT_STR_EQL(snprint_llu(buf, buf_size, value), expect) && \ + EXPECT_EQL(snprint_llu(buf, buf_size, value), &buf[0], "%p") + +#define TEST_SNPRINT_BYTES(buf, buf_size, value, expect) \ + EXPECT_STR_EQL(snprint_bytes(buf, buf_size, value), expect) && \ + EXPECT_EQL(snprint_bytes(buf, buf_size, value), &buf[0], "%p") + +static bool do_tests(void) + { + bool success = true; + + /** Detect if metric or CS units are intended. **/ + bool cs = true; + #ifdef UTIL_USE_METRIC + if (UTIL_USE_METRIC) cs = false; + #endif + + /** Allocate space for the string buffer. **/ + char buf[32]; + const size_t buf_size = sizeof(buf) / sizeof(char); + memset(buf, UINT_MAX, sizeof(buf)); /* Use unexpected data to catch uninitialized reads. */ + + /** Test snprint_bytes(). **/ + success &= TEST_SNPRINT_BYTES(buf, buf_size, 0, "0 bytes"); + success &= TEST_SNPRINT_BYTES(buf, buf_size, 10, "10 bytes"); + success &= TEST_SNPRINT_BYTES(buf, buf_size, 100, "100 bytes"); + success &= TEST_SNPRINT_BYTES(buf, buf_size, pow(1000, 1) - pow(1000, 0), (cs) ? "999 bytes" : "999 bytes"); + success &= TEST_SNPRINT_BYTES(buf, buf_size, pow(1000, 1), (cs) ? "1000 bytes" : "1 KB"); + success &= TEST_SNPRINT_BYTES(buf, buf_size, pow(1000, 2) - pow(1000, 1), (cs) ? "975.59 KiB" : "999 KB"); + success &= TEST_SNPRINT_BYTES(buf, buf_size, pow(1000, 2), (cs) ? "976.56 KiB" : "1 MB"); + success &= TEST_SNPRINT_BYTES(buf, buf_size, pow(1000, 3) - pow(1000, 2), (cs) ? 
"952.72 MiB" : "999 MB"); + success &= TEST_SNPRINT_BYTES(buf, buf_size, pow(1000, 3), (cs) ? "953.67 MiB" : "1 GB"); + success &= TEST_SNPRINT_BYTES(buf, buf_size, pow(1024, 1) - pow(1024, 0), (cs) ? "1023 bytes" : "1.02 KB"); + success &= TEST_SNPRINT_BYTES(buf, buf_size, pow(1024, 1), (cs) ? "1 KiB" : "1.02 KB"); + success &= TEST_SNPRINT_BYTES(buf, buf_size, pow(1024, 2) - pow(1024, 1), (cs) ? "1023 KiB" : "1.05 MB"); + success &= TEST_SNPRINT_BYTES(buf, buf_size, pow(1024, 2), (cs) ? "1 MiB" : "1.05 MB"); + success &= TEST_SNPRINT_BYTES(buf, buf_size, pow(1024, 3) - pow(1024, 2), (cs) ? "1023 MiB" : "1.07 GB"); + success &= TEST_SNPRINT_BYTES(buf, buf_size, pow(1024, 3), (cs) ? "1 GiB" : "1.07 GB"); + success &= TEST_SNPRINT_BYTES(buf, buf_size, INT_MAX, (cs) ? "2 GiB" : "2.15 GB"); + success &= TEST_SNPRINT_BYTES(buf, buf_size, UINT_MAX, (cs) ? "4 GiB" : "4.29 GB"); + + /** Test snprint_llu(). Note: 10^16 would fail due to the double precision limit. **/ + success &= TEST_SNPRINT_LLU(buf, buf_size, 0, "0"); + success &= TEST_SNPRINT_LLU(buf, buf_size, pow(10, 1) - 1, "9"); + success &= TEST_SNPRINT_LLU(buf, buf_size, pow(10, 1), "10"); + success &= TEST_SNPRINT_LLU(buf, buf_size, pow(10, 2) - 1, "99"); + success &= TEST_SNPRINT_LLU(buf, buf_size, pow(10, 2), "100"); + success &= TEST_SNPRINT_LLU(buf, buf_size, pow(10, 3) - 1, "999"); + success &= TEST_SNPRINT_LLU(buf, buf_size, pow(10, 3), "1,000"); + success &= TEST_SNPRINT_LLU(buf, buf_size, pow(10, 4) - 1, "9,999"); + success &= TEST_SNPRINT_LLU(buf, buf_size, pow(10, 4), "10,000"); + success &= TEST_SNPRINT_LLU(buf, buf_size, pow(10, 5) - 1, "99,999"); + success &= TEST_SNPRINT_LLU(buf, buf_size, pow(10, 5), "100,000"); + success &= TEST_SNPRINT_LLU(buf, buf_size, pow(10, 6) - 1, "999,999"); + success &= TEST_SNPRINT_LLU(buf, buf_size, pow(10, 6), "1,000,000"); + success &= TEST_SNPRINT_LLU(buf, buf_size, pow(10, 7) - 1, "9,999,999"); + success &= TEST_SNPRINT_LLU(buf, buf_size, pow(10, 7), "10,000,000"); 
+ success &= TEST_SNPRINT_LLU(buf, buf_size, pow(10, 8) - 1, "99,999,999"); + success &= TEST_SNPRINT_LLU(buf, buf_size, pow(10, 8), "100,000,000"); + success &= TEST_SNPRINT_LLU(buf, buf_size, pow(10, 9) - 1, "999,999,999"); + success &= TEST_SNPRINT_LLU(buf, buf_size, pow(10, 9), "1,000,000,000"); + success &= TEST_SNPRINT_LLU(buf, buf_size, pow(10, 10) - 1, "9,999,999,999"); + success &= TEST_SNPRINT_LLU(buf, buf_size, pow(10, 10), "10,000,000,000"); + success &= TEST_SNPRINT_LLU(buf, buf_size, pow(10, 11) - 1, "99,999,999,999"); + success &= TEST_SNPRINT_LLU(buf, buf_size, pow(10, 11), "100,000,000,000"); + success &= TEST_SNPRINT_LLU(buf, buf_size, pow(10, 12) - 1, "999,999,999,999"); + success &= TEST_SNPRINT_LLU(buf, buf_size, pow(10, 12), "1,000,000,000,000"); + success &= TEST_SNPRINT_LLU(buf, buf_size, pow(10, 13) - 1, "9,999,999,999,999"); + success &= TEST_SNPRINT_LLU(buf, buf_size, pow(10, 13), "10,000,000,000,000"); + success &= TEST_SNPRINT_LLU(buf, buf_size, pow(10, 14) - 1, "99,999,999,999,999"); + success &= TEST_SNPRINT_LLU(buf, buf_size, pow(10, 14), "100,000,000,000,000"); + success &= TEST_SNPRINT_LLU(buf, buf_size, pow(10, 15) - 1, "999,999,999,999,999"); + success &= TEST_SNPRINT_LLU(buf, buf_size, pow(10, 15), "1,000,000,000,000,000"); + success &= TEST_SNPRINT_LLU(buf, buf_size, pow(1024, 1) - 1, "1,023"); + success &= TEST_SNPRINT_LLU(buf, buf_size, pow(1024, 1), "1,024"); + success &= TEST_SNPRINT_LLU(buf, buf_size, pow(1024, 2) - 1, "1,048,575"); + success &= TEST_SNPRINT_LLU(buf, buf_size, pow(1024, 2), "1,048,576"); + success &= TEST_SNPRINT_LLU(buf, buf_size, pow(1024, 3) - 1, "1,073,741,823"); + success &= TEST_SNPRINT_LLU(buf, buf_size, pow(1024, 3), "1,073,741,824"); + success &= TEST_SNPRINT_LLU(buf, buf_size, SHRT_MAX, "32,767"); + success &= TEST_SNPRINT_LLU(buf, buf_size, USHRT_MAX, "65,535"); + success &= TEST_SNPRINT_LLU(buf, buf_size, INT_MAX, "2,147,483,647"); + success &= TEST_SNPRINT_LLU(buf, buf_size, UINT_MAX, 
"4,294,967,295"); + success &= TEST_SNPRINT_LLU(buf, buf_size, LLONG_MAX, "9,223,372,036,854,775,807"); + success &= TEST_SNPRINT_LLU(buf, buf_size, ULLONG_MAX, "18,446,744,073,709,551,615"); + + /** Test __FILENAME__. **/ + success &= EXPECT_STR_EQL(__FILENAME__, "test_util_05.c"); + + return success; + } + +long long test(char** tname) + { + *tname = "util-05 Printing"; + return loop_tests(do_tests); + } diff --git a/centrallix-lib/tests/test_utils.h b/centrallix-lib/tests/test_utils.h new file mode 100644 index 00000000..70fe6448 --- /dev/null +++ b/centrallix-lib/tests/test_utils.h @@ -0,0 +1,123 @@ +#ifndef TEST_UTILITY_H +#define TEST_UTILITY_H + +/************************************************************************/ +/* Centrallix Application Server System */ +/* Centrallix Base Library */ +/* */ +/* Copyright (C) 2005 LightSys Technology Services, Inc. */ +/* */ +/* You may use these files and this library under the terms of the */ +/* GNU Lesser General Public License, Version 2.1, contained in the */ +/* included file "COPYING". */ +/* */ +/* Module: test_uitls.h */ +/* Author: Israel Fuller */ +/* Creation: November 24th, 2025 */ +/* Description: Useful utils to improve the developer experience when */ +/* testing centrallix-lib. */ +/************************************************************************/ + +#include +#include + +#include "util.h" + +/*** Expect two values to be equal. + *** + *** @param v1 The first value. + *** @param v2 The second value. + *** @param sp The specifier to print the values if there is an error. This + *** MUST be a string known to the compiler at compile time, such as a + *** sting literal or a macro that expands to one. + *** @returns true if successful, false otherwise. 
+ ***/ +#define EXPECT_EQL(v1, v2, sp) \ + ({ \ + __typeof__ (v1) _v1 = (v1); \ + __typeof__ (v2) _v2 = (v2); \ + int success = (_v1 == _v2); \ + if (!success) fprintf(stderr, \ + " > Expected %s ("sp") to equal %s ("sp") at %s:%d\n", \ + #v1, _v1, #v2, _v2, __FILE__, __LINE__ \ + ); \ + success; \ + }) + +/*** Expect two strings to be equal. + *** + *** @param str1 The first string. + *** @param str2 The second string. + *** @returns true if successful, false otherwise. + ***/ +#define EXPECT_STR_EQL(str1, str2) \ + ({ \ + char* _str1 = (str1); \ + char* _str2 = (str2); \ + int success = (_str1 == _str2) || (_str1 != NULL && _str2 != NULL && strcmp(_str1, _str2) == 0); \ + if (!success) fprintf(stderr, \ + " > Expected %s (\"%s\") to equal %s (\"%s\") at %s:%d\n", \ + #str1, _str1, #str2, _str2, __FILE__, __LINE__ \ + ); \ + success; \ + }) + +/*** Expect a value to fall within a range. + *** + *** @param v The value. + *** @param min_v The minimum acceptable value. + *** @param max_v The maximum acceptable value. + *** @param sp The specifier to print the values if there is an error. This + *** MUST be a string known to the compiler at compile time, such as a + *** sting literal or a macro that expands to one. + *** @returns true if successful, false otherwise. + ***/ +#define EXPECT_RANGE(v, min_v, max_v, sp) \ + ({ \ + __typeof__ (v) _v = (v); \ + __typeof__ (min_v) _min = (min_v); \ + __typeof__ (max_v) _max = (max_v); \ + int success = (_min <= _v && _v <= _max); \ + if (!success) fprintf(stderr, \ + " > Expected %s ("sp") to be in the range %s ("sp") - %s ("sp") at %s:%d\n", \ + #v, _v, #min_v, _min, #max_v, _max, __FILE__, __LINE__ \ + ); \ + success; \ + }) + +/*** Syntactic sugar to expect a pointer to be non null in a clearer, more + *** concise way. + *** + *** @param ptr The pointer. + *** @returns true if successful, false otherwise. 
+ ***/ +#define EXPECT_NOT_NULL(ptr) \ + ({ \ + __typeof__ (ptr) _ptr = (ptr); \ + int success = (_ptr != NULL); \ + if (!success) fprintf(stderr, \ + " > Expected %s (%p) to be non null at %s:%d\n", \ + #ptr, _ptr, __FILE__, __LINE__ \ + ); \ + success; \ + }) + +/** Repeat the test as many times as possible within a set time window. **/ +#define loop_tests(test_fn) \ + ({ \ + long long result = 0ll; \ + Timer iter_timer_, *iter_timer = timer_start(timer_init(&iter_timer_)); \ + while (timer_get(iter_timer) < 0.02) \ + { \ + result++; \ + if (!test_fn()) \ + { \ + result = -1; \ + break; \ + } \ + } \ + timer_de_init(iter_timer); \ + result; \ + }) + +#endif diff --git a/centrallix-lib/tests/test_xarray_00.c b/centrallix-lib/tests/test_xarray_00.c new file mode 100644 index 00000000..76dd5bfb --- /dev/null +++ b/centrallix-lib/tests/test_xarray_00.c @@ -0,0 +1,137 @@ +/************************************************************************/ +/* Centrallix Application Server System */ +/* Centrallix Base Library */ +/* */ +/* Copyright (C) 2005 LightSys Technology Services, Inc. */ +/* */ +/* You may use these files and this library under the terms of the */ +/* GNU Lesser General Public License, Version 2.1, contained in the */ +/* included file "COPYING". */ +/* */ +/* Module: test_xarray_00.c */ +/* Author: Israel Fuller */ +/* Creation: November 25th, 2025 */ +/* Description: Test all the functions in the xarray library, except */ +/* xaInit() and xaDeInit() because testing xaNew() and */ +/* xaFree() should cover them, and xaAddItemSorted() and */ +/* xaAddItemSortedInt32(). */ +/************************************************************************/ + +#include +#include +#include +#include + +/** Test dependencies. **/ +#include "test_utils.h" +#include "util.h" + +/** Tested module. **/ +#include "xarray.h" + +/** Helper used by xaClear/xaClearR to free allocated items and count them. 
**/ +static int test_free(void* p, void* arg) + { + if (p == NULL) return 0; + + free(p); + if (arg != NULL) (*(int*)arg)++; + + return 0; + } + +static unsigned int seed_counter = 0; + +static bool do_tests(void) + { + bool success = true; + + /** Create random values using a consistent, distinct seed. **/ + srand(seed_counter++); + const int value1 = rand(); + const int value2 = rand(); + const int value3 = rand(); + const int value4 = rand(); + const int value5 = rand(); + const int value_not = rand(); /* Stored in an item that is never added. */ + + /** Create a new xarray. **/ + pXArray xa = xaNew(2); + success &= EXPECT_EQL(xa != NULL, true, "%d"); + if (xa == NULL) return false; + + /** A new xarray should have 0 items. **/ + success &= EXPECT_EQL(xaCount(xa), 0, "%d"); + + /** Test adding an item. **/ + int* v1 = malloc(sizeof(int)); *v1 = value1; + success &= EXPECT_EQL(xaAddItem(xa, v1), 0, "%d"); + success &= EXPECT_EQL(xaCount(xa), 1, "%d"); + success &= EXPECT_EQL(*(int*)xaGetItem(xa, 0), value1, "%d"); + + /** Test adding another item. **/ + int* v2 = malloc(sizeof(int)); *v2 = value2; + success &= EXPECT_EQL(xaAddItem(xa, v2), 1, "%d"); + success &= EXPECT_EQL(xaCount(xa), 2, "%d"); + success &= EXPECT_EQL(*(int*)xaGetItem(xa, 1), value2, "%d"); + + /** Test finding items. **/ + success &= EXPECT_EQL(xaFindItem(xa, v2), 1, "%d"); + success &= EXPECT_EQL(xaFindItem(xa, v1), 0, "%d"); + success &= EXPECT_EQL(xaFindItemR(xa, v2), 1, "%d"); + success &= EXPECT_EQL(xaFindItemR(xa, v1), 0, "%d"); + + /** Test finding items that don't exist (NULL, and a real item never added). **/ + int* v_not = malloc(sizeof(int)); *v_not = value_not; + success &= EXPECT_EQL(xaFindItem(xa, NULL), -1, "%d"); + success &= EXPECT_EQL(xaFindItemR(xa, NULL), -1, "%d"); + success &= EXPECT_EQL(xaFindItem(xa, v_not), -1, "%d"); + success &= EXPECT_EQL(xaFindItemR(xa, v_not), -1, "%d"); + free(v_not); v_not = NULL; + + /** Insert before index 1. 
**/ + int* v3 = malloc(sizeof(int)); *v3 = value3; + success &= EXPECT_EQL(xaInsertBefore(xa, 1, v3), 1, "%d"); + success &= EXPECT_EQL(*(int*)xaGetItem(xa, 1), value3, "%d"); + success &= EXPECT_EQL(xaCount(xa), 3, "%d"); + + /** Insert after index 2. **/ + int* v4 = malloc(sizeof(int)); *v4 = value4; + success &= EXPECT_EQL(xaInsertAfter(xa, 2, v4), 3, "%d"); + success &= EXPECT_EQL(*(int*)xaGetItem(xa, 3), value4, "%d"); + success &= EXPECT_EQL(xaCount(xa), 4, "%d"); + + /** xaSetItem() beyond current end should create NULL gaps. **/ + int* vset = malloc(sizeof(int)); *vset = value5; + success &= EXPECT_EQL(xaSetItem(xa, 5, vset), 5, "%d"); + success &= EXPECT_EQL(xaCount(xa), 6, "%d"); + success &= EXPECT_EQL(*(int*)xaGetItem(xa, 5), value5, "%d"); + success &= EXPECT_EQL(xaGetItem(xa, 4), NULL, "%p"); /* Check null gap. */ + + /** Remove an item and ensure that it is gone. **/ + success &= EXPECT_EQL(xaRemoveItem(xa, 2), 0, "%d"); /* Remove original index 2. */ + success &= EXPECT_EQL(xaCount(xa), 5, "%d"); + + /** Count non-NULL items prior to clearing. **/ + int count_before = xaCount(xa); + int nonnull = 0; + for (int i = 0; i < count_before; i++) + if (xaGetItem(xa, i) != NULL) nonnull++; + + /** Clear and verify free was called for each non-NULL item. **/ + int freed_count = 0; + success &= EXPECT_EQL(xaClear(xa, test_free, &freed_count), 0, "%d"); + success &= EXPECT_EQL(freed_count, nonnull, "%d"); + success &= EXPECT_EQL(xaCount(xa), 0, "%d"); + + /** Clean up. 
**/ + success &= EXPECT_EQL(xaFree(xa), 0, "%d"); + + return success; + } + +long long test(char** tname) + { + *tname = "xarray-00 Full Test"; + return loop_tests(do_tests); + } diff --git a/centrallix-lib/tests/test_xstring_00.c b/centrallix-lib/tests/test_xstring_00.c new file mode 100644 index 00000000..e4368c7f --- /dev/null +++ b/centrallix-lib/tests/test_xstring_00.c @@ -0,0 +1,162 @@ +/************************************************************************/ +/* Centrallix Application Server System */ +/* Centrallix Base Library */ +/* */ +/* Copyright (C) 2005 LightSys Technology Services, Inc. */ +/* */ +/* You may use these files and this library under the terms of the */ +/* GNU Lesser General Public License, Version 2.1, contained in the */ +/* included file "COPYING". */ +/* */ +/* Module: test_xstring_00.c */ +/* Author: Israel Fuller */ +/* Creation: November 25th, 2025 */ +/* Description: Test all the functions in the xstring library, except: */ +/* xsInit(), xsDeInit(), xsWrite(), xsGenPrintf_va(), */ +/* xsQPrintf(), xsQPrintf_va(), xsConcatQPrintf() */ +/************************************************************************/ + +#include +#include +#include +#include +#include + +/** Test dependencies. **/ +#include "test_utils.h" +#include "util.h" + +/** Tested module. **/ +#include "xstring.h" +/** Expect an xstring's length and content to match `expect`; the non-short-circuit & reports both failures. **/ +#define EXPECT_XSTR_EQL(xs, expect) \ + (EXPECT_EQL((size_t)xsLength(xs), strlen(expect), "%zu") & \ + EXPECT_STR_EQL(xsString(xs), expect)) + +static bool do_tests(void) + { + bool success = true; + + /** Create and init xstring. **/ + pXString xs = xsNew(); + success &= EXPECT_EQL(xs != NULL, true, "%d"); + if (xs == NULL) return false; + + /** Verify initial state. **/ + success &= EXPECT_EQL(xsLength(xs), 0, "%d"); + success &= EXPECT_STR_EQL(xsString(xs), ""); + + /** Set content with xsCopy(). **/ + success &= EXPECT_EQL(xsCopy(xs, "Hello", -1), 0, "%d"); + success &= EXPECT_XSTR_EQL(xs, "Hello"); + + /** Append content with xsConcatenate(). 
**/ + success &= EXPECT_EQL(xsConcatenate(xs, " World", -1), 0, "%d"); + success &= EXPECT_XSTR_EQL(xs, "Hello World"); + + /** Test xsStringEnd() with a reverse traversal. **/ + char* end_ptr = xsStringEnd(xs); + success &= EXPECT_EQL(end_ptr != NULL, true, "%d"); + success &= EXPECT_EQL(end_ptr[0], '\0', "%d"); + success &= EXPECT_EQL(end_ptr[-1], 'd', "%d"); + success &= EXPECT_EQL(end_ptr[-2], 'l', "%d"); + success &= EXPECT_EQL(end_ptr[-3], 'r', "%d"); + success &= EXPECT_EQL(end_ptr[-4], 'o', "%d"); + success &= EXPECT_EQL(end_ptr[-5], 'W', "%d"); + success &= EXPECT_EQL(end_ptr[-6], ' ', "%d"); + success &= EXPECT_EQL(end_ptr[-7], 'o', "%d"); + success &= EXPECT_EQL(end_ptr[-8], 'l', "%d"); + success &= EXPECT_EQL(end_ptr[-9], 'l', "%d"); + success &= EXPECT_EQL(end_ptr[-10], 'e', "%d"); + success &= EXPECT_EQL(end_ptr[-11], 'H', "%d"); + + /** Overwrite content with overflow. **/ + success &= EXPECT_EQL(xsCopy(xs, "Str&overflow", 3), 0, "%d"); + success &= EXPECT_XSTR_EQL(xs, "Str"); + + /** Append to string with overflow. **/ + success &= EXPECT_EQL(xsConcatenate(xs, "++", 2), 0, "%d"); + success &= EXPECT_XSTR_EQL(xs, "Str++"); + + /** Replace into formatted string. **/ + success &= EXPECT_EQL(xsPrintf(xs, "Number: %d", 42), 0, "%d"); + success &= EXPECT_XSTR_EQL(xs, "Number: 42"); + + /** Append formatted text to string. **/ + success &= EXPECT_EQL(xsConcatPrintf(xs, ", String: %s", "test"), 0, "%d"); + success &= EXPECT_XSTR_EQL(xs, "Number: 42, String: test"); + + /** Forward search with xsFind(). **/ + success &= EXPECT_EQL(xsCopy(xs, "Find me in here!", -1), 0, "%d"); + success &= EXPECT_EQL(xsFind(xs, "me", -1, 0), 5, "%d"); + success &= EXPECT_XSTR_EQL(xs, "Find me in here!"); /* String is not modified. */ + + /** Backward search with xsFindRev(). 
**/ + success &= EXPECT_EQL(xsCopy(xs, "find find find", -1), 0, "%d"); + success &= EXPECT_EQL(xsFind(xs, "find", -1, 0), 0, "%d"); + success &= EXPECT_EQL(xsFindRev(xs, "find", -1, 0), 10, "%d"); + success &= EXPECT_XSTR_EQL(xs, "find find find"); /* String is not modified. */ + + /** Finding items that don't exist. **/ + success &= EXPECT_EQL(xsFind(xs, NULL, -1, 0), -1, "%d"); + success &= EXPECT_EQL(xsFind(xs, "GeorgeNotFound", -1, 0), -1, "%d"); + success &= EXPECT_XSTR_EQL(xs, "find find find"); /* String is not modified. */ + + /** Find and replace. **/ + success &= EXPECT_EQL(xsCopy(xs, "cat cat cat", -1), 0, "%d"); + success &= EXPECT_EQL(xsReplace(xs, "cat", -1, 0, "dog", -1), 0, "%d"); + success &= EXPECT_XSTR_EQL(xs, "dog cat cat"); + + /** Find and replace fails to find. **/ + success &= EXPECT_EQL(xsCopy(xs, "GeorgeNotFound", -1), 0, "%d"); + success &= EXPECT_EQL(xsReplace(xs, "cat", -1, 0, "dog", -1), -1, "%d"); + success &= EXPECT_XSTR_EQL(xs, "GeorgeNotFound"); + + /** Find and replace overflows buffer. **/ + success &= EXPECT_EQL(xsCopy(xs, "ve^", -1), 0, "%d"); + success &= EXPECT_EQL(xsReplace(xs, "efghijklmnop", 1, 0, "e... e- e... E!!!", 16), 1, "%d"); + success &= EXPECT_XSTR_EQL(xs, "ve... e- e... E!!^"); + + /** Substitute at offset. **/ + success &= EXPECT_EQL(xsCopy(xs, "Hello World", -1), 0, "%d"); + success &= EXPECT_EQL(xsSubst(xs, 0, 5, "Goodbye", -1), 0, "%d"); + success &= EXPECT_XSTR_EQL(xs, "Goodbye World"); + success &= EXPECT_EQL(xsSubst(xs, 8, 3, "Otherside", 1), 0, "%d"); + success &= EXPECT_XSTR_EQL(xs, "Goodbye Old"); + + /** Trim right whitespace. **/ + success &= EXPECT_EQL(xsCopy(xs, " \tTest \n\t ", -1), 0, "%d"); + success &= EXPECT_EQL(xsRTrim(xs), 0, "%d"); + success &= EXPECT_XSTR_EQL(xs, " \tTest"); + + /** Trim left whitespace. 
**/ + success &= EXPECT_EQL(xsCopy(xs, " \t\n Test\n ", -1), 0, "%d"); + success &= EXPECT_EQL(xsLTrim(xs), 0, "%d"); + success &= EXPECT_XSTR_EQL(xs, "Test\n "); + + /** Trim whitespace on both sides. **/ + success &= EXPECT_EQL(xsCopy(xs, " \t Trimmed \n ", -1), 0, "%d"); + success &= EXPECT_EQL(xsTrim(xs), 0, "%d"); + success &= EXPECT_XSTR_EQL(xs, "Trimmed"); + + /** Test xsInsertAfter() at offset. **/ + success &= EXPECT_EQL(xsCopy(xs, "Hello", -1), 0, "%d"); + success &= EXPECT_EQL(xsInsertAfter(xs, "p the fast food restaurant", 7, 3), 10, "%d"); + success &= EXPECT_XSTR_EQL(xs, "Help the flo"); + + /** Test that xsCheckAlloc() does not destroy content. **/ + success &= EXPECT_EQL(xsCopy(xs, "test", -1), 0, "%d"); + success &= EXPECT_EQL(xsCheckAlloc(xs, 9284), 0, "%d"); + success &= EXPECT_XSTR_EQL(xs, "test"); + + /** Clean up. **/ + xsFree(xs); + + return success; + } + +long long test(char** tname) + { + *tname = "xstring-00 Full Test"; + return loop_tests(do_tests); + } diff --git a/centrallix-os/cluster-schema.cluster b/centrallix-os/cluster-schema.cluster new file mode 100644 index 00000000..5e11cd7c --- /dev/null +++ b/centrallix-os/cluster-schema.cluster @@ -0,0 +1,61 @@ +// Input schema +$Version=2$ +file_name "system/cluster" + { + name "cluster/parameter" + { + type : DATA_T // See datatypes.h + ?default : type // Default value for the variable. + ?name : String // Overrides the name above. + ?style : StyleObj // idk where to find docs for this. + } + // Access with :parameters:name. Accessing dynamic data (e.g. parameters) + // should be done within a runserver() call. + ... + + source : DataSourcePath + key_attr : string ⊂ DataSourcePath/columns + data_attr : string ⊂ DataSourcePath/columns + + cluster_name "cluster/cluster" + { + algorithm : "none" | "sliding-window" | "k-means" // Implemented + | "k-means++" | "k-medoids" | "db-scan" // Not implemented + similarity_measure : "cosine" | "levenshtein" // levenshtein not implemented. 
+ num_clusters : uint > 1 // (probably a parameter) + ?min_improvement : double && 0.0 < x < 1.0 | "none" // default: 0.0001 + ?max_iterations : uint // default: 64 + ?window_size : uint > 0 // required for algorithm = sliding_window. + ?overlap_size : double && 0.0 <= x <= 1.0 // default: 0.0, only allowed for algorithm = k-means | k-means++ | k-medoids, not implemented + + // Not implemented + sub_cluster_name "cluster/cluster" + { + // Same as above. + } + } + ... + + search_name "system/search" + { + source : string ⊂ [cluster_name, ...] + similarity_measure : "cosine" | "levenshtein" + threshold : double && 0.0 < x < 1.0 // optimization. + } + ... + } + +// Output schema + +- /cluster_name + ? /sub_cluster_name + ? ... + - /{query} + - /items : StringVec // The data points in the cluster. + ... +/search_name +- /{query} + - /key1 : string // The key of the first data point. + - /key2 : string // The key of the second data point. + - /sim : double && 0.0 < x <= threshold // The similarity of the two data points. +... diff --git a/centrallix-sysdoc/ClusterDriverRequirements-old.md b/centrallix-sysdoc/ClusterDriverRequirements-old.md new file mode 100644 index 00000000..601f4170 --- /dev/null +++ b/centrallix-sysdoc/ClusterDriverRequirements-old.md @@ -0,0 +1,186 @@ + +## Cluster Driver Specifications +### Cluster Open +```c +void* clusterOpen(pObject obj, int mask, pContentType sys_type, char* usr_type, pObjTrxTree* oxt); +``` +`clusterOpen()` shall... +- Create or read a node, as indicated by passed flags. + - Read flags from `obj->Mode`. + - If `O_EXCL` is specified, `O_CREAT` is specified, and there are no other elements in the path, create a new node. + - Otherwise attempt to read the previous object (in `obj->Prev`). + - If this fails and `O_CREAT` is specified, create a new node. + - If there is still no node, fail. +- Parse the provided path. + - Use `obj_internal_PathPart()` with the pathname in `obj->Pathname`. 
+ - Not parse previous parts of the path already parsed by other drivers. + - Start at the `obj->SubPtr`-th path element (skipping `obj->SubPtr - 1` elements). + - Consume elements in the path until `obj_internal_PathPart()` returns `NULL`. + - Store the number of elements consumed in `obj->SubCnt`. +- Determine what data is being targeted from the parsed path. + - If the relevant part of the path contains only the name of the file, the driver shall set the target to root. + - If it contains the name of a valid (sub)cluster or search, the driver shall set the target to that (sub)cluster or search. + - Otherwise, the driver shall produce a descriptive error. +- Parse the provided structure file. + - Follow the spec given in `cluster-schema.cluster`. + - Produce descriptive errors when issues are detected. +- Return a new struct containing necessary information, including: + - The name, source path, and attribute name. + - All parameters (and a param list for scope), clusters, and searches. + - Each parameter shall be represented by a `pParam` object (see `params.h`). + - Each cluster shall be represented by a struct with information including: + - Its name, clustering algorithm, and similarity measure. + - The number of clusters to generate. + - If a k-means algorithm is specified, the improvement threshold. + - The maximum number of iterations to run. + - A list of subclusters with at least this information for each. + - Each search shall be represented by a struct with information including: + - Its name, threshold, and similarity measure. + - Its source, which is a valid cluster name of a cluster in the clusters list. + - Information about targets, derived from the path. + +### Cluster Close +```c +int clusterClose(void* inf_v, pObjTrxTree* oxt); +``` +`clusterClose()` shall... +- Free all allocated data in the driver struct. +- Close any open files or the like in the driver struct. +- Return 0. 
+ +### Cluster Open Query +```c +void* clusterOpenQuery(void* inf_v, pObjQuery query, pObjTrxTree* oxt); +``` +`clusterOpenQuery()` shall... +- Return a query struct that can be passed to `clusterQueryFetch()`. + - This struct shall contain an index to the last row accessed (starting at 0). + - This struct shall contain a pointer to the driver data. + +### Cluster Query Fetch +```c +void* clusterQueryFetch(void* qy_v, pObject obj, int mode, pObjTrxTree* oxt) +``` +`clusterQueryFetch()` shall... +- If the driver struct targets the root node, this function shall produce an error. +- If the driver struct targets an entry, this function shall produce a different error. +- If the driver targets a cluster or search, this function shall return a driver struct targetting the cluster or search *entry* (respectively) indicated by the query struct's row pointer, and increment the pointer. + - Exception: If no data remains, this function shall return `NULL` instead. + - This request shall cause clustering / searching to execute, if it has not executed already. + +### Cluster Query Close +```c +int clusterQueryClose(void* qy_v, pObjTrxTree* oxt); +``` +`clusterQueryClose()` shall... +- Free all allocated data in the query struct. +- Close any open files or the like in the query struct. +- Return 0. + +### Cluster Get Attribute Type +```c +int clusterGetAttrType(void* qy_v, pObjTrxTree* oxt); +``` +`clusterGetAttrType()` shall... +- Return the `DATA_T_...` type of the requested attribute, or `DATA_T_UNAVAILABLE` if the attribute does not exist. +- The name, content_type, inner_type, and outer_type attributes shall be of type `DATA_T_STRING`. +- The last_modification attribute shall be of type `DATA_T_DATETIME`. +- If the target is root... + - The source and attr_name attributes shall be of type `DATA_T_STRING`. +- If the target is a cluster... + - The algorithm and similarity_measure attributes shall be of type `DATA_T_STRING`. 
+ - The num_clusters and max_iterations attributes shall be of type `DATA_T_INTEGER`. + - The improvement_threshold and average_similarity attributes shall be of type `DATA_T_DOUBLE`. +- If the target is a search... + - The source and similarity_measure attribute shall be of type `DATA_T_STRING`. + - The threshold attribute shall be of type `DATA_T_DOUBLE`. +- If the target is a cluster entry... + - The val attribute shall be of type `DATA_T_INTEGER`. + - The sim attribute shall be of type `DATA_T_DOUBLE`. +- If the target is a search entry... + - The val1 and val2 attribute shall be of type `DATA_T_INTEGER`. + - The sim attribute shall be of type `DATA_T_DOUBLE`. + +### Cluster Get Attribute Value +```c +int clusterGetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val, pObjTrxTree* _); +``` +`clusterGetAttrValue()` shall... +- If the given datatype does not match that returned from `clusterGetAttrType()`, the function shall produce an error. +- Requesting the name attribute shall produce the following values, depending on the target: + - If the target is root, the name in the driver struct (aka. the one specified in the .cluster file) shall be produced. + - If the target is a cluster or cluster entry, the name of the cluster shall be produced. + - If the target is a search or search entry, the name of the search shall be produced. +- Requesting the annotation shall produce some string describing the driver. +- Requesting the outer_type shall produce "system/row". +- Requesting the inner_type or content_type shall produce "system/void". (All path elements are consumed.) +- If the target is root... + - Requesting source shall produce the source path. + - Requesting attr_name shall produce the attribute name. +- If the target is a cluster... + - Requesting algorithm shall produce the name of the clustering algorithm. + - Requesting similarity_measure shall produce the name of the similarity measure. 
+ - Requesting num_clusters shall produce the number of clusters. + - Requesting max_iterations shall produce the maximum number of iterations. + - Requesting improvement_threshold shall produce the minimum improvement threshold. + - Requesting average_similarity shall produce the average size of clusters, running clustering / searching algorithms, if necessary. +- If the target is a search... + - Requesting source shall produce the name of the source cluster for the search. + - Requesting similarity_measure shall produce the name of the similarity measure. + - Requesting threshold shall produce the filtering threshold. +- If the target is a cluster entry... + - Requesting val shall produce the value of the data point in this cluster. + - Requesting sim shall produce the similarity of the data point to the center of the cluster. +- If the target is a search entry... + - Requesting val1 or val2 shall produce the first or second value (respectively) detected in this search. + - Requesting sim shall produce the similarity of these two data points. + + +### Cluster Get First Attribute +```c +char* clusterGetFirstAttr(void* inf_v, pObjTrxTree oxt); +``` +`clusterGetFirstAttr()` shall... +- Reset the current attribute index on the driver struct to 0. +- Return the value of invoking `clusterGetNextAttr()`. + +### Cluster Get Next Attribute +```c +char* clusterGetNextAttr(void* inf_v, pObjTrxTree oxt); +``` +`clusterGetNextAttr()` shall... +- Return the attribute name at the attribute index given by the driver struct in the list of attributes based on the target type. +- Return `NULL` if the end of the list has been reached. +- Increase the attribute index on the driver struct by 1. + +- The attribute name list when targeting root shall include "source" and "attr_name". +- The attribute name list when targeting a cluster shall include "algorithm", "similarity_measure", "num_clusters", "improvement_threshold", and "max_iterations". 
+- The attribute name list when targeting a search shall include "source", "threshold", and "similarity_measure". +- The attribute name list when targeting a cluster entry shall include "val" and "sim". +- The attribute name list when targeting a search entry shall include "val1", "val2", and "sim". + +### Cluster Info +```c +int clusterInfo(void* inf_v, pObjectInfo info); +``` +`clusterInfo()` shall... +- Provide the OBJ_INFO_F_CANT_ADD_ATTR flag. +- Provide the OBJ_INFO_F_CANT_HAVE_CONTENT flag. +- Provide the OBJ_INFO_F_NO_CONTENT flag. +- If the target is root... + - Provide the OBJ_INFO_F_CAN_HAVE_SUBOBJ flag. + - Provide the OBJ_INFO_F_SUBOBJ_CNT_KNOWN flag. + - Provide the OBJ_INFO_F_HAS_SUBOBJ flag if there is at least one cluster or search. + - Provide the OBJ_INFO_F_NO_SUBOBJ flag otherwise. + - Provide the total number of clusters and searches as the number of subobjects. +- If the target is a cluster... + - Provide the OBJ_INFO_F_CAN_HAVE_SUBOBJ flag. + - Provide the OBJ_INFO_F_HAS_SUBOBJ flag. + - If the algorithm has been run, provide the OBJ_INFO_F_SUBOBJ_CNT_KNOWN flag and the number of data points clustered as the number of subobjects. +- If the target is a search... + - Provide the OBJ_INFO_F_CAN_HAVE_SUBOBJ flag. + - If the algorithm has been run... + - Provide the OBJ_INFO_F_SUBOBJ_CNT_KNOWN flag and the number of elements found by the search as the number of subobjects. + - Provide the OBJ_INFO_F_HAS_SUBOBJ flag if at least one element was found. +- If the target is a cluster entry or a search entry... + - Provide the OBJ_INFO_F_CANT_HAVE_SUBOBJ flag. 
\ No newline at end of file diff --git a/centrallix-sysdoc/OSDriver_Authoring.md b/centrallix-sysdoc/OSDriver_Authoring.md index c167fce2..c4828886 100644 --- a/centrallix-sysdoc/OSDriver_Authoring.md +++ b/centrallix-sysdoc/OSDriver_Authoring.md @@ -1,52 +1,107 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + # ObjectSystem Driver Interface -Author: Greg Beeley -Date: January 13, 1999 +**Author**: Greg Beeley + +**Date**: January 13, 1999 -Updated: March 9, 2011 +**Updated**: November 17, 2025 -License: Copyright (C) 2001-2011 LightSys Technology Services. See LICENSE.txt for more information. +**License**: Copyright (C) 2001-2011 LightSys Technology Services. See LICENSE.txt for more information. ## Table of Contents - [ObjectSystem Driver Interface](#objectsystem-driver-interface) - [Table of Contents](#table-of-contents) - [I Introduction](#i-introduction) - [II Interface](#ii-interface) - - [A. Initialization](#a--initialization) - - [B. Opening And Closing Objects](#b--opening-and-closing-objects) - - [C. Creating and Deleting Objects.](#c--creating-and-deleting-objects) - - [D. Reading and Writing Object Content.](#d--reading-and-writing-object-content) - - [E. Querying for Child Objects.](#e--querying-for-child-objects) - - [F. Managing Object Attributes](#f--managing-object-attributes) - - [G. 
Managing Object Methods](#g--managing-object-methods) + - [Function: Open](#function-open) + - [Function: OpenChild()](#function-openchild) + - [Function: Close()](#function-close) + - [Function: Create()](#function-create) + - [Function: Delete()](#function-delete) + - [Function: DeleteObj()](#function-deleteobj) + - [Function: Read()](#function-read) + - [Function: Write()](#function-write) + - [Function: OpenQuery()](#function-openquery) + - [Function: QueryDelete()](#function-querydelete) + - [Function: QueryFetch()](#function-queryfetch) + - [Function: QueryCreate()](#function-querycreate) + - [Function: QueryClose()](#function-queryclose) + - [Function: GetAttrType()](#function-getattrtype) + - [Function: GetAttrValue()](#function-getattrvalue) + - [Function: GetFirstAttr()](#function-getfirstattr--getnextattr) + - [Function: GetNextAttr()](#function-getfirstattr--getnextattr) + - [Function: SetAttrValue()](#function-setattrvalue) + - [Function: AddAttr()](#function-addattr) + - [Function: OpenAttr()](#function-openattr) + - [Function: GetFirstMethod()](#function-getfirstmethod--getnextmethod) + - [Function: GetNextMethod()](#function-getfirstmethod--getnextmethod) + - [Function: ExecuteMethod()](#function-executemethod) + - [Function: PresentationHints()](#function-presentationhints) + - [Function: Info()](#function-info) + - [Function: Commit()](#function-commit) + - [Function: GetQueryCoverageMask()](#function-getquerycoveragemask) + - [Function: GetQueryIdentityPath()](#function-getqueryidentitypath) - [III Reading the Node Object](#iii-reading-the-node-object) - - [pSnNode snReadNode(pObject obj)](#psnnode-snreadnodepobject-obj) - - [pSnNode snNewNode(pObject obj, char* content_type)](#psnnode-snnewnodepobject-obj-char-content_type) - - [int snWriteNode(pSnNode node)](#int-snwritenodepsnnode-node) - - [int snDeleteNode(pSnNode node)](#int-sndeletenodepsnnode-node) - - [int snGetSerial(pSnNode node)](#int-sngetserialpsnnode-node) - - [pStructInf 
stParseMsg(pFile inp_fd, int flags)](#pstructinf-stparsemsgpfile-inp_fd-int-flags) - - [pStructInf stParseMsgGeneric(void* src, int (*read_fn)(), int flags)](#pstructinf-stparsemsggenericvoid-src-int-read_fn-int-flags) - - [int stGenerateMsg(pFile out_fd, pStructInf info, int flags)](#int-stgeneratemsgpfile-out_fd-pstructinf-info-int-flags) - - [int stGenerateMsgGeneric(void* dst, int (*write_fn)(), pStructInf info, int flags)](#int-stgeneratemsggenericvoid-dst-int-write_fn-pstructinf-info-int-flags) - - [pStructInf stCreateStruct(char* name, char* type)](#pstructinf-stcreatestructchar-name-char-type) - - [pStructInf stAddAttr(pStructInf inf, char* name)](#pstructinf-staddattrpstructinf-inf-char-name) - - [pStructInf stAddGroup(pStructInf inf, char* name, char* type)](#pstructinf-staddgrouppstructinf-inf-char-name-char-type) - - [int stAddValue(pStructInf inf, char* strval, int intval)](#int-staddvaluepstructinf-inf-char-strval-int-intval) - - [pStructInf stLookup(pStructInf inf, char* name)](#pstructinf-stlookuppstructinf-inf-char-name) - - [int stAttrValue(pStructInf inf, int* intval, char** strval, int nval)](#int-stattrvaluepstructinf-inf-int-intval-char-strval-int-nval) - - [int stFreeInf(pStructInf this)](#int-stfreeinfpstructinf-this) + - [Module: st_node](#module-st_node) + - [st_node: snReadNode()](#st_node-snreadnode) + - [st_node: snNewNode()](#st_node-snnewnode) + - [st_node: snWriteNode()](#st_node-snwritenode) + - [st_node: snDelete()](#st_node-sndeletenode) + - [st_node: snGetSerial()](#st_node-sngetserial) + - [st_node: snGetLastModification()](#st_node-sngetlastmodification) + - [Module: stparse](#module-stparse) + - [stparse: stStructType()](#stparse-ststructtype) + - [stparse: stLookup()](#stparse-stlookup) + - [stparse: stAttrValue()](#stparse-stattrvalue) + - [stparse: stGetExpression()](#stparse-stgetexpression) + - [stparse: stCreateStruct()](#stparse-stcreatestruct) + - [stparse: stAddAttr()](#stparse-staddattr) + - [stparse: 
stAddGroup()](#stparse-staddgroup) + - [stparse: stAddValue()](#stparse-staddvalue) + - [stparse: stFreeInf()](#stparse-stfreeinf) + - [stparse: Using Fields Directly](#stparse-using-fields-directly) - [IV Memory Management in Centrallix](#iv-memory-management-in-centrallix) - - [void* nmMalloc(int size)](#void-nmmallocint-size) - - [void nmFree(void* ptr, int size)](#void-nmfreevoid-ptr-int-size) - - [void nmStats()](#void-nmstats) - - [void nmRegister(int size, char* name)](#void-nmregisterint-size-char-name) - - [void nmDebug()](#void-nmdebug) - - [void nmDeltas()](#void-nmdeltas) - - [void* nmSysMalloc(int size)](#void-nmsysmallocint-size) - - [void nmSysFree(void* ptr)](#void-nmsysfreevoid-ptr) - - [void* nmSysRealloc(void* ptr, int newsize)](#void-nmsysreallocvoid-ptr-int-newsize) - - [char* nmSysStrdup(const char* str)](#char-nmsysstrdupconst-char-str) + - [nmMalloc()](#nmmalloc) + - [nmFree()](#nmfree) + - [nmStats()](#nmstats) + - [nmRegister()](#nmregister) + - [nmDebug()](#nmdebug) + - [nmDeltas()](#nmdeltas) + - [nmSysMalloc()](#nmsysmalloc) + - [nmSysRealloc()](#nmsysrealloc) + - [nmSysStrdup()](#nmsysstrdup) + - [nmSysFree()](#nmsysfree) - [V Other Utility Modules](#v-other-utility-modules) - [A. XArray (XA) - Arrays](#axarray-xa---arrays) - [xaInit(pXArray this, int init_size)](#xainitpxarray-this-int-init_size) @@ -115,497 +170,1032 @@ License: Copyright (C) 2001-2011 LightSys Technology Services. See LICENSE.txt - [B. Object attribute enumeration, getting, and setting.](#bobject-attribute-enumeration-getting-and-setting) - [C. Object querying (for subobjects)](#cobject-querying-for-subobjects) + + ## I Introduction -An objectsystem driver's purpose is to provide access to a particular type of local or network data/resource, and to organize that data in a tree- structured heirarchy that can be integrated into the Centrallix's ObjectSystem. 
This tree structure will vary based on the data being presented, but will fit the basic ObjectSystem model of a heirarchy of objects, each having attributes, perhaps some methods, and possibly content. +An objectsystem driver's purpose is to provide access to a particular type of local or network data/resource. Specific information about the resource to be accessed (such as credentials for a database, queries for selecting data, the auth token for an API, etc.) is stored in a file that is opened by the relevant driver. For example, the query driver (defined in `objdrv_query.c`) opens `.qy` files, which store one or more ObjectSQL queries used to fetch data. -Each objectsystem driver will implement this subtree structure rooted at what is called the "node" object. The node has a specifically recognizable object type which the ObjectSystem Management Layer uses to determine which OS Driver to pass control to. Normally, the 'node' object is a UNIX file either with a particular extension registered with the OSML, or a UNIX file residing in a directory containing a '.type' file, which contains the explicit object type for all objects in that directory without recognizable extensions. +When the object system starts up, each driver registers one or more type names that it supports (e.g. `"system/query"` for the query driver). When a file is opened, the object system uses the file's type name to select which driver to use. It finds this type name with one of two strategies. If the file has an extension (e.g. `example.qy`), that extension can be mapped to a type name using `types.cfg` (e.g. `.qy` maps to `"system/query"`). Alternatively, the file may reside in a directory containing a `.type` file which explicitly specifies the type name for all files in that directory without recognizable extensions. -Normally, objectsystem drivers will be able to manage any number of 'node' objects and the subtrees rooted at them. 
Each 'node' object will normally relate to a particular instance of a network resource, or in some cases, a group of resources that are easily enumerated. For example, a POP3 server would be a network resource that an OS driver could be written for. If the network had multiple POP3 servers, then that one OS driver would be able to access each of them using different node objects. However, if somehow the OS driver were able to easily enumerate the various POP3 servers on the network (i.e., they responded to some kind of hypothetical broadcast query), then the OS driver author could optionally design the driver to list the POP3 servers under a single node for the whole network. +Once a file is opened, the driver should organize provided data into a tree-structured hierarchy, which becomes part of the path used by Centrallix's ObjectSystem. For example, when opening `example.qy` in the ObjectSystem, the driver makes `/rows` and `/columns` available, allowing for paths such as `/apps/data/example.qy/rows`. The root of a driver's tree (`example.qy`) is called the driver's "node" object, and most paths traverse the root nodes of multiple drivers. A driver author is free to define any manner of tree structures for representing data available within their driver. However, the structure should fit the basic ObjectSystem model of a hierarchy of objects, each having attributes, and optionally some methods and/or content. -The structure of the subtree beneath the node object is entirely up to the drivers' author to determine; the OSML does not impose any structural restrictions on such subtrees. +A driver can be opened multiple times, leading one driver to have multiple "node" objects, also called instances. Typically, each "node" object relates to a particular instance of a network resource. For example, an instance of a POP3 driver might represent a POP3 server on the network. 
If the network had multiple POP3 servers, this driver could be used to access each of them through different node objects (e.g. `dev.pop3`, `prod.pop3`, etc.). However, if somehow the OS driver were able to easily enumerate the various POP3 servers on the network (i.e., they responded to some kind of hypothetical broadcast query), then the OS driver author could also design the driver to list the POP3 servers under a single node for the whole network. -Here is one example of an OS Driver's node object and subtree (this is for the Sybase OS Driver, objdrv_sybase.c): +The structure of the subtree beneath the node object is entirely up to the driver's author to determine; the OSML does not impose any structural restrictions on such subtrees. Each object within this structure (e.g. `/example.qy`) can have three types of readable data: +- Child objects (e.g. `/rows`) which can have their own data. +- Content, which can be read similarly to reading a file. +- Query data, allowing the object to be queried for information. -``` -OMSS_DB (type = application/sybase) +Thus, parent objects with child objects behave similarly to a directory, although they can still have separate readable data _and_ queryable data. This may seem foreign in the standard file system paradigm; however, it is common for web servers, where opening a directory often returns the `index.html` file in that directory, or some other form of information to allow further navigation. Querying an object was originally intended as a way to quickly traverse its child objects, although queries are not required to be implemented this way.
+ +Below is an example of the Sybase driver's node object and its subtrees of child objects (defined in `objdrv_sybase.c`): + +```sh +OMSS_DB (type = "application/sybase") | - +--- JNetHelp (type = system/table) - | | - | +--- columns (type = system/table-columns) - | | | - | | +--- document_id (type = system/column) - | | | - | | +--- parent_id (type = system/column) - | | | - | | +--- title (type = system/column) - | | | - | | +--- content (type = system/column) - | | - | +--- rows (type = system/table-rows) - | | - | +--- 1 (type = system/row) - | | - | +--- 2 (type = system/row) + +----- JNetHelp (type = "system/table") + | | + | +----- columns (type = "system/table-columns") + | | | + | | +----- document_id (type = "system/column") + | | | + | | +----- parent_id (type = "system/column") + | | | + | | +----- title (type = "system/column") + | | | + | | +----- content (type = "system/column") + | | + | +----- rows (type = "system/table-rows") + | | + | +----- 1 (type = "system/row") + | | + | +----- 2 (type = "system/row") | - +--- Partner (type = system/table) + +----- Partner (type = "system/table") ``` (... and so forth) -In this case the node object would contain the information necessary to access the database, such as server name, database name, max connections to pool, and so forth. More about the node object and managing its parameters will be discussed later in this document. +In this case, the `OMSS_DB` file becomes the driver's node object. This file would contain the information necessary to access the database, such as server name, database name, max connections to pool, and so forth. -OS Drivers support several primary areas of functionality: opening and closing objects, reading and writing object content (if the object has content), setting and viewing object attributes, executing object methods, and querying an object's child objects based on name and/or attribute values. 
Drivers will also support the creation and deletion of objects and/or a set of child objects. +OS Drivers support several primary areas of functionality: +- Opening and closing objects. +- Creating and deleting node objects (optional). +- Reading and writing object content (optional). +- Getting and (optionally) setting object attributes. +- Executing object methods (optional). +- Querying data attributes (optional). -## II Interface -This section describes the standard interface between the OSML and the ObjectSystem driver itself. - -### A. Initialization -Each OS Driver will have an initialization function, normally named xxxInitialize() where 'xxx' is the driver's abbreviative prefix. This prefix should be attached to each and every function within the OS driver for consistency and project management. Normally 'xxx' is two to four characters, all lowercase. This initialization function is called when the Centrallix starts up, and at least at the present time, this initial call to the OS driver must be manually added to the appropriate startup code, currently found in 'centrallix.c'. - -Within the initialization function, the driver should initialize all necessary global variables and register itself with the OSML. Global variables should all be placed inside a single global 'struct', which is normally named similarly to the driver's prefix, except normally in all uppercase. Under no circumstances should global variables be accessed outside of the module, except via the module's functions. +Using the example above, we can query from the database using a statement like `select :title from /OMSS_DB/JNetHelp/rows`, which will open a sybase driver instance, then open a query and repeatedly fetch rows, getting the `title` attribute from each row. -To register with the OSML, the driver must first allocate an ObjDriver structure and fill in its contents. 
- pObjDriver drv; - - drv = (pObjDriver)nmMalloc(sizeof(ObjDriver)); +## II Interface +This section describes the standard interface between the OSML and the ObjectSystem driver itself. Every driver should implement certain required functions. (**Note**: Many drivers "implement" some required functions to simply fail with a not implemented or not supported error. For example, most database drivers "implement" `Read()` and `Write()` this way because database content should be queried, not read). Various optional functions are also available, which a driver is not required to implement. + + +The driver should implement an `Initialize()` function, as well as the following (* indicates required functions): +| Function Name | Description +| --------------------------------------------------------- | ------------ +| [Open](#function-open)* | Opens a new driver instance object on a given root node. +| [OpenChild](#function-openchild) | ??? +| [Close](#function-close)* | Close an open object created by either `Open()` or `QueryFetch()`. +| [Create](#function-create) | Create a new driver root node object. +| [Delete](#function-delete) | Delete an existing driver root node object. +| [DeleteObj](#function-deleteobj)* | ??? +| [OpenQuery](#function-openquery)** | Start a new query for child objects of a given object. +| [QueryDelete](#function-querydelete) | Delete specific objects from a query's result set. +| [QueryFetch](#function-queryfetch)** | Open the next child object in the query's result set. +| [QueryCreate](#function-querycreate) | ??? +| [QueryClose](#function-queryclose)** | Close an open query. +| [Read](#function-read)* | Read content from the object. +| [Write](#function-write)* | Write content to the object. +| [GetAttrType](#function-getattrtype)* | Get the type of a given object's attribute. +| [GetAttrValue](#function-getattrvalue)* | Get the value of a given object's attribute. 
+| [GetFirstAttr](#function-getfirstattr--getnextattr)* | Get the name of the object's first attribute. +| [GetNextAttr](#function-getfirstattr--getnextattr)* | Get the name of the object's next attribute. +| [SetAttrValue](#function-setattrvalue) | Set the value of an object's attribute. +| [AddAttr](#function-addattr) | Add a new attribute to an object. +| [OpenAttr](#function-openattr) | Open an attribute as if it were an object with content. +| [GetFirstMethod](#function-getfirstmethod--getnextmethod) | Get the name of an object's first method. +| [GetNextMethod](#function-getfirstmethod--getnextmethod) | Get the name of an object's next method. +| [ExecuteMethod](#function-executemethod) | Execute a method with a given name and optional parameter string. +| [PresentationHints](#function-presentationhints) | Get info about an object's attributes. +| [Info](#function-info)* | Get info about an object instance. +| [Commit](#function-commit) | Commit changes made to an object. +| [GetQueryCoverageMask](#function-getquerycoveragemask) | ??? +| [GetQueryIdentityPath](#function-getqueryidentitypath) | ??? + +_*Function is always required._ + +_**Function is required to support queries._ + + +--- +### Abbreviative Prefix +Each OS Driver will have an abbreviation prefix, such as `qy` for the query driver or `sydb` for the sybase database driver. This prefix should be prepended to the start of every public function name within the OS driver for consistency and scope management (e.g. `qyInitialize()`, `sydbQueryFetch()`, etc.). Normally, a driver's abbreviation prefix is two to four characters, all lowercase and may be the same as a file extension the driver supports. However, this is not an absolute requirement (see the cluster driver in `objdrv_cluster.c` which supports `.cluster` files using an abbreviation prefix of `cluster`). + +This document uses `xxx` to refer to an unspecified abbreviative prefix. 
+ +--- +### Internal Functions +It is highly likely that driver authors will find shared functionality in the following functions, or wish to abstract out functionality from any of them for a variety of reasons. When creating additional internal functions in this way, they should be named using the convention of `xxx_internal_FunctionName()`, or possibly `xxxi_FunctionName()` for short. + +--- +### Function: Initialize +```c +/*** @returns 0 if successful, or + *** -1 if an error occurred. + ***/ +int xxxInitialize(void) +``` +- ⚠️ **Warning**: Currently, the success/failure of this function is ignored by the caller. +- 📖 **Note**: Unlike other functions defined in the driver, each driver author must manually add this call to the start up code, found in the `cxDriverInit()` function in `centrallix.c`. -This involves setting a large number of fields to the appropriate entry points within the OS Driver, as well as telling the OSML what object type(s) are handled by the driver and giving the OSML a description of the driver. A list of the required entry point functions / fields follows: +The initialization function is called when the Centrallix starts up, and should register the driver with the OSML and initialize necessary global variables. It is recommended to place global variables in a single global 'struct' that is named with the driver's prefix in all uppercase. Global variables should **NOT** be accessed from outside the driver. Instead, the driver should define functions to access them, allowing it to abstract details away from other drivers. -| Function/Field | Description -| -------------------- | ------------ -| Open | Function that the OSML calls when the user opens an object managed by this driver. -| Close | Close an open object. -| Create | Create a new object. -| Delete | Delete an existing object. -| OpenQuery | Start a query for child objects. -| QueryDelete | Delete all objects in the query result set. 
-| QueryFetch | Open the next child object in the query's result set. -| QueryClose | Close an open query. -| Read | Read content from the object. -| Write | Write content to the object. -| GetAttrType | Get the type of an object's attribute. -| GetAttrValue | Get the value of an object's attribute. -| GetFirstAttr | Get the first attribute associated with the object. -| GetNextAttr | Get the next attribute associated with the object. -| SetAttrValue | Set the value of an attribute. -| AddAttr | Add a new attribute to an object. -| OpenAttr | Open an attribute as if it were an object with content. -| GetFirstMethod | Get the first method of the object. -| GetNextMethod | Get the next method of an object. -| ExecuteMethod | Execute a method with an optional string parameter. +To register itself with the OSML, the driver should first allocate an ObjDriver structure and initialize its contents: -The only method that can be set to NULL is the QueryDelete method, in which case the OSML will call QueryFetch() and Delete() in succession. However, if the underlying network resource has the capability of intelligently deleting objects matching the query's criteria, this method should be implemented (as with a database server). +```c +pObjDriver drv = (pObjDriver)nmMalloc(sizeof(ObjDriver)); +if (drv == NULL) goto error_handling; +memset(drv, 0, sizeof(ObjDriver)); +... +``` -Another field in the driver structure is the Capabilities field. This field is a bitmask, and can currently contain zero or more of the following options: +To initialize this struct, the driver must: +- Provide a name (in `drv->Name`). +- Provide an array of supported root node types (in `drv->RootContentTypes`). +- Provide capability flags (in `drv->Capabilities`). +- Provide function pointers to implemented functions (see [II Interface](#ii-interface) for a list). 
-- OBJDRV_C_FULLQUERY: Indicates that this objectsystem driver will intelligently process the query's expression tree specified in the OpenQuery call, and will only return objects that match that expression. If this flag is missing, the OSML will filter objects returned by QueryFetch so that the calling user does not get objects that do not match the query. Typically this is set by database server drivers. +#### Name +The `name` field is a 64 character buffer (allowing names up to 63 characters, with a null terminator). It usually follows the format of the driver abbreviation prefix (in all uppercase), followed by a dash, followed by a descriptive name for the driver. - THE ABOVE IS OUT-OF-DATE. From now on, a driver can determine whether to handle the Where and OrderBy on a per-query basis, by setting values in the ObjQuery structure used when opening a new query. This is because a driver may be able to handle Where and OrderBy for some object listings but not for others. +For example: +```c +if (strcpy(drv->Name, "SYBD - Sybase Database Driver") == NULL) goto error_handling; +``` -- OBJDRV_C_TRANS: Indicates that this objectsystem driver requires transaction management by the OSML's transaction layer (the OXT layer). OS drivers that require this normally are those that for some reason cannot complete operations in independence from one another. For example, with a database driver, the creation of a new row object and the setting of its attributes must be done as one operation, although the operation requires several calls from the end user's process. The OXT allows for the grouping of objectsystem calls so that the os driver does not have to complete them independently, but instead can wait until several calls have been made before actually completing the operation. +#### RootContentTypes +The `RootContentTypes` field is an XArray containing a list of strings, representing the type names that the driver can open. 
This should only include types the driver will handle as root nodes, not other objects created by the driver. Thus, the sybase driver would include `"application/sybase"`, but not `"system/table"`. -The 'Name' field should be filled in with a description of the OS driver, with a maximum length of 63 characters (plus the string null terminator). Normally, the 2-4 letter prefix of the driver is included at the beginning of 'Name', such as "UXD - UNIX filesystem driver". +For example: +```c +if (xaInit(&(drv->RootContentTypes), 2) != 0) goto error_handling; +if (xaAddItem(&(drv->RootContentTypes), "application/sybase") < 0) goto error_handling; +if (xaAddItem(&(drv->RootContentTypes), "system/query") < 0) goto error_handling; +``` -Finally, the 'RootContentTypes' field is an XArray containing a list of strings, each of which specifies the node object types that the driver will handle. Such types are added to this XArray using the normal XArray utility functions, such as: +- 📖 **Note**: To make a specific file extension (like `.qy`) open in a driver, edit `types.cfg` to map that file extension to an available root content type supported by the driver (such as `"system/query"`). - xaInit(&drv->RootContentTypes, 16); - xaAddItem(&drv->RootContentTypes, "system/file"); - xaAddItem(&drv->RootContentTypes, "system/directory"); +#### Capabilities +The capabilities field is a bitmask which can contain zero or more of the following flags: -When the structure has been filled out, the os driver should call the OSML to register itself, using the objRegisterDriver function: +- `OBJDRV_C_FULLQUERY`: Indicates that this objectsystem driver will intelligently process the query's expression tree specified in the OpenQuery call, and will only return objects that match that expression. If this flag is missing, the OSML will filter objects returned by QueryFetch so that the calling user does not get objects that do not match the query. Typically this is set by database server drivers.
+ - > **THE ABOVE IS OUT-OF-DATE** (May 16th, 2022): A driver can now determine whether to handle the Where and OrderBy on a per-query basis, by setting values in the ObjQuery structure used when opening a new query. This allows a driver to handle Where and OrderBy for some object listings but not others. - objRegisterDriver(drv); +- `OBJDRV_C_TRANS`: Indicates that this objectsystem driver requires transaction management by the OSML's transaction layer (the OXT layer). OS drivers that require this normally are those that for some reason cannot complete operations in independence from one another. For example, with a database driver, the creation of a new row object and the setting of its attributes must be done as one operation, although the operation requires several calls from the end user's process. The OXT allows for the grouping of objectsystem calls so that the os driver does not have to complete them independently, but instead can wait until several calls have been made before actually completing the operation. -The initialization function should return 0 to indicate success, or -1 on failure. Currently, initialization success/failure is not verified by lsmain.c. +#### Registering the Driver Struct +When all values within the structure have been initialized, the driver should call the OSML to register itself, using the `objRegisterDriver()` function: -The driver should NOT nmFree() the allocated driver structure unless the objRegisterDriver() routine fails (returns -1). +```c +if (objRegisterDriver(drv) != 0) goto error_handling; +``` -Note that the RootContentTypes handled by the driver should only include the types of the objects this driver will handle as node objects. For instance, the Sybase database access driver uses "application/sybase" as its top level type. It won't register such things as "system/table". -### B.
Opening And Closing Objects -As an overview, the normal procedure for the open routine to follow is this: +--- +### Function: Open() +```c +void* xxxOpen(pObject parent, int mask, pContentType sys_type, char* usr_type, pObjTrxTree* oxt); +``` -1. Access the node object, or create it, depending on whether the object already exists as well as the open mode flags indicated by the end-user. -2. Upon successful node object access, determine what additional components of the pathname are to be handled by this driver, and verify that they can be opened, depending on the object's open mode (CREAT, EXCL, etc.) -3. If it hasn't been already, allocate a structure that will represent this open object and contain information about it and how we're to handle it. It should include a pointer to the node object. -4. Perform any operations inherent in the open process that have not already been performed (such as reading database table information, etc., when a db table's row is being accessed). -5. Return a pointer to the structure allocated in (3) as a void pointer. The OSML will pass this pointer back to the driver on subsequent calls that involve this object. +The `Open()` function opens a given file to create a new driver instance. This procedure normally includes the following steps: -The first basic part of the OS driver consists of the Open and Close routines, normally named 'xxxOpen' and 'xxxClose' within the driver, where 'xxx' is the driver's prefix. The Close routine is normally fairly simple, but the Open routine is one of the most complicated routines in a typical OS driver, for the Open routine must parse the subtree pathname beneath the node object. For example, if the node object had a pathname like: +1. Access or create the node object, depending on specified flags and whether or not it already exists. +2. Parse additional contents of the path after the root node. +3. Allocate a structure that will represent the open object, including a pointer to the node object. 
+4. Perform other opening operations (such as reading database table information, etc., when a db table's row is being accessed). +5. Return a pointer to the node instance as a void pointer. This pointer will be passed as `void* inf_v` to the driver in subsequent calls involving this object (except the Query functions, discussed below). - /datasources/OMSS_DB +- 📖 **Note - Transactions**: If the os driver specified the `OBJDRV_C_TRANS` capability, it must respect the current state of the user's transaction. If a new object is being created, an object is being deleted, or other modifications/additions are being performed, and if the OXT layer indicates a transaction is in process, the driver must either complete the current transaction and then complete the current call, or else add the current delete/create/modify call to the transaction tree (in which case the tree item is preallocated; all the driver needs to do is fill it in). This is handled using the transaction tree parameter (`oxt : pObjTrxTree*`). The transaction layer is discussed in depth in the ??? section. + + -and the user opened an object called: +#### Accessing the Node Object +If `O_CREAT` and `O_EXCL` are both specified in `parent->Mode`, the driver should **only** create a new file and fail if the file already exists (refusing to open and read it). Otherwise, the driver should read an existing file, or create one if it does not exist and `O_CREAT` is specified, failing if no file can be read or created. - /datasources/OMSS_DB/JNetHelp/rows/1 +#### Parsing Path Contents +The task of parsing the provided path into the subtree beneath its root node is one of the more complex operations for a driver. For example, the path to a driver's root node might be `/datasources/OMSS_DB` and the user opens an object called `/datasources/OMSS_DB/JNetHelp/rows/1`.
In this case, the OS driver must parse the meaning of the subtree path `JNetHelp/rows/1`, storing the data targeted by the user into the driver instance to allow later method calls to access the correct data. -the OS driver would have to determine what the subtree pathname 'JNetHelp/rows/1' means, since this path will mean different things to different os drivers. +#### Parameters +The `Open()` routine is called with five parameters: -The Open routine also must determine whether the object already exists or not, and if not, whether to create a new object. This logic is largely dependent on the obj->Mode flags, as if O_CREAT is included, the driver must attempt to create the object if it does not already exist, and if O_EXCL is included, the driver must refuse to open the object if it already exists, as with the UNIX open() system call semantics. +- `obj : pObject`: A pointer to the Object structure maintained by the OSML. This structure includes some useful fields: + + - `obj->Mode : int`: A bitmask of the O_* flags, which include: `O_RDONLY` (read only), `O_WRONLY` (write only), `O_RDWR` (read/write), `O_CREAT` (create), `O_TRUNC` (truncate), and `O_EXCL` (exclusive, see above). + + - `obj->Pathname : pPathname`: A pointer to a Pathname struct (defined in `include/obj.h`) which contains the complete parsed pathname for the object. This provides a buffer for the pathname as well as an array of pointers to the pathname's components. The function `obj_internal_PathPart()` can be used to obtain at will any component or series of components of the pathname. -Finally, if the os driver specified a capability of OBJDRV_C_TRANS, it must pay attention to the current state of the end-user's trans- action.
If a new object is being created, an object is being deleted, or other modifications/additions are being performed, and if the OXT layer indicates a transaction is in process, the driver must either complete the current transaction and then complete the current call, or else add the current delete/create/modify call to the transaction tree (in which case the tree item is preallocated; all the driver needs to do is fill it in). The transaction layer will be discussed in depth later in this document. + - `obj->Pathname->OpenCtl : pStruct[]`: Parameters for the open() operation, as defined by the driver author. These are specified in the path in a similar way to URLs (`example.qy?param1=value&param2=other_value`). Drivers typically only use `obj->Pathname->OpenCtl[obj->SubPtr]` (see SubPtr below) to retrieve their own parameters, ignoring parameters passed to other drivers in the path. -As a part of the Open process, the OS driver will normally allocate an internal structure to represent the current open object, and will return that structure as a void* data type in the return value. This pointer will be then passed to each of the other driver entry point functions, with the exception of QueryFetch, QueryDelete, and Query- Close, which will be discussed later. + - `obj->SubPtr : short`: The number of components in the path that are a part of the path to the root node object, including the `.` for the top level directory. For example, for a path of `/data/file.csv`, the path would be internally represented as `./ data/ file.csv`, so SubPtr is 3. -The Open() routine is called with five parameters: + - `obj->SubCnt : short`: _The driver should set this value_ to show the number of components it controls. This includes the root node object, so `SubCnt` will always be at least 1. For example, when opening `/data/file.csv/rows/1`, the CSV driver will read the `SubPtr` of 3 (see above), representing `./ data/ file.csv`.
It will then set a `SubCnt` of 3, representing that it will control `file.csv /rows /1`. (The driver only sets `SubCnt`, `SubPtr` is provided.) -- obj (pObject) - This is a pointer to the Object sturcture maintained by the OSML. This structure will contain some important fields for processing the open() request. + - `obj->Prev : pObject`: The underlying object as opened by the next-lower-level driver. The file can be accessed and parsed by calling functions and passing this pointer to them (such as the st_parse functions, see below). **DO NOT attempt to open the file directly with a call like `fopen()`,** as this would require hard coding the path to the root directory of the object system, which *will* break if the code runs on another machine. - obj->Mode is a bitmask of the O_* flags, which include O_RDONLY, O_WRONLY, O_RDWR, O_CREAT, O_TRUNC, and O_EXCL. + - `obj->Prev->Flags : short`: Contains some useful flags about the underlying object, such as: + - `OBJ_F_CREATED`: The underlying object was just created by this open() operation. In that case, this driver is expected to create the node with `snNewNode()` (see later in this document) as long as `obj->Mode` contains `O_CREAT`. + - obj->Pathname is a Pathname structure which contains the complete parsed pathname for the object. This structure is defined in the file include/obj.h, and has a buffer for the pathname as well as an array of pointers to the pathname's components. The function obj_internal_PathPart() can be used to obtain at will any component or series of components of the pathname. +- `mask : int`: The permission mask to be given to the object, if it is being created. Typically, this will only apply to files and directories, so most drivers can ignore it. The values are the same as the UNIX [octal digit permissions](https://en.wikipedia.org/wiki/Chmod#:~:text=Octal%20digit%20permission) used for the `chmod()` command. - obj->Pathname->OpenCtl[] contains parameters to the open() operation. 
Frequently these params provide additional information on how to open the object. The use of these parameters is determined by the author of the objectsystem driver. The parameters are those passed in normal URL fasion (?param=value, etc.). Typically, the only OpenCtl of interest is going to be obj->Pathname->OpenCtl[obj->SubPtr] (see below for SubPtr meaning). +- `sys_type : pContentType`: Indicates the content type of the node object as determined by the OSML. The ContentType structure is defined in `include/obj.h`. `sys_type->Name` lists the name of the content type (e.g. `"system/query"` for the query driver). + - obj->SubPtr is the number of components in the path that are a part of the node object's path. For example, in the above path of '/datasources/OMSS_DB', the path would be internally represented as './datasources/ OMSS_DB', and the SubPtr would be 3. +- `usr_type : char*`: The object type requested by the user. This is normally used when creating a new object, though some drivers also use it when opening an existing object. For example, the reporting driver generates HTML report text or plaintext reports if `usr_type` is `"text/html"` or `"text/plain"` (respectively). - obj->SubCnt reflects the number of components of the path which are under the control of the current driver. This includes the node object, so SubCnt will always be at least 1. For example, when opening '/data/file.csv/rows/1', and the driver in question is the CSV driver, SubPtr would be 3 (includes an "invisible" first component), from '/data/file.csv', and SubCnt would be 3, from 'file.csv/rows/1'. The driver will need to SET THE SUBCNT value in its Open function. SubPtr is already set. +- `oxt : pObjTrxTree*`: The transaction tree, used when the driver specifies the `OBJDRV_C_TRANS` capability. More on this field later. Non-transaction-aware drivers can safely ignore this field. + + 📖 **Note**: Yes, this param *is* a pointer to a pointer. Essentially, a pointer passed by reference. 
- obj->Prev is the underlying object as opened by the next-lower-level driver. It is the duty of this driver to parse the content of that object and do something meaningful with it. - obj->Prev->Flags contains some critical infor- mation about the underlying object. If it contains the flag OBJ_F_CREATED, then the underlying object was just created by this open() operation. In that case, this driver is expected to create the node with snNewNode() (see later in this document) as long as obj->Mode contains O_CREAT. +The `Open()` routine should return a pointer to an internal driver structure on success, or `NULL` on failure. It is normal to allocate one such structure per `Open()` call, and for one of the structure fields to point to shared data describing the node object. Accessing the node object is described later in this document. -- mask (int) - Indicates the security mask to be given to the object if it is being created. Typically, this will only apply to files and directories. The values are the same as UNIX chmod() type values. +While driver instance structures may vary, some fields are common in most drivers (`inf` is the pointer to the structure here): -- systype (pContentType) - This param indicates the content type of the node object as determined by the OSML. The ContentType structure is defined in include/ obj.h, and includes among other things the name of the content type. For example, for the reporting driver, this type would be "system/report". +| Field | Type | Description +| ---------- | --------- | ------------ +| inf->Obj | pObject | A copy of the `obj` pointer passed to `Open()`. +| inf->Mask | int | The `mask` argument passed to `Open()`. +| inf->Node | pSnNode | A pointer to the node object. This can come from `snNewNode()` or `snReadNode()` (for structure files), or other node struct information. 
-- usrtype (char*) - This param is the requested object type by the user and is normally used when creating a new object, though under some circumstances it may change the way the open operates on an existing object. For example, the reporting driver can change whether it generates HTML report text or plaintext reports based on usrtype being either "text/html" or "text/plain". -- oxt (pObjTrxTree*) - This param is only used by object drivers that specified a capability of OBJDRV_C_TRANS. More on this field later. For non-transaction-aware drivers, this field can be safely ignored. +--- +### Function: OpenChild() +*(Optional)* +```c +void* xxxOpenChild(void* inf_v, pObject obj, char* child_name, int mask, pContentType sys_type, char* usr_type, pObjTrxTree* oxt); +``` +**No documentation provided.** - Yes, this param *is* a pointer to a pointer. Essentially, a pointer passed by reference. +--- +### Function: Close() +```c +int xxxClose(void* inf_v, pObjTrxTree* oxt); +``` +The close function closes a driver instance, freeing all allocated data and releasing all shared memory such as open connections, files, or other driver instances. The driver must ensure that all memory allocated by originally opening the object (or allocated by other functions that may be called on an open object) is properly deallocated. This includes the internal structure returned by `Open()`, or by `QueryFetch()`, which is passed in as `inf_v`. The driver may also need to decrement the Open Count (`node->OpenCnt--`) if it had to increment this value during `Open()`. Before doing so, it should also perform a `snWriteNode()` to write any modified node information to the node object. -The Open routine should return its internal structure pointer on success, or NULL on failure. It is normal to allocate one such structure per Open call, and for the structure to point, among other things, to shared data describing the node object. Accessing the node object is described later in this document. 
+- 📖 **Note**: Remember that the passed driver instance may originally be from a call to `Open()` or a call to `QueryFetch()`. -It is important to know what kinds of fields normally are placed in the allocated data structure returned by Open. These fields are all determined by the driver author, but here are a few typical ones that are helpful to have ("inf" is the pointer to the structure here): +- 📖 **Note**: Even if close fails, the object should still be closed in whatever way is possible. The end-user should deal with the resulting situation by reviewing the `mssError()` messages left by the driver. -| Field | Type | Description -| ---------- | --------- | ------------ -| inf->Obj | pObject | This is a copy of the 'obj' pointer passed to the Open routine. -| inf->Mask | int | The 'mask' argument passed to Open. -| inf->Node | pSnNode | A pointer to the node object, as returned from snNewNode() or snReadNode(), or if structure files aren't being used as the node content type, a pointer to whatever structure contains information about the node object. +- 📖 **Note**: Information may be left unfreed if it is stored in a cache for later use. -The Close() routine is called with two parameters: +The `Close()` routine is called with two parameters: | Param | Type | Description | ------ | ------------ | ------------ -| inf_v | void* | This param is the pointer that the Open routine returned. Normally the driver will cast the void* parameter to some other structure pointer to access the object's information. -| oxt | pObjTrxTree* | The transaction tree pointer. - -The Close routine should return 0 on success or -1 on failure. The os driver must make sure it properly deallocates the memory used by originally opening the object, such as the internal structure returned by open and passed in as inf_v. +| inf_v | void* | A driver instance pointer (returned from `Open()` or `QueryFetch()`). 
+| oxt | pObjTrxTree* | The transaction tree pointer for the `OBJDRV_C_TRANS` capability. -Note the semantics of a Close failure - the object should still be closed in whatever way is still meaningful. The end-user must deal with the situation by reviewing the returned mssError messages. +The Close routine should return 0 on success or -1 on failure. -Before exiting, the Close routine should make sure it decrements the Open Count (node->OpenCnt--). Before doing so, it should also perform a snWriteNode() to write any modified node information back to the node object. -### C. Creating and Deleting Objects. -The Create and Delete functions are used for creating and deleting objects. Normally, the os driver will process the Pathname in the same manner for Create and Delete as for Open, thus such functionality could be placed in another function. +### Function: Create() +```c +int xxxCreate(pObject obj, int mask, pContentType sys_type, char* usr_type, pObjTrxTree* oxt); +``` +The `Create()` function is used to create a new object, and uses the same parameters as `Open()` (documented in detail above), but returns 0 on success and -1 on error. This often means adding a new file to the file system to represent the object. Many drivers do not implement this and recommend that driver end-users create files using a standard text editor or programmatically using more general means, such as general structure file generation. If implemented, this function frequently requires very similar path parsing functionality to `Open()`. -As a side note, within Centrallix, the standard function naming convention is to use xxx_internal_FunctionName for functions that are more or less internal to the module and not a part of any standard interface. +- 📖 **Note**: For many drivers, the `Create()` function calls the driver's `Open()` function with `O_CREAT`, then calls its `Close()` function, although some drivers may manage this differently. -The Create routine has parameters identical to the Open routine.
It should return 0 on success and -1 on error. -The Delete routine is passed the following parameters: +### Function: Delete() +```c +int xxxDelete(pObject obj, pObjTrxTree* oxt); +``` +The `Delete()` function is used to delete an object, which often means removing a file from the file system. The Delete routine is passed the following parameters: | Param | Type | Description | ------ | ------------- | ------------ | obj | pObject | The Object structure pointer, used in the same way as in Open and Delete. -| oxt | pObjTrxTree* | The transaction tree pointer. +| oxt | pObjTrxTree* | The transaction tree pointer for the `OBJDRV_C_TRANS` capability. Delete should return 0 on success and -1 on failure. -For many objectsystem drivers, the Create function simply calls the driver's internal Open() with O_CREAT and then its internal Close, although some drivers could manage Create differently from Open. -### D. Reading and Writing Object Content. -Some, but not all, objects will have content. If the object does or can have content, the driver should handle these functions as is appropriate. Otherwise, the driver should return a failure code (-1) from these functions. +### Function: DeleteObj() +```c +int xxxDeleteObj(void* inf_v, pObjTrxTree* oxt); +``` +**No documentation provided.** + + +### Function: Read() +```c +int xxxRead(void* inf_v, char* buffer, int max_cnt, int offset, int flags, pObjTrxTree* oxt); +``` + + +The `Read()` function reads content from objects that have content, similar to reading content from a file. If the object does or can have content, the driver should handle these functions as is appropriate. Otherwise, the driver should return a failure code (-1) and call `mssError()` in these functions. -The Read routine reads content from the object, as if reading from a file.
The parameters passed are almost identical to those used in the fdRead command in MTASK: +The parameters passed are intentionally similar to the `fdRead()` function in `mtask.c`: | Parameter | Type | Description | --------- | ------------- | ------------ -| inf_v | void* | The generic pointer to the structure returned from Open(). -| buffer | char* | The destination buffer for the data being read in. -| maxcnt | int | The maximum number of bytes to read into the buffer. -| flags | int | Either 0 or FD_U_SEEK, in which case the user is specifying the seek offset for the read in the 5th argument. Of course, not all objects will be seekable, and furthermore, some of the objects handled by the driver may have full or limited seek functionality, even though others may not. -| arg | int | Extra argument, currently only used to specify an optional seek offset. -| oxt | pObjTrxTree* | The transaction tree pointer. +| inf_v | void* | A driver instance pointer (returned from `Open()` or `QueryFetch()`). +| buffer | char* | The buffer where read data should be stored. +| max_cnt | int | The maximum number of bytes to read into the buffer. +| flags | int | Either `0` or `FD_U_SEEK`. If `FD_U_SEEK` is specified, the caller should specify a seek offset in the `offset` argument. +| offset | int | The optional seek offset, only used when `FD_U_SEEK` is set in `flags`. +| oxt | pObjTrxTree* | The transaction tree pointer for the `OBJDRV_C_TRANS` capability. -The Write routine is very similar, except that instead of 'maxcnt', the third argument is 'cnt', and specifies how much data is in the buffer waiting to be written. +- 📖 **Note**: Not all objects are seekable, and some of the objects handled by the driver may have limited seek functionality, even if others do not. Each of these routines should return -1 on failure and return the number of bytes read/written on success. At end of file or on device hangup, 0 should be returned once, and then subsequent calls should return -1.
-### E. Querying for Child Objects. -Many objects will have the capability of having sub-objects beneath them, called child objects. In such a case, the parent object becomes a directory of sorts, even though the parent object may also have content, something which is somewhat foreign in the standard filesystem world, but is common for web servers, where opening a directory returns the file 'index.html' on many occasions. -To enumerate a parent object's child objects, the query functions are used. A query may have a specific criteria so that only objects having certain attributes will be listed. As mentioned earlier in this document, a driver may or may not choose to intelligently handle those criteria. The driver has the option of always enumerating all child objects via its query functions, and allowing the OSML filter them and only return to the user the objects that match the criteria. But it also can do the filtering itself or, more typically, pass the filtering on to the source of the data the driver manages, as with a database server. +### Function: Write() +```c +int xxxWrite(void* inf_v, char* buffer, int cnt, int offset, int flags, pObjTrxTree* oxt); +``` + +The `Write()` function is very similar to the `Read()` function above, allowing the caller to write data to objects of supporting drivers with content. However, the third argument (`max_cnt`) is replaced with `cnt`, specifying the number of bytes of data in the buffer that should be written. + + +### Function: OpenQuery() +```c +void* xxxOpenQuery(void* inf_v, pObjQuery query, pObjTrxTree* oxt); +``` +The `OpenQuery()` function opens a new query instance struct for fetching query results from a specific driver instance. Queries are often used to enumerate an object's child objects, although this is not a requirement. 
Queries may include specific criteria, and the driver may decide to intelligently handle them (either manually or, more often, by passing them on to a lower level driver or database) or simply to enumerate all results with its query functions. In the latter case, the OSML will filter results and only return objects that match the criteria to the user. + +`OpenQuery()` is passed three parameters: +| Parameter | Type | Description +| --------- | ------------- | ------------ +| inf_v | void* | A driver instance pointer (returned from `Open()` or `QueryFetch()`). +| query | pObjQuery | A query structure created by the object system. +| oxt | pObjTrxTree* | The transaction tree pointer for the `OBJDRV_C_TRANS` capability. + +The `query : pObjQuery` parameter contains several useful fields: +| Parameter | Type | Description +| --------------- | ----------------------- | ------------ +| query->QyText | char* | The text specifying the criteria (i.e., the WHERE clause, in Centrallix SQL syntax). +| query->Tree | void* (pExpression) | The compiled expression tree. This expression evaluates to a nonzero value for `true` if the where clause is satisfied, or zero for `false` if it is not. +| query->SortBy[] | void*[] (pExpression[]) | An array of expressions giving the various components of the sorting criteria. +| query->Flags | int | The driver should set and/or clear the `OBJ_QY_F_FULLQUERY` and `OBJ_QY_F_FULLSORT` flags, if needed. -The query mechanism can also be used to delete a set of child objects, optionally matching a certain criteria. The QueryDelete method may be left NULL in the ObjDriver structure if the driver does not implement full query support, in which case the OSML will iterate through the query results and delete the objects one by one. +The `OBJ_QY_F_FULLQUERY` flag indicates that the driver will handle the full WHERE clause specified in `query->Tree`.
This function is passed three arguments: +The `OBJ_QY_F_FULLSORT` flag indicates that the driver will handle all sorting for the data specified in `query->SortBy[]`. -- inf_v (void*) The value returned from Open for this object. +If the driver can easily handle sorting/selection (as when querying a database), it should set these flags. Otherwise, it should let the OSML handle the ORDER BY and WHERE conditions to avoid unnecessary work for the driver author. -- query (pObjQuery) The query structure setup by the OSML. It will contain several key fields: +The `OpenQuery()` function returns a `void*` for the query instance struct, which will be passed to the other query functions (`QueryDelete()`, `QueryFetch()`, and `QueryClose()`). This structure normally points to the driver instance struct to allow easy access to queried data. `OpenQuery()` returns `NULL` if the object does not support queries or if an error occurs, in which case `mssError()` should be called before returning. - query->QyText: the text of the criteria (i.e., the WHERE clause, in Centrallix SQL syntax) - query->Tree: the compiled expression tree, which evaluates to nonzero for true or zero for false as the WHERE clause condition. +### Function: QueryDelete() +*(Optional)* +```c +int xxxQueryDelete(void* qy_v, pObjTrxTree* oxt); +``` + +Deletes results in the query result set, optionally matching certain criteria. `QueryDelete()` is passed two parameters: - query->SortBy[]: an array of expressions giving the various components of the sorting criteria. +| Parameter | Type | Description +| --------- | ------------- | ------------ +| qy_v | void* | A query instance pointer (returned from `OpenQuery()`). +| oxt | pObjTrxTree* | The transaction tree pointer for the `OBJDRV_C_TRANS` capability. - query->Flags: the driver should set and/or clear the flags OBJ_QY_F_FULLQUERY and OBJ_QY_F_FULLSORT if need be. The former indicates that the driver is willing to handle the full WHERE clause (the query->Tree).
The latter indicates that the driver is willing to handle the sorting of the data as well (in query->SortBy[]). If the driver can easily have the sorting/selection done (as when querying an RDBMS), it should set these flags. Otherwise, it should let the OSML take care of the ORDER BY and WHERE conditions. +`QueryDelete()` returns 0 to indicate a successful deletion, or -1 to indicate failure, in which case `mssError()` should be called before returning. -- oxt (pObjTrxTree*) The transaction tree pointer. +If a delete is needed and this method is not implemented, the OSML will iterate through the query results and delete the objects one by one. -The OpenQuery function should return a void* value, which will within the driver point to a structure used for managing the query. This structure will normally have a pointer to the inf_v value returned by Open as well, since inf_v is never passed to QueryFetch, QueryDelete or QueryClose. OpenQuery should return NULL if the object does not support queries or if some other error condition occurs that will prevent the execution of the query. -Once the query is underway with OpenQuery, the user will either start fetching the results with QueryFetch, or will issue a delete operation with QueryDelete. +### Function: QueryFetch() +```c +void* xxxQueryFetch(void* qy_v, pObject obj, int mode, pObjTrxTree* oxt); +``` +The `QueryFetch()` function fetches a driver instance pointer (aka. an `inf_v` pointer) to a child object, or `NULL` if there are no more child objects. It may be helpful to think of `QueryFetch()` as similar to an alternate form of `Open()`, even if your driver does not implement the functionality to `Open()` every object that can be found with `QueryFetch()`. In fact, some drivers may use an internal `Open()` function to generate the opened objects. -The QueryFetch routine should return an inf_v pointer to the child object, or NULL if no more child objects are to be returned by the query. 
Some drivers may be able to use their internal Open function to generate the newly opened object, although others will directly allocate the inf_v structure and fill it in based on the current queried child object. QueryFetch will be passed these parameters: +`QueryFetch()` takes four parameters: -| Parameter | Type | Description -| ---------- | -------------- | ------------ -| qy_v | void* | The value returned by OpenQuery. -| obj | pObject | The newly-created object structure that the OSML is using to track the newly queried child object. -| mode | int | The open mode for the new object, as with obj->Mode in Open(). -| oxt | pObjTrxTree* | The transaction tree pointer. +| Parameter | Type | Description +| ---------- | ------------- | ------------ +| qy_v | void* | A query instance struct (returned by `OpenQuery()`). +| obj | pObject | An object structure that the OSML uses to track the newly queried child object. +| mode | int | The open mode for the new object, the same as `obj->Mode` in `Open()`. +| oxt | pObjTrxTree* | The transaction tree pointer for the `OBJDRV_C_TRANS` capability. -All object drivers will need to add an element to the obj->Pathname structure to indicate the path to the child object being returned. This will involve a process somewhat like this: (given that new_name is the new object's name, qy is the current query structure, which contains a field 'Parent' that points to the inf_v originally returned by Open, and where the inf_v contains a field Obj that points to the Object structure containing a Pathname structure) +The driver should add an element to the `obj->Pathname` structure to indicate the path of the returned child object. This will involve a process somewhat like this, where: +- `new_name : char*` is the new object's name. +- `qy : pMyDriversQueryInf` is the current query structure. +- `qy->Parent->Obj->Pathname : pPathname` points to the affected Pathname struct. 
- int cnt; +```c + int count; pObject obj; char* new_name; pMyDriversQueryInf qy; - /** Build the filename. **/ - cnt = snprintf(obj->Pathname->Pathbuf, 256, "%s/%s", - qy->Parent->Obj->Pathname->Pathbuf,new_name); - if (cnt < 0 || cnt >= 256) return NULL; - obj->Pathname->Elements[obj->Pathname->nElements++] = - strrchr(obj->Pathname->Pathbuf,'/')+1; + /** Build the new filename. **/ + count = snprintf(obj->Pathname->Pathbuf, 256, "%s/%s", qy->Parent->Obj->Pathname->Pathbuf, new_name); + if (count < 0 || 256 <= count) return NULL; + obj->Pathname->Elements[obj->Pathname->nElements++] = strrchr(obj->Pathname->Pathbuf, '/') + 1; +``` + +### Function: QueryCreate() +```c +void* xxxQueryCreate(void* qy_v, pObject new_obj, char* name, int mode, int permission_mask, pObjTrxTree *oxt); +``` + +**No documentation provided.** + -QueryDelete is passed the qy_v void* parameter, and an oxt parameter. It should return 0 on successful deletion, and -1 on failure. +### Function: QueryClose() +```c +int xxxQueryClose(void* qy_v, pObjTrxTree* oxt); +``` +The close function closes a query instance, freeing all allocated data and releasing all shared memory such as open connections, files, or other driver instances. This function operates very similarly to `Close()`, documented in detail above. The query should be closed, whether or not `QueryFetch()` has been called enough times to enumerate all of the query results. -QueryClose is also passed qy_v and oxt. It should close the query, whether or not QueryFetch has been called enough times to enumerate all of the query results. -### F. Managing Object Attributes -All objects will have at least some attributes. Five attributes are mandatory: 'name', 'content_type', 'inner_type', 'outer_type', and 'annotation'. All compliant drivers must implement these five attributes, all of which have a data type of DATA_T_STRING. 
+### Object Attributes +All objects can have attributes, and there are five required attributes that all drivers must implement (explained below). Currently, the OS specification includes support for the following data types: -- DATA_T_INTEGER - 32-bit signed integer. -- DATA_T_STRING - Zero-terminated ASCII string. -- DATA_T_DOUBLE - Double-precision floating point. -- DATA_T_DATETIME - date/time structure. -- DATA_T_MONEY - money data type. +| Name | Description +| ----------------- | ------------ +| `DATA_T_INTEGER` | 32-bit signed integer. +| `DATA_T_STRING` | Null-terminated ASCII string. +| `DATA_T_DOUBLE` | Double-precision floating point number. +| `DATA_T_DATETIME` | Date/time structure. +| `DATA_T_MONEY` | Money structure. + +See `datatypes.h` for more information. + +For `true`/`false` or `on`/`off` attributes, use `DATA_T_INTEGER` where 0 indicates `false` and 1 indicates `true`. + +The following five attributes are required (all are of type `DATA_T_STRING`): + +| Attribute | Description +| ------------ | ------------ +| name | The name of the object, just as it appears in any directory listing. The name of the object must always be unique for its directory. +| annotation | A short description of the object. While users may not assign annotations to all objects, each object should be able to have an annotation. For example, in the Sybase driver, annotations for rows are created by assigning an 'expression' to the table in question, such as `first_name + last_name` for a people table. +| content_type | The type of the object's content, given as a MIME-type. Specify `"system/void"` if the object does not have content. +| inner_type | An alias for 'content_type'. Both should be supported. +| outer_type | This is the type of the object itself (the container). Specify `"system/row"` for objects that can be queried. + +The `last_modification : DATA_T_DATETIME` attribute is a sixth, optional attribute that may be useful in some situations. 
This attribute should indicate the last time that the object's content was modified or updated. + + + + +### Function: GetAttrType() +```c +int xxxGetAttrType(void* inf_v, char* attr_name, pObjTrxTree* oxt); +``` +The `GetAttrType()` function returns the `DATA_T_xxx` value for the datatype of the requested attribute. It takes three parameters: + +| Parameter | Type | Description +| --------- | ------------- | ------------ +| inf_v | void* | A driver instance pointer (returned from `Open()` or `QueryFetch()`). +| attr_name | char* | The name of the attribute to be queried. +| oxt | pObjTrxTree* | The transaction tree pointer for the `OBJDRV_C_TRANS` capability. + +This function should return `DATA_T_UNAVAILABLE` if the requested attribute does not exist on the driver instance. It should return -1 to indicate an error, in which case `mssError()` should be called before returning. + +For example, calling the following on any driver should return `DATA_T_STRING`. +```c +int datatype = driver->GetAttrType(inf_v, "name", oxt); +``` + + +### Function: GetAttrValue() +```c +int xxxGetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val, pObjTrxTree* oxt); +``` +The `GetAttrValue()` function takes five parameters. The `datatype` parameter is the `DATA_T_xxx` type the caller expects for the attribute; the other four are: + +| Parameter | Type | Description +| --------- | ------------- | ------------ +| inf_v | void* | A driver instance pointer (returned from `Open()` or `QueryFetch()`). +| attr_name | char* | The name of the attribute to be queried. +| val | pObjData | A pointer to a location where the value of the attribute should be stored. +| oxt | pObjTrxTree* | The transaction tree pointer for the `OBJDRV_C_TRANS` capability. + +The value pointer should be handled in different ways, depending on the type: +- For `DATA_T_INTEGER` types, it is assumed to point to a 32-bit integer where the value should be written. +- For `DATA_T_STRING` types, it is assumed to point to an empty `char*` location where a pointer to a string should be written.
+- For `DATA_T_DOUBLE` types, it is assumed to point to a double value where the double should be written. +- For `DATA_T_DATETIME` types, it is assumed to point to an empty `pDateTime` where a pointer to a date time struct (see `obj.h`) should be written. + +In this way, integer and double values are returned by value, and string or datetime values are returned by reference. Items returned by reference are guaranteed to be valid until either the object is closed, or another `GetAttrValue()` or `SetAttrValue()` call is made on the same driver (whichever happens first). + +This function should return -1 on a non-existent attribute, 0 on success, and 1 if the value is `NULL` or undefined / unset. + +- 📖 **Note**: The caller of this function can use the POD(x) macro to typecast appropriate pointers to the pObjData pointer, passed to this function. The ObjData structure is a UNION type of structure, allowing easy manipulation of data of various types. See `datatypes.h` for more information. + +- 📖 **Note**: In legacy code, a typecast void* was used instead of the pObjData pointer used today. This method was binary compatible with the current solution because the pObjData is a pointer to a struct union. See `datatypes.h` for more information. + + +### Function: SetAttrValue() +```c +int xxxSetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val, pObjTrxTree* oxt); +``` +The `SetAttrValue()` function is the same as `GetAttrValue()`, however it sets the value by reading it from the `val` parameter instead of getting the value by writing it to the `val` parameter. The return value is also identical, and `mssError()` should be invoked on failure, or if setting attributes programmatically is not implemented. + + +### Function: GetFirstAttr() & GetNextAttr() +```c +char* xxxGetFirstAttr(void* inf_v, pObjTrxTree* oxt); +char* xxxGetNextAttr(void* inf_v, pObjTrxTree* oxt); +``` +These functions return the names of attributes that can be queried on an object.
They both take the same two parameters. + +| Parameter | Type | Description +| --------- | ------------- | ------------ +| inf_v | void* | A driver instance pointer (returned from `Open()` or `QueryFetch()`). +| oxt | pObjTrxTree* | The transaction tree pointer for the `OBJDRV_C_TRANS` capability. + +These functions should only return the names of significant values, so `name`, `annotation`, etc. should not be returned from these functions, even though they are required to be valid values for any object. Typically, this is implemented by `GetFirstAttr()` resetting some internal value in the driver `inf_v`, then returning the result of `GetNextAttr()`. `GetNextAttr()` extracts a string from an array or other list of valid attribute names for the object and increments the internal counter. Once the attributes are exhausted, `GetNextAttr()` returns `NULL` and `GetFirstAttr()` can be used to restart and begin querying elements from the start of the list again. If an object has no significant attributes, `GetFirstAttr()` and `GetNextAttr()` both return NULL. + + +### Function: AddAttr() +```c +int xxxAddAttr(void* inf_v, char* attr_name, int type, pObjData val, pObjTrxTree* oxt); +``` +The `AddAttr()` function adds a new attribute to an existing object. Not all objects support this, and many will refuse the operation. The parameters are the same as those of `GetAttrValue()` and `SetAttrValue()`, documented in detail above. + + +### Function: OpenAttr() +```c +void* xxxOpenAttr(void* inf_v, char* attr_name, int mode, pObjTrxTree* oxt); +``` +The `OpenAttr()` function is used to open an attribute for `objRead()`/`objWrite()` as if it were an object with content. Not all object drivers will support this, and many will refuse the operation. + +This function takes 4 parameters. `inf_v`, `attr_name`, and `oxt` are the same as they are for `GetAttrValue()` and `SetAttrValue()`. `mode` is the same as it is for `Open()`.
This function should return an `inf_v` pointer for the new descriptor (similar to `Open()` and `QueryFetch()` above). + + +### Function: ExecuteMethod() +```c +int xxxExecuteMethod(void* inf_v, char* method_name, pObjData param, pObjTrxTree* oxt); +``` +The `ExecuteMethod()` function is used to execute a method on an object. This feature is rarely used, but some drivers have created methods for actions like dropping their cache or printing debug information. Each method has a unique name within that object, and can take a single string parameter. + +The `ExecuteMethod()` function takes four parameters: + +| Parameter | Type | Description +| ----------- | ------------- | ------------ +| inf_v | void* | A driver instance pointer (returned from `Open()` or `QueryFetch()`). +| method_name | char* | The name of the method to be executed. +| param | pObjData | A pointer to a location where the string value of the param is stored. +| oxt | pObjTrxTree* | The transaction tree pointer for the `OBJDRV_C_TRANS` capability. -True/false or on/off attributes should be treated as DATA_T_INTEGER for the time being with values of 0 and 1. +- 📖 **Note**: The `pObjData` type of the `param` parameter makes it possible that other types of parameters could be supported in the future; however, this is not currently implemented. -Here is a description of the functionality of the five mandatory attributes: +The function returns 0 on success, and -1 to indicate an error, in which case `mssError()` should be called before returning. -| Attribute | Description -| -------------- | ------------ -| 'name' | This attribute indicates the name of the object, just as it should appear in any directory listing. The name of the object must be unique for the directory it is in. -| 'content_type' | This is the type of the object's content, given as a MIME-type. -| 'annotation' | This is an annotation for the object.
While users may not assign annotations to all objects, each object should be able to have an annotation. Normally the annotation is a short description of what the object is. For the Sybase driver, annotations for rows are created by assigning an 'expression' to the table in question, such as 'first_name + last_name' for a people table. -| 'inner_type' | An alias for 'content_type'. Both should be supported. -| 'outer_type' | This is the type of the object itself (the container). -A sixth attribute is not mandatory, but is useful if the object might have content that could in turn be a node object (be interpreted by another driver). This attribute is 'last_modification', of type DATA_T_DATETIME, and should indicate when the object's content was last updated or modified. +### Function: GetFirstMethod() & GetNextMethod() +```c +char* xxxGetFirstMethod(void* inf_v, pObjTrxTree* oxt); +char* xxxGetNextMethod(void* inf_v, pObjTrxTree* oxt); +``` +These functions work the same as `GetFirstAttr()` and `GetNextAttr()` (respectively), except that they return the method names instead of the attribute names. -The first function to be aware of is the GetAttrType function. This routine takes the inf_v pointer, the name of the attribute in question, and the oxt* pointer. It should return the DATA_T_xxx value for the data type of the attribute. -Next is the GetAttrValue function, which takes four parameters: the inf_v pointer, the name of the attribute, a void pointer pointing to where the attribute's value will be put, and the oxt* pointer. The way the value pointer is handled depends on the data type. For DATA_T_INTEGER types, the value pointer is assumed to be pointing to a 32-bit integer where the integer value can be written. For DATA_T_ STRING types, the value pointer is assumed to be pointing to an empty pointer location where a pointer to the string can be stored. 
For DATA_T_DATETIME types, the value pointer is assumed to be pointing to an empty pointer where a pointer to a date time structure (from obj.h) can be stored. And for double values, the value pointer points to a double value where the double will be stored. In this way, integer and double values are returned from GetAttrValue by value, and string or datetime values are returned from GetAttrValue by reference. Items returned by reference must be guaranteed to be valid until the object is closed, or another GetAttrValue or SetAttrValue call is made. This function should return -1 on a non-existent attribute, 0 on success, and 1 if the value is NULL or unset. +### Function: PresentationHints() +```c +pObjPresentationHints xxxPresentationHints(void* inf_v, char* attr_name, pObjTrxTree* oxt); +``` +The `PresentationHints()` function allows the caller to request extra information about a specific attribute on a specific driver instance object. Most of this information is intended to be used for displaying the attribute in a user interface, although it can also be useful for general data validation. As such, many drivers may not implement this function. -UPDATE ON GETATTR/SETATTR: These functions now, instead of taking a void* pointer for the value, take a pObjData pointer, which points to an ObjData structure. The POD(x) macro can be used to typecast appropriate pointers to a pObjData pointer. The ObjData structure is a UNION type of structure, allowing easy manipulation of data of various types. See 'datatypes.h'. Note that this is binary compatible with the old way of using a typecasted void pointer. +The `PresentationHints()` function takes three parameters: -The SetAttrValue function works much the same way as GetAttrValue, just with the information moving in the opposite direction. The third parameter, void* value, is treated in the same manner. 
+| Parameter | Type | Description +| --------- | ------------- | ------------ +| inf_v | void* | A driver instance pointer (returned from `Open()` or `QueryFetch()`). +| attr_name | char* | The name of the requested attribute. +| oxt | pObjTrxTree* | The transaction tree pointer for the `OBJDRV_C_TRANS` capability. + +The function returns a new pObjPresentationHints struct on success, or NULL to indicate an error, in which case `mssError()` should be called before returning. This struct should be allocated using `nmMalloc()`, and memset to zero, like this: +```c +pObjPresentationHints hints = nmMalloc(sizeof(ObjPresentationHints)); +if (hints == NULL) goto error_handling; +memset(hints, 0, sizeof(ObjPresentationHints)); +``` -The GetFirstAttr and GetNextAttr functions each take two parameters, the inf_v pointer and the oxt* pointer, and are used to iterate through the non-mandatory attributes for the object. GetFirstAttr should return a string naming the first attribute, and GetNextAttr should iterate through subsequent attributes. When the attributes are exhausted, these functions should return NULL. The attributes 'name', 'annotation', and 'content_type' should not be returned. If the object has no other attributes, GetFirstAttr should return NULL. +The return value, `hints : ObjPresentationHints`, contains the following useful fields which the function should set to give various useful information about the attribute. +- `hints->Constraint : void*`: An expression for determining if a value is valid. +- `hints->DefaultExpr : void*`: An expression defining the default value. +- `hints->MinValue : void*`: An expression defining the minimum valid value. +- `hints->MaxValue : void*`: An expression defining the maximum valid value. +- `hints->EnumList : XArray`: If the attribute is a string enum, this XArray lists the valid string values. +- `hints->EnumQuery : char*`: A query string which enumerates the valid values of a string enum attribute.
+- `hints->Format : char*`: presentation format - datetime or money +- `hints->AllowChars : char*`: An array of all valid characters for a string attribute, NULL to allow all characters. +- `hints->BadChars : char*`: An array of all invalid characters for a string attribute. +- `hints->Length : int`: The maximum length of data that can be included in a string attribute. +- `hints->VisualLength : int`: The length that the attribute should be displayed if it is shown to the user. +- `hints->VisualLength2 : int`: The number of lines to use in a multi-line edit box for the attribute. +- `hints->BitmaskRO : unsigned int`: which bits, if any, in bitmask are read-only +- `hints->Style : int`: Style flags, documented below. +- `hints->StyleMask : int`: A mask for which style flags were set and which were left unset / undefined. +- `hints->GroupID : int`: Used to assign attributes to groups. Use -1 if the attribute is not in a group. +- `hints->GroupName : char*`: The name of the group to which this attribute belongs, or NULL if it is ungrouped or if the group is named elsewhere. +- `hints->OrderID : int`: Used to specify an attribute order. +- `hints->FriendlyName : char*`: Used to specify a "display name" for an attribute (e.g. `n_rows` might have a friendly name of `"Number of Rows"`). Should be `nmSysMalloc()`ed, often using `nmSysStrdup()`. + +- ⚠️ **Warning**: Behavior is undefined if: + - A character is included in both `hints->AllowChars` and `hints->BadChars`. + - The data is longer than `hints->Length`. + +The `hints->Style` field can be set with several useful flags. To specify that a flag is not set (e.g. to specify explicitly that a field does allow `NULL`s), set the corresponding bit in the `hints->StyleMask` field while leaving the bit in the `hints->Style` field set to 0. + +The following macros are provided for setting style flags: +- `OBJ_PH_STYLE_BITMASK`: The items in `hints->EnumList` or `hints->EnumQuery` are bitmasked.
+- `OBJ_PH_STYLE_LIST`: List-style presentation should be used for the values of an enum attribute. +- `OBJ_PH_STYLE_BUTTONS`: Radio buttons or check boxes should be used for the presentation of enum attribute values. +- `OBJ_PH_STYLE_NOTNULL`: The attribute does not allow `NULL` values. +- `OBJ_PH_STYLE_STRNULL`: An empty string (`""`) should be treated as a `NULL` value. +- `OBJ_PH_STYLE_GROUPED`: The GroupID should be checked so that fields can be grouped together. +- `OBJ_PH_STYLE_READONLY`: The user is not allowed to modify this attribute. +- `OBJ_PH_STYLE_HIDDEN`: This attribute should be hidden and not presented to the user. +- `OBJ_PH_STYLE_PASSWORD`: Values in this attribute should be hidden, such as for passwords. +- `OBJ_PH_STYLE_MULTILINE`: String values should allow multiline editing. +- `OBJ_PH_STYLE_HIGHLIGHT`: This attribute should be highlighted when presented to the user. +- `OBJ_PH_STYLE_LOWERCASE`: This attribute only allows lowercase characters. +- `OBJ_PH_STYLE_UPPERCASE`: This attribute only allows uppercase characters. +- `OBJ_PH_STYLE_TABPAGE`: Prefer the tab-page layout for grouped fields. +- `OBJ_PH_STYLE_SEPWINDOW`: Prefer separate windows for grouped fields. +- `OBJ_PH_STYLE_ALWAYSDEF`: Always reset the default value when this attribute is modified. +- `OBJ_PH_STYLE_CREATEONLY`: This attribute is writeable only when created, after that it is read only. +- `OBJ_PH_STYLE_MULTISEL`: Multiple select +- `OBJ_PH_STYLE_KEY`: This attribute is a primary key. +- `OBJ_PH_STYLE_APPLYCHG`: Presentation hints should be applied on DataChange instead of on DataModify. + + +### Function: Info() +```c +int xxxInfo(void* inf_v, pObjectInfo info); +``` +The `Info()` function allows the caller to request extra information about a specific driver instance object.
It takes two parameters: + +| Parameter | Type | Description +| --------- | ------------- | ------------ +| inf_v | void* | A driver instance pointer (returned from `Open()` or `QueryFetch()`). +| info | pObjectInfo | A driver info struct allocated by the caller which the driver sets with information. + +The `pObjectInfo` struct has two fields: `Flags` and `nSubobjects`. This function should set `info->Flags` to 0 (to ensure no uninitialized noise gets into the data), then bitwise-OR (`|=`) it with all of the following flags that apply to that object. +- `OBJ_INFO_F_CAN_HAVE_SUBOBJ` / `OBJ_INFO_F_CANT_HAVE_SUBOBJ`: Indicates that the object can or cannot have subobjects. +- `OBJ_INFO_F_HAS_SUBOBJ` / `OBJ_INFO_F_NO_SUBOBJ`: Indicates that the object has or does not have subobjects. +- `OBJ_INFO_F_SUBOBJ_CNT_KNOWN`: Indicates that we know the number of subobjects. If set, the count should be stored in `info->nSubobjects`. +- `OBJ_INFO_F_CAN_HAVE_CONTENT` / `OBJ_INFO_F_CANT_HAVE_CONTENT`: Indicates that the object can or cannot have content (see `Read()` / `Write()`). +- `OBJ_INFO_F_HAS_CONTENT` / `OBJ_INFO_F_NO_CONTENT`: Indicates that this object does or does not have content (see `Read()` / `Write()`). +- `OBJ_INFO_F_CAN_SEEK_FULL`: Seeking is fully supported (both forwards and backwards) on the object. +- `OBJ_INFO_F_CAN_SEEK_REWIND`: Seeking is only supported with an offset of `0`. +- `OBJ_INFO_F_CANT_SEEK`: Seeking is not supported at all. +- `OBJ_INFO_F_CAN_ADD_ATTR` / `OBJ_INFO_F_CANT_ADD_ATTR`: Indicates that the object does or does not allow attributes to be added with the [AddAttr()](#function-addattr) function. +- `OBJ_INFO_F_SUPPORTS_INHERITANCE`: Indicates that the object supports inheritance through attributes such as `cx__inherit`. See ??? for more information about object inheritance. + + +- `OBJ_INFO_F_FORCED_LEAF`: Indicates that the object is forced to be a 'leaf' unless ls__type is used.
+- `OBJ_INFO_F_TEMPORARY`: Indicates that this is a temporary object without a valid pathname. + + + +The function returns 0 on success, and -1 to indicate an error, in which case `mssError()` should be called before returning. + + +### Function: Commit() +```c +int xxxCommit(void* inf_v, pObjTrxTree *oxt); +``` +**No documentation provided.** -AddAttr is used to add a new attribute to an existing object. Not all objects support this, and many will refuse the operation. The parameters are as follows: void* inf_v, char* attrname, int type, void* value, and pObjTrxTree* oxt. -OpenAttr is used to open an attribute for objRead/objWrite as if it were an object with content. Not all object drivers will support this; this routine should return an inf_v pointer for the new descriptor, and takes four parameters: void* inf_v, char* attrname, int mode, and pObjTrxTree* oxt. The mode is used in the same manner as the Open function. +### Function: GetQueryCoverageMask() +```c +int xxxGetQueryCoverageMask(pObjQuery this); +``` +**No documentation provided.** + + +### Function: GetQueryIdentityPath() +```c +int xxxGetQueryIdentityPath(pObjQuery this, char* pathbuf, int maxlen); +``` +**No documentation provided.** -### G. Managing Object Methods -Objects may optionally have methods associated with them. Each method is given a unique name within the object, and can take a single string parameter. Three functions exist for managing methods. -The first two functions, GetFirstMethod and GetNextMethod, work identically to their counterparts dealing with attributes. The third function, ExecuteMethod, starts a method executing. This function takes four parameters: the inf_v pointer, the name of the method, the optional string parameter, and the oxt* pointer. ## III Reading the Node Object -The Node object has content which controls what resource(s) this driver will actually access, so it is important for the driver to access the node object's content.
If the driver's node objects are structure files (which is normally the case when dealing with a remote network resource), then the SN module can make opening the node object much more painless. It also performs caching automatically to improve performance. +A driver will commonly configure itself by reading text content from its node object file, at the root of its object subtree. This content may define what resource(s) a driver should provide, how it should access or compute them, and other similar information. Most drivers use the structure file format for their node objects because SN module makes parsing, reading, and writing these files easier. It also performs caching automatically to improve performance. -Note that the Node object will technically ALREADY BE OPEN as an object in the objectsystem. The OSML does that for you. If your driver will not use the SN/ST modules, then it should read the node object via the normal objRead() function, and write it via objWrite(). Your driver should NEVER objClose() the node object! The OSML does that for you. +- 📖 **Note**: The node object will **already be open** as an object in the ObjectSystem: The OSML does this for each driver. If a driver does not use the SN/ST modules, then it should read and write the node object directly with `objRead()` and `objWrite()`. A driver should **NEVER** `objClose()` the node object! The OSML handles that. -An objectsystem driver will commonly configure itself by reading a text file at the root of its object subtree. There are two main modules available for making this easier. +Although using the structure file format may be complex, it allows significant flexibility. Data is structured in hierarchies where each sub-object can have named attributes as well as sub-objects. Centrallix is filled with examples of this, including any `.qy`, `.app`, `.cmp`, or `.cluster` file. -The normal way to manage object parameters is to use a structure file. 
Structure files are a little more complicated, but allow for arrays of values for a given attribute name, as well as allowing for tree- structured hierarchies of attributes and values. Structure files are accessed via the stparse and st_node modules. The stparse module provides access to the individual attributes and groups of attributes, and the st_node module loads and saves the structure file heirarchies as a whole. The st_node module also provides node caching to reduce disk activity and eliminate repeated parsing of one file. +Structure files are accessed via the st_node (SN) and stparse (SP) modules. The st_node module loads and saves the structure file heirarchies as a whole. It also manages caching to reduce disk activity and eliminate repeated parsing of the same file. The stparse module provides access to the individual attributes and groups of attributes within a node structure file. -For example, if two sessions open two files, '/test1.rpt' and '/test2.rpt' the st_node (SN) module will cache the internal representations of these node object files, and for successive uses of these node objects, the physical file will not be re-parsed. The file will be re-parsed if its timestamp changes. +For example, if two sessions open two files, `/test1.rpt` and `/test2.rpt` the st_node module will cache the internal representations of these node object files, and for successive uses of these node objects, the physical file will not be re-parsed. The file will be re-parsed if its timestamp changes. -If the underlying object does not support the attribute "last_modification" (assumed to be the timestamp), then SN prints a warning. In essence, this warning indicates that changes to the underlying object will not trigger the SN module to re-read the structure file defining the node object. Otherwise, the SN module keeps track of the timestamp, and if it changes, the node object is re-read and re-parsed. 
+If the underlying object does not support the attribute "last_modification" (assumed to be the timestamp), then st_node prints a warning. In essence, this warning indicates that changes to the underlying object will not trigger the st_node module to re-read the structure file defining the node object. Otherwise, the st_node module keeps track of the timestamp, and if it changes, the node object is re-read and re-parsed. -The driver's first course of action to obtain node object data is to open the node object with the SN module. The SN module's functions are listed below: +### Module: st_node +To obtain node object data, the driver should first open the node object with the st_node module. To use this module, include the file `st_node.h`, which provides the following functions (read `st_node.c` for more functions and additional information): -### pSnNode snReadNode(pObject obj) -This function reads a Structure File from the already-open node object which is passed in the "obj" parameter in the xxxOpen() routine. The "obj" parameter has an element, obj->Prev, which is a link to the node object as opened by the previous driver in the OSML's chain of drivers for handling this open(). All you need to know to get the parsed node object is the following: - pSnNode node; +### st_node: snReadNode() +```c +pSnNode snReadNode(pObject obj); +``` +The `snReadNode()` function reads a Structure File from the `obj` parameter, which should be a previously opened object. In a driver's `Open()` function, this is `obj->Prev` (the node object as opened by the previous driver in the OSML's chain of drivers). - node = snReadNode(obj->Prev); +**Usage:** +```c +pSnNode node = snReadNode(obj->Prev); +if (node == NULL) goto error_handling; +``` -The returned node structure is managed by the SN module and need not be nmFree()ed.
The only thing that must be done is that the driver should increment the node structure's link count like this: +The returned node structure is managed by the SN module and does not need to be `nmFree()`ed. Instead, the driver should increment the node structure's link count for as long as it intends to use this structure, using `node->OpenCnt++;`. When the structure is no longer needed (e.g. when the driver instance is closed), the driver should decrement the link count. - node->OpenCnt++; -When closing an object (and thus releasing a reference to the Node structure), the driver should decrement the link count. +### st_node: snNewNode() +```c +pSnNode snNewNode(pObject obj, char* content_type); +``` +The `snNewNode()` function creates a new node object of the given content type. The open link count should be incremented and decremented when appropriate, as with `snReadNode()`. -### pSnNode snNewNode(pObject obj, char* content_type) -This function creates a new node object with a given content type. The open link count should be incremented as appropriate, as before with snReadNode(). +**Usage:** +```c +pSnNode node = snNewNode(obj->Prev, "system/structure"); +if (node == NULL) goto error_handling; +``` - pSnNode node; +In this case, the new structure file will have the type: `"system/structure"`. - node = snNewNode(obj->Prev, "system/structure"); +- 📖 **Note**: This function only creates node object content, so the underlying object file must already exist. The OSML should do this for you because the previous driver (`obj->Prev`) creates the underlying object. -The "system/structure" argument is the type that will be assigned to the newly created node object. Note that the underlying object must already exist in order for this to create a node object as that object's content. Normally the OSML does this for you by commanding the previous driver (handling obj->Prev) to create the underlying object in question. 
-### int snWriteNode(pSnNode node) -This function writes a node's internal representation back out to the node file. The node's status (node->Status) should be set to SN_NS_DIRTY in order for the write to actually occur. Otherwise, snWriteNode() does nothing. +### st_node: snWriteNode() +```c +int snWriteNode(pSnNode node); +``` +The `snWriteNode()` function writes a node's internal data back out to the node file, if the node's status (`node->Status`) is set to `SN_NS_DIRTY`. Otherwise, `snWriteNode()` does nothing. -### int snDeleteNode(pSnNode node) -This function deletes a node file. At this point, does not actually delete the file but instead just removes the node's data structures from the internal node cache. -### int snGetSerial(pSnNode node) -This function returns the serial number of the node. Each time the node is re-read because of modifications to the file or is written via snWriteNode because of modifications to the internal structure, the serial number is increased. This is a good way for a driver to refresh internal information that it caches should it determine a node object has changed. +### st_node: snDelete() +```c +int snDelete(pSnNode node); +``` +The `snDelete()` function deletes a node by removing the node's data from the internal node cache. -The stparse module is used to examine the parsed contents of the node file. A node file using the stparse module (and thus st_node module) has a structure file format; see StructureFile.txt. The file format is a tree structure with objects, subobjects, and attributes. The internal parsed representation is a tree, with each tree node being an object in the structure file, and each node having attributes, each of which is also a tree node. 
Thus, there are three different node types in the tree representation: the top-level ST_T_STRUCT element, which can contain subgroups and attributes; a mid-level ST_T_SUBGROUP tree node, which has a content type, name, and can contain attributes and other subgroups, and lastly a ST_T_ATTRIB node which contains an attribute name and attribute values, either integer or string, and optional lists of such up to 64 items in length. To use this module, include the file stparse.h. +- 📖 **Note**: This does not actually delete the node file. -The following functions are used to manage a parsed structure file: -### pStructInf stParseMsg(pFile inp_fd, int flags) -This function is internal-use-only and is used by the st_node module to parse a structure file. +### st_node: snGetSerial() +```c +int snGetSerial(pSnNode node); +``` +The `snGetSerial()` function returns the serial number of the node. -### pStructInf stParseMsgGeneric(void* src, int (*read_fn)(), int flags) -This function is also internal-use-only (unless you want to parse the file manually without st_node's help) and is used to parse the structure file when the structure file isn't being read from an MTASK pFile descriptor. This is always the case, as the structure file data is being read from a pObject pointer. In such a case, src is the pObject pointer and read_fn is objRead(). +Each time the node is re-read because of modifications to the node file or is written because `snWriteNode()` was called after modifications to the internal structure, the serial number is increased. This is a good way for a driver to determine if the node file has changed so it can refresh internal cached data. -### int stGenerateMsg(pFile out_fd, pStructInf info, int flags) -This function, also internal-use only, is used by the st_node module to write a structure file whose internal representation is given in the 'info' parameter.
-### int stGenerateMsgGeneric(void* dst, int (*write_fn)(), pStructInf info, int flags) -This function is stParseMsgGeneric's converse. +### st_node: snGetLastModification() +```c +pDateTime snGetLastModification(pSnNode node); +``` +The `snGetLastModification()` function returns the date and time that a file was last modified. This pointer will remain valid as long as the passed `pSnNode` struct remains valid. It is managed by the `st_node` module, so the caller should not free the returned pointer. This function promises not to fail and return `NULL`. -### pStructInf stCreateStruct(char* name, char* type) -This function creates a new top-level tree item of type ST_T_STRUCT, with a given name and content-type. -### pStructInf stAddAttr(pStructInf inf, char* name) -This function adds a node of type ST_T_ATTRIB to either a ST_T_STRUCT or ST_T_SUBGROUP type of node, with a given name and no values associated with that name (see AddValue, below). The new attribute tree node is linked under the 'inf' node passed, and is returned. +### Module: stparse +The stparse module is used to examine the parsed contents of the node file using the structure file format; see [StructureFile.txt](../centrallix-doc/StructureFile.txt). This format is a tree structure with node objects that can each have sub-objects and named attributes. Thus, stparse uses three distinct node types: +- `ST_T_STRUCT`: The top-level node, containing the subtrees and attributes in the file. +- `ST_T_SUBGROUP`: A mid-level type for subobjects within the top-level node. Each subgroup has a content type, name, and may contain attributes and other subgroups. +- `ST_T_ATTRIB`: A bottom-level type for each named attribute. Each attribute has a name and values, either of type integer or string, and optional lists of such up to 64 items in length. 
-### pStructInf stAddGroup(pStructInf inf, char* name, char* type) -This function adds a node of type ST_T_SUBGROUP to either a ST_T_SUBGROUP or ST_T_STRUCT tree node, with a given name and content type (content type such as 'report/query'). +To use this module, include the file `stparse.h`, which includes the following functions (read `stparse.c` for more functions and additional information): -### int stAddValue(pStructInf inf, char* strval, int intval) -This function adds a value to an attribute, and can be called multiple times on an attribute to add a list of values. If 'strval' is not null, a string value is added, otherwise an integer value is added. The string is NOT copied, but is simply pointed-to. If the string is non-static, and has a lifetime less than the ST_T_ATTRIB tree node, then the following procedure must be used: - char* ptr; - char* nptr; - pStructInf attr_inf; +### stparse: stStructType() +```c +int stStructType(pStructInf this); +``` +The `stStructType()` function returns the struct type of the passed `pStructInf` parameter, which is either `ST_T_ATTRIB` or `ST_T_SUBGROUP` (see above). - attr_inf = stAddAttr(my_parent_inf, "myattr"); - nptr = (char*)malloc(strlen(ptr)+1); - if (!nptr) go_report_the_error_and_return; - strcpy(nptr, ptr); - stAddValue(attr_inf, nptr, 0); - attr_inf->StrAlloc[0] = 1; +- ⚠️ **Warning**: The root node of type `ST_T_STRUCT` will return `ST_T_SUBGROUP` from this function. If you wish to avoid this, read `inf->Type` (see [stparse: Using Fields Directly](#stparse-using-fields-directly) for more info). It is unclear whether this behavior is a bug or a feature. I've decided to call it a feature! ;) -By following this method (making a copy of the string and then setting the StrAlloc value for that string), when the StructInf tree node is freed by the stparse module, the string will auto- matically be freed as well.
-### pStructInf stLookup(pStructInf inf, char* name) -This routine examines all sub-tree-nodes, both group and attribute nodes, for a group or attribute with the given name. If it finds one, it returns a pointer to the sub-node, otherwise NULL. +### stparse: stLookup() +```c +pStructInf stLookup(pStructInf inf, char* name); +``` +The `stLookup()` function searches all sub-tree nodes for a group or attribute of the given name and returns a pointer to it or returns `NULL` if no group or attribute was found. -### int stAttrValue(pStructInf inf, int* intval, char** strval, int nval) -This function returns the value of the given attribute in an ST_T_ATTRIB tree node. If a string value is being returned, pass a pointer to the string pointer. If an integer value is being returned, pass a pointer to an integer. The pointer not being used must be left NULL. 'nval' can normally be 0, but if the attribute has several values, setting nval to 1,2,3, etc., returns the 2nd, 3rd, 4th item, respectively. This routing returns -1 if the attribute value did not exist or if the wrong type was requested. It also returns -1 if 'inf' was NULL. -It is common practice to use the stLookup and stAttrValue functions together to retrieve values, and search for an attribute StructInf and retrieve its value in one operation: +### stparse: stAttrValue() +```c +int stAttrValue(pStructInf inf, int* intval, char** strval, int nval); +``` +This function gets the value of the given attribute in an `ST_T_ATTRIB` node. If the value is an integer, the caller should pass a pointer to an integer where it can be stored. If the value is a string, the caller should pass a pointer to a string pointer (i.e. a `char**`) where the `char*` for the string can be stored. The unused alternate pointer must be left `NULL`. `nval` can normally be 0, but if the attribute has several values, setting nval to 1, 2, 3, etc., returns the 2nd, 3rd, 4th item, respectively.
- pStructInf inf; - char* ptr; +This function returns -1 if the attribute value did not exist, if the wrong type was requested, or if 'inf' was `NULL`. - if (stAttrValue(stLookup(inf, "myattr"),NULL,&ptr,0) == 0) - { - printf("%s is the value\n", ptr); - } +It is common practice to use `stLookup()` and `stAttrValue()` or `stGetExpression()` (see below) together to retrieve values, for example (where `inf` is a `pStructInf` variable from somewhere): -### int stFreeInf(pStructInf this) -This function is used to free a StructInf tree node. It will free any sub-nodes first, so if that is not desired, be sure to disconnect them by removing them from the SubInf array and appropriately adjusting the nSubInf counter, and setting the SubInf array position to NULL. This function also disconnects the tree node from its parent, if any, so if the parent is already free()'d, be sure to set the node's Parent pointer to NULL. Any strings marked allocated with the StrAlloc flags will be free()'d. +```c +char* ptr; +if (stAttrValue(stLookup(inf, "my_attr"), NULL, &ptr, 0) != 0) + goto error_handling; +printf("The value is: %s\n", ptr); +``` -It is also common practice to bypass the stXxx() functions entirely and access the elements of the StructInf structures themselves. This is not forbidden, and may be done. See the file stparse.h for a description of the structure. For example, - pStructInf inf; - int i; +### stparse: stGetExpression() +```c +pExpression stGetExpression(pStructInf this, int nval); +``` +Returns a pointer to an expression that represents the value of the nval-th element of the given struct. + + +### stparse: stCreateStruct() +```c +pStructInf stCreateStruct(char* name, char* type); +``` +This function creates a new top-level tree item of type `ST_T_STRUCT`, with a given name and content-type.
+ + +### stparse: stAddAttr() +```c +pStructInf stAddAttr(pStructInf inf, char* name); +``` +This function adds a node of type `ST_T_ATTRIB` to either an `ST_T_STRUCT` or an `ST_T_SUBGROUP` type of node, with a given name and no values (see AddValue, below). The new attribute tree node is linked under the `inf` node passed, and is returned. + + +### stparse: stAddGroup() +```c +pStructInf stAddGroup(pStructInf inf, char* name, char* type); +``` +This function adds a node of type `ST_T_SUBGROUP` to either an `ST_T_SUBGROUP` or an `ST_T_STRUCT` tree node, with a given name and content type (content type such as `"report/query"`). + + +### stparse: stAddValue() +```c +int stAddValue(pStructInf inf, char* strval, int intval); +``` +This function adds a value to an attribute, and can be called multiple times on an attribute to add a list of values. If `strval` is not null, a string value is added, otherwise an integer value is added. The string is NOT copied, but is simply pointed-to. If the string is non-static, and has a lifetime less than the `ST_T_ATTRIB` tree node, then the following procedure should be used, where `str` is the string pointer to the string: + +```c +pStructInf attr_inf = stAddAttr(my_parent_inf, "my_attr"); +if (attr_inf == NULL) goto error_handling; + +char* new_str = (char*)malloc(strlen(str) + 1lu); +if (new_str == NULL) goto error_handling; +strcpy(new_str, str); +stAddValue(attr_inf, new_str, 0); +attr_inf->StrAlloc[0] = 1; +``` + +With this method (making a copy of the string and then setting the StrAlloc value for that string), the string is automatically freed when the StructInf tree node is freed by the stparse module. + + +### stparse: stFreeInf() +```c +int stFreeInf(pStructInf this); +``` +This function is used to free a `StructInf` tree node. This also recursively frees sub-tree nodes, so these should be disconnected before calling if they are still needed. 
To do this, remove them from the SubInf array by appropriately adjusting the nSubInf counter and setting the SubInf array position to `NULL`. This function also disconnects the tree node from its parent, if any, so if the parent is already `free()`'d, prevent this behavior by setting the node's Parent pointer to `NULL` before calling this function. Any strings marked allocated with the StrAlloc flags will also be `free()`'d by this function, so update that flag if necessary. + + +### stparse: Using Fields Directly +It is also common practice to bypass the stparse functions entirely and access the elements of the `StructInf` struct directly, which is allowed. (See `stparse.h` for more information about this structure.) + +For example (assuming `inf` is a `pStructInf` variable in scope): +```c +for (unsigned int i = 0u; i < inf->nSubInf; i++) + { + switch (inf->SubInf[i]->Type) + { + case ST_T_ATTRIB: + /** Do stuff with attribute... **/ + break; + + case ST_T_SUBGROUP: + /** Do stuff with group... **/ + break; + + ... + } + } +``` + - for(i=0;i<inf->nSubInf;i++) - { - if (inf->SubInf[i]->Type == ST_T_ATTRIB) - { - /** do stuff with attribute... **/ - } - } ## IV Memory Management in Centrallix -Centrallix has its own memory manager that caches freshly-deallocated blocks of memory in lists according to size so that they can be quickly reallocated. This memory manager also catches double-freeing of blocks, making debugging of memory problems a little easier. + +Centrallix has its own memory management wrapper that caches deallocated blocks of memory by size to allow for faster reuse. This wrapper also detects double-freeing of blocks (sometimes), making debugging of memory problems just a little bit easier. + +In addition, the memory manager provides statistics on the hit ratio of allocated blocks coming from the lists vs. `malloc()`, and on how many blocks of each size/type are `malloc()`ed and cached. This information can be helpful for tracking down memory leaks.
Empirical testing has shown an increase of performance of around 50% or more in programs with the newmalloc module in use. + +One caveat is that this memory manager does not provide an `nmRealloc()` function, only `nmMalloc()` and `nmFree()`. Thus, either `malloc()`, `free()`, and `realloc()` or `nmSysMalloc()`, `nmSysFree()`, and `nmSysRealloc()` should be used for blocks of memory that might vary in size. + +- 📖 **Note**: This memory manager is usually the wrong choice for blocks of memory of arbitrary sizes. It is intended for allocating structures quickly that are of a specific size. For example, use it to allocate space for a struct that is always the same size. -In addition the memory manager provides statistics on the hit ratio of allocated blocks coming from the lists vs. malloc(), and information on how many blocks of each size/type are allocated out and cached. This information can be invaluable in tracking down memory leaks. +- 🥱 **tl;dr**: Use `nmMalloc()` for structs, not for strings. -One caveat is that this memory manager does not provide a realloc() function, so the standard malloc(), free(), and realloc() must be used for blocks of memory that might grow in size. This memory manager is also perhaps not the best to use for blocks of memory of arbitrary sizes, but rather is best for allocating structures quickly that are of a specific size and belong to specific objects, such as the StructInf structure or the SnNode structure, and others. In short, use it for structures, but not for strings. +- ⚠️ **Warning**: Calling `free()` on a block obtained from `nmMalloc()` or calling `nmFree()` on a block obtained from `malloc()` might not crash the program immediately. Instead, it will result in either inefficient use of the memory manager, or a significant memory leak, respectively. These practices will also lead to incorrect results from the statistics and block count mechanisms.
-Empirical testing has shown an increase of performance of around 50% or more in programs with the newmalloc module in use. The following are the functions for the newmalloc module: -### void* nmMalloc(int size) -This function allocates a block of the given 'size'. It returns NULL if the memory could not be allocated. +### nmMalloc() +```c +void* nmMalloc(int size); +``` +This function allocates a block of the given `size`. It returns `NULL` if the memory could not be allocated. + + +### nmFree() +```c +void nmFree(void* ptr, int size); +``` +This function frees the block of memory. + +- ⚠️ **Warning**: The caller **must know the size of the block.** Getting this wrong is very bad!! For structures, this is trivial, simply use `sizeof()`, exactly the same as with `nmMalloc()`. + + +### nmStats() +```c +void nmStats(void); +``` +Prints statistics about the memory manager, for debugging and optimizing. + +For example: +``` +NewMalloc subsystem statistics: + nmMalloc: 0 calls, 0 hits (-nan%) + nmFree: 0 calls + bigblks: 0 too big, 0 largest size +``` + + + + +### nmRegister() +```c +void nmRegister(int size, char* name); +``` +Registers an intelligent name with a block size. This allows the memory manager to be intelligent when reporting block allocation counts. A given size can have more than one name. This function is optional and not required for any production code to work, but using it can make tracking down memory leaks easier. + +This function is usually called in a module's `Initialize()` function on each of the structures the module uses internally. + + +### nmDebug() +```c +void nmDebug(void); +``` +Prints a listing of block allocation counts, giving (by size): +- The number of blocks allocated but not yet freed. +- The number of blocks in the cache. +- The total allocations for this block size. +- A list of names (from `nmRegister()`) for that block size.
+ + +### nmDeltas() +```c +void nmDeltas(void); +``` +Prints a listing of all blocks whose allocation count has changed, and by how much, since the last `nmDeltas()` call. This function is VERY USEFUL FOR MEMORY LEAK DETECTIVE WORK. -### void nmFree(void* ptr, int size) -This function frees the block of memory. NOTE THAT THE CALLING FUNCTION MUST KNOW THE SIZE OF THE BLOCK. Getting this wrong is very bad. For structures, this is trivial, just use sizeof() just like with nmMalloc(). -### void nmStats() -Prints out statistics on how well the memory manager is doing. +### nmSysMalloc() +```c +void* nmSysMalloc(int size); +``` +Allocates memory without using the block-caching algorithm. This is roughly equivalent to `malloc()`, but pointers returned by malloc and this function are not compatible with each other - i.e., you cannot `free()` something that was `nmSysMalloc()`'ed, nor can you `nmSysFree()` something that was `malloc()`'ed. -### void nmRegister(int size, char* name) -Registers a name with a block size. This allows the memory manager to be intelligent when reporting block allocation counts. The first argument is the size of the block, the second, an intelligent name for that size of block. A size can have more than one name. This function is optional and need not be used except when tracking down memory leaks, but can be used freely. +- 📖 **Note**: This function is much better to use on variable-sized blocks of memory. `nmMalloc()` is better for fixed-size blocks, such as for data structures. -Typically this function is called in a module's Initialize() function on each of the structures the module uses internally. -### void nmDebug() -Prints out a listing of block allocation counts, giving (by size): 1) number of blocks allocated but not yet freed, 2) number of blocks in the cache, 3) total allocations for this block size, and a list of names (from nmRegister()) for that block size. 
+### nmSysRealloc() +```c +void* nmSysRealloc(void* ptr, int newsize); +``` +Changes the size of an allocated block of memory that was obtained from `nmSysMalloc()`, `nmSysRealloc()`, or `nmSysStrdup()`. The new pointer may be different if the block has to be moved. This is the rough equivalent of `realloc()`. -### void nmDeltas() -Prints a listing of all blocks whose allocation count has changed, and by how much, since the last nmDeltas() call. This function is VERY USEFUL FOR MEMORY LEAK DETECTIVE WORK. +- 📖 **Note**: If you are `realloc()`'ing a block of memory and need to store pointers to data somewhere inside the block, it is often better to store an offset rather than a full pointer. This is because a full pointer becomes invalid if a `nmSysRealloc()` causes the block to move. -### void* nmSysMalloc(int size) -Allocates memory without using the block-caching algorithm. This is roughly equivalent to malloc(), but pointers returned by malloc and this function are not compatible with each other - i.e., you cannot free() something that was nmSysMalloc'ed, nor can you nmSysFree() something that was malloc'ed. -This function is much better to use on variable-sized blocks of memory. nmMalloc is better for fixed-size blocks, such as for data structures. +### nmSysStrdup() +```c +char* nmSysStrdup(const char* str); +``` +Allocates memory using `nmSysMalloc()` function and copies the string `str` into this memory. It is a rough equivalent of `strdup()`. The resulting pointer can be free'd using `nmSysFree()`. -### void nmSysFree(void* ptr) -Frees a block of memory allocated by nmSysMalloc, nmSysStrdup, or nmSysRealloc. -### void* nmSysRealloc(void* ptr, int newsize) -Changes the size of an allocated block of memory that was obtained via nmSysMalloc or nmSysRealloc or nmSysStrdup. The new pointer may be different if the block had to be moved. This is the rough equivalent of realloc(). 
Usage Note: If you are realloc'ing a block of memory, and need to store pointers to data somewhere inside the block, it is often better to store the offset rather than a full pointer, as a pointer would become invalid if a nmSysRealloc caused the block to move. +### nmSysFree() +```c +void nmSysFree(void* ptr); +``` +Frees a block of memory allocated by `nmSysMalloc()`, `nmSysRealloc()`, or `nmSysStrdup()`. -### char* nmSysStrdup(const char* str) -Allocates memory for a copy of the string str by using the nmSysMalloc function, and then makes a copy of the string str. It is a rough equivalent of strdup(). The resulting pointer can be free'd using nmSysFree(). -Calling free() on a block obtained from nmMalloc() or calling nmFree() on a block obtained from malloc() will not crash the program. Instead, it will result in either inefficient use of the memory manager, or a huge memory leak, respectively. These practices will also render the statistics and block count mechanisms useless. ## V Other Utility Modules -There are many other utility modules useful in Centrallix. These include the xarray module, used for managing growable arrays; the xhash module, used for managing hash tables with no overflow problems and variable-length keys, the xstring module used for managing growable strings; the expression module used for compiling and evaluating expressions; and the mtsession module, used for managing session-level variables and reporting errors. + + +The Centrallix library (`centrallix-lib`) has a host of useful utility modules. These include `xarray`, used for managing growable arrays; `xstring`, used for managing growable strings; `xhash`, used for managing hash tables with no overflow problems and variable-length keys; `expression`, used for compiling and evaluating expressions; and `mtsession`, used for managing session-level variables and reporting errors. + ### A. XArray (XA) - Arrays The first is the xarray (XA) module.
@@ -625,6 +1215,7 @@ This adds an item to the xarray, and keeps the array sorted. The value for sort #### xaFindItem(pXArray this, void* item) This returns the offset into the array's items of the given value. An exact match is required. The array's items are given below: +```c XArray xa; pStructInf inf; int item_id; @@ -639,6 +1230,7 @@ This returns the offset into the array's items of the given value. An exact matc item_id = xaFindItem(&xa, inf); inf == xa.Items[item_id]; +``` #### xaRemoveItem(pXArray this, int index) This function removes an item from the xarray at the given index. @@ -682,22 +1274,27 @@ Copies the string 'text' into the XString. Like xsConcatenate, except that the #### char* xsStringEnd(pXString this) Returns a pointer to the end of the string. Useful for finding the end of the string without performing: +```c pXString xs; xs->String + strlen(xs->String) +``` since the xs module already knows the string length and does not have to search for the null terminator. Furthermore, since the string can contain nulls, the above statement could produce incorrect results in those situations. The contents of the XString can be easily referenced via: +```c pXString xs; printf("This string is %s\n", xs->String); +``` IMPORTANT NOTE: Do not store pointers to values within the string while you are still adding text to the end of the string. If the string ends up realloc()ing, your pointers will be incorrect. Instead, if data in the middle of the string needs to be pointed to, store offsets from the beginning of the string, not pointers to the string. 
For example, this is WRONG: +```c pXString xs; char* ptr; @@ -706,9 +1303,11 @@ For example, this is WRONG: ptr = xsStringEnd(&xs); xsConcatenate(&xs, "This is the second sentence.", -1); printf("A pointer to the second sentence is '%s'\n", ptr); +``` Instead, use pointer aritmetic and do this: +```c pXString xs; int offset; @@ -717,6 +1316,7 @@ Instead, use pointer aritmetic and do this: offset = xsStringEnd(&xs) - xs->String; xsConcatenate(&xs, "This is the second sentence.", -1); printf("A pointer to the second sentence is '%s'\n",xs->String+offset); +``` ### D. Expression (EXP) - Expression Trees @@ -726,7 +1326,9 @@ Expressions can be stand-alone expression trees, or they can take parameter obje Expression evaluation results in the top-level expression tree node having the final value of the expression, which may be NULL, and may be an integer, string, datetime, money, or double data type. For example, the final value of +``` :myobject:oneattribute == 'yes' +``` would be integer 1 (true) if the attribute's value is indeed 'yes'. @@ -777,8 +1379,10 @@ Frees a parameter object list. #### int expAddParamToList(pParamObjects this, char* name, pObject obj, int flags) Adds a parameter to the parameter object list. The 'obj' pointer may be left NULL during the expCompileExpression state of operation but must be set to a value before expEvalTree is called. Otherwise the attributes that reference that parameter object will result in NULL values in the expression (it's technically not an error). Flags can be EXPR_O_CURRENT if the object is to be marked as the current one, or EXPR_O_PARENT if it is to be marked as the parent object. Current and Parent objects can be referenced in an expression like this: +``` :currentobjattr ::parentobjattr +``` and is thus a shortcut to typing the full object name. @@ -851,15 +1455,21 @@ drivers. Most of them are named obj_internal_XxxYyy or similar. 
#### char* obj_internal_PathPart(pPathname path, int start, int length) The Pathname structure breaks down a pathname into path elements, which are text strings separated by the directory separator '/'. This function takes the given Pathname structure, and returns the number of path elements requested. For instance, if you have a path: +``` /apps/kardia/data/Kardia_DB/p_partner/rows/1 +``` that path would be stored internally in Centrallix as: +``` ./apps/kardia/data/Kardia_DB/p_partner/rows/1 +``` To just return "Kardia_DB/p_partner", you could call: +``` obj_internal_PathPart(pathstruct, 4, 2); +``` Note that return values from obj_internal_PathPart are only valid until the next call to PathPart on the given pathname structure. @@ -886,9 +1496,9 @@ This function closes a network connection, and optionally waits up to 'linger_ms ### int fdWrite(pFile filedesc, char* buffer, int length, int offset, int flags) This function writes data to a file descriptor, from a given buffer and length, and to an optional seek offset and with some optional flags. Flags can be the following: -- FD_U_NOBLOCK - If the write can't be performed immediately, don't perform it at all. -- FD_U_SEEK - The 'offset' value is valid. Seek to it before writing. Not valid for network connections. -- FD_U_PACKET - ALL of the data of 'length' in 'buffer' must be written. Normal write() semantics in UNIX state that not all data has to be written, and the number of bytes actually written is returned. Setting this flag makes sure all data is really written before returning. +- `FD_U_NOBLOCK` - If the write can't be performed immediately, don't perform it at all. +- `FD_U_SEEK` - The 'offset' value is valid. Seek to it before writing. Not valid for network connections. +- `FD_U_PACKET` - ALL of the data of 'length' in 'buffer' must be written. Normal write() semantics in UNIX state that not all data has to be written, and the number of bytes actually written is returned. 
Setting this flag makes sure all data is really written before returning. #### int fdRead(pFile filedesc, char* buffer, int maxlen, int offset, int flags) The complement to the above routine. Takes the same flags as the above routine, except FD_U_PACKET means that all of 'maxlen' bytes must be read before returning. This is good for reading a packet that is known to be exactly 'maxlen' bytes long, but which might be broken up into fragments by the network (TCP/IP has a maximum frame transmission size of about 1450 bytes). diff --git a/centrallix-sysdoc/string_comparison.md b/centrallix-sysdoc/string_comparison.md deleted file mode 100644 index 222e3e6d..00000000 --- a/centrallix-sysdoc/string_comparison.md +++ /dev/null @@ -1,99 +0,0 @@ -# String Comparison -The following sections discuss the two approaches to calculating similarity between two strings. Both approaches use a SQL function to calculate a similarity metric (on a scale of 0 to 1) for two string parameters. - -## Table of Contents -- [String Comparison](#string-comparison) - - [Table of Contents](#table-of-contents) - - [Levenshtein Similarity](#levenshtein-similarity) - - [Levenshtein](#levenshtein) - - [Cosine Similarity](#cosine-similarity) - - [CHAR_SET](#char_set) - - [Frequency Table](#frequency-table) - - [Relative Frequency Table](#relative-frequency-table) - - [TF-IDF](#tf-idf) - - [Dot Product](#dot-product) - - [Magnitude](#magnitude) - - [Similarity](#similarity) - - [Future Implementation](#future-implementation) - - [Inverse Document Frequency (IDF)](#inverse-document-frequency-idf) - -## Levenshtein Similarity -The [Levenshtein](https://en.wikipedia.org/wiki/Levenshtein_distance) distance is defined as the number of insertions, deletions, or substitutions required to make one string exactly like another string. 
- -### Levenshtein -```c -int exp_fn_levenshtein(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) -``` -Returns the levenshtein edit distance between two strings. - -```c -int exp_fn_fuzzy_compare(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) -``` -Returns a value between 0.0 (complete match) and 1.0 (complete difference) between strings a and b, based on the (levenshtein distance) / (max len of input strings). -Some alterations to the calculation are as follows: -- matching an empty string against anything returns 0.5. -- a string that only required insertions to become the other string has its (lev_dist)/(strlen) value halved before returning -The parameter max_field_width is required, but not used. - -## Cosine Similarity - -The [Cosine Similarity](https://en.wikipedia.org/wiki/Cosine_similarity) function is defined as the dot product of two vectors divided by the product of the magnitude of the two vectors. We use the relative frequency of the individual characters within each term as the vectors in the calculation. The following functions are used to calculate cosine similarity. - -### CHAR_SET -```c -const char *CHAR_SET ... -``` -`CHAR_SET` represents all of the characters that should be considered during the calculation of similarity. `CHAR_SET` can be extended to include additional characters, as necessary. - -### Frequency Table -```c -int exp_fn_i_frequency_table(double *table, char *term) -``` -Helper function for similarity(). Creates a frequency table containing indices corresponding to all characters in `CHAR_SET` (all other characters are ignored). The values in the frequency table will contain the number of times each character appers in `term`. - -The `table` parameter must be allocated prior to calling the function with `nmMalloc()` using `sizeof(x * sizeof(double))`, where `x` is the length of `CHAR_SET`. 
The function will initialize all `table` values to 0, before calculating the frequency values. - -### Relative Frequency Table -```c -int exp_fn_i_relative_frequency_table(double *frequency_table) -``` -Helper function for similarity(). Converts a frequency table into a relative frequency table, where each value in the `frequency_table` is converted to the percent of occurrence (i.e., frequency divided by the sum of total occurrences). - -The `frequency_table` parameter must have been created using the `exp_fn_i_frequency_table` function above. - -### TF-IDF -```c -int exp_fn_i_tf_idf_table(double *frequency_table) -``` -Helper function for similarity(). Creates a TF x IDF vector from a frequency table, where each value in the resulting table is created by multiplying the relative frequency of each letter by the corresponding coefficient in the IDF array. - -The `frequency_table` parameter must have been created using the `exp_fn_i_frequency_table` function above. - -### Dot Product -```c -int exp_fn_i_dot_product(double *dot_product, double *r_freq_table1, double *r_freq_table2) -``` -Helper function for similarity(). Calculates the dot product of two relative frequency tables (sum of the squared values from each relative frequency table). - -The `dot_product` parameter should be initialized to 0 before calling the function. The table parameters must contain relative frequency tables that are generated from the `exp_fn_i_relative_frequency_table` function. The lengths of both tables must equal the length of `CHAR_SET`. - -### Magnitude -```c -int exp_fn_i_magnitude(double *magnitude, double *r_freq_table) -``` -Helper function for similarity(). Calculates the magnitude of a relative frequency table (square root of the sum of the squared relative frequencies). - -The `magnitude` parameter should be initialized to 0 before calling the function. 
The table parameter must contain a relative frequency table that was generated from the `exp_fn_i_relative_frequency_table` function. The length of the frequency table must equal the length of `CHAR_SET`. - -### Similarity -```c -int exp_fn_similarity(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) -``` -Returns a value between 0.0 (completely different) and 1.0 (complete match) reflecting the similarity between the value passed in to i0 and the value passed in to i1. The first two parameters should contain strings that need to be compared. If the value 1 is passed in the third parameter, then the similarity function will rely on TF x IDF scores to determine similarity. If no third parameter is passed, then the function will rely only on relative frequency scores. - -## Future Implementation - -### Inverse Document Frequency (IDF) -In text mining, the most common metric to use in the cosine similarity function is the [TF x IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) metric. Our approach uses only TF (term frequency). Inverse document frequency calculates a weighting factor for each character. This could increase precision a small amount by weighting characters that appear on many records as less important in distinguishing matches, and weighting characters that appear on only certain records as more important. IDF could be calculated by iterating through the entire partner dataset each time. The current approach uses the relative frequency of each letter used in the English language on [Wikipedia](https://en.wikipedia.org/wiki/Letter_frequency), which may not be consistent with the data in the partner database. 
- - diff --git a/centrallix-sysdoc/string_similarity.md b/centrallix-sysdoc/string_similarity.md new file mode 100644 index 00000000..b6c96e8a --- /dev/null +++ b/centrallix-sysdoc/string_similarity.md @@ -0,0 +1,157 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +# String Similarity +The following sections discuss the approaches to calculating similarity between two strings using the `clusters.c` library. This library can be included using `#include "cxlib/clusters.h"` in the centrallix codebase (use `#include "clusters.h"` in other libraries in centrallix-lib). + + +## Table of Contents +- [String Similarity](#string-similarity) + - [Table of Contents](#table-of-contents) + - [Cosine Similarity](#cosine-similarity) + - [Character Sets](#character-sets) + - [Character Pair Hashing](#character-pair-hashing) + - [String Vectors](#string-vectors) + - [Sparse Vectors](#sparse-vectors) + - [Computing Similarity](#computing-similarity) + - [Levenshtein Similarity](#levenshtein-similarity) + - [Clustering](#clustering) + - [K-means Clustering](#k-means-clustering) + - [K-means++ Clustering](#k-means-clustering-1) + - [K-medoids Clustering](#k-medoids-clustering) + - [DBScan Clustering](#db-scan) + - [Sliding Clusters](#sliding-clusters) + - [Future Implementation](#future-implementation) + - [K-means Fuzzy Clustering](#k-means-fuzzy-clustering) + - [Implement Missing Algorithms](#implement-missing-algorithms) + + +## Cosine Similarity +The [Cosine Similarity](https://en.wikipedia.org/wiki/Cosine_similarity) function is defined as the dot product of two vectors divided by the product of the magnitude of the two vectors. Conceptually, it's like finding the _angle_ between two vectors. To get these vectors, we use the relative frequency of character pairs within each string. To reduce memory cost and speed up computation, we store them in a special sparsely allocated form, described below.
+ +### Character Sets +Cosine compare currently uses the following character sets. These can be extended or modified later, if necessary. +```c +const char ALLOW_SET[] = " \n\v\f\r!#$%&\"'()*+,-./:;<=>?@[]^_{|}~ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; +const char CHAR_SET[] = "`abcdefghijklmnopqrstuvwxyz0123456789"; +const char SIGNIFICANT_SET[] = "`ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; +const char IGNORE_SET[] = " \n\v\f\r!#$%&\"'()*+,-./:;<=>?@[]^_{|}"; +const char BOUNDARY_CHAR = ('a' - 1); // aka. '`' +``` +- `ALLOW_SET` represents all characters which can be passed to a similarity detection algorithm. Passing other characters may cause warnings and errors, undefined or unintended behavior, and even security concerns. +- `CHAR_SET` represents all of the characters that will be uniquely considered during the calculation of similarity. Currently, this is all lowercase letters and numbers. +- `SIGNIFICANT_SET` represents all of the characters that are significant for the purposes of similarity. For example, the uppercase letters are significant because they are considered identical to lowercase letters. Thus, they are included in the `SIGNIFICANT_SET`, but not in the `CHAR_SET`. +- `IGNORE_SET` represents characters which, while allowed to be passed to a similarity algorithm, will be ignored. For example, the strings "Ya!!" and "Ya..." will be considered identical. +- The `BOUNDARY_CHAR` is a special character which is conceptually added to the start and end of any string to be checked. + - This allows for pairs that functionally include only the first and last character. + - This character appears to have been selected to be one before the first character in `CHAR_SET` (thus convention dictates that it be written `'a' - 1` to indicate this), although it's unknown if that's the main or only reason. + - If `clusters.h` is included, it can be accessed using the `CA_BOUNDARY_CHAR` macro. 
+ +### Character Pair Hashing +Even with a small set of ASCII characters (say 36), there are still `36^2 = 1296` possible character pairs. If the number of characters in the `CHAR_SET` ever needed to be expanded - for example, to include all UTF-8 characters - this number would grow quadratically to utterly infeasible proportions. Thus, a hashing algorithm is employed to hash each character pair down to a more reasonable number of dimensions (which can be accessed with the `CA_NUM_DIMS` macro). + +### String Vectors +Any string of characters in the `ALLOW_SET` can be represented by a vector. For simplicity, imagine this vector has only `5` dimensions. To find this vector, we hash each character pair in the string. As each character pair is hashed (for example, suppose the pair "ab" happens to hash to `3`), the corresponding dimension is increased by some amount. This amount varies based on the characters in the pair, helping to mitigate the impact of collisions where different character pairs hash to identical numbers (a larger number of dimensions also helps to mitigate this). + +Remember that the first and last characters form a pair with the `BOUNDARY_CHAR`, so the string "ab" has three pairs: "a", "ab", and "b". Suppose these hash to `2`, `3`, and `0`, respectively. Then the vector generated by the string "ab" might be: `[7, 0, 4, 3, 0]`. Notice that dimensions #1 and #4 are both `0` because no character pairs generated a hash of `1` or `4`. In real use cases, the vast majority of elements are `0`s because the number of dimensions used is much larger than the number of character pairs in a typical string. + +### Sparse Vectors +As noted above, the vast majority of elements in a vector generated by a typical string are `0`s. This would lead to a large waste of memory and computation if every `0` was stored separately, so instead, vectors are stored sparsely. Because all hashes are positive integers, we represent `n` `0`s with a value of `-n`.
Thus, the vector `[0, 1, 0, 0, 0]` (representing an empty string in `5` dimensions) would be represented sparsely as `[-1, 1, -3]`. + +**Note**: A value of `0` in a sparse vector is undefined, so no element should be equal to `0`. + +**Note**: Sparse arrays can vary greatly in length. To find their size, one needs to traverse the array until the total number of values found adds up to `CA_NUM_DIMS`. The `ca_sparse_len()` function can be used to do this. Also, the `ca_build_vector()` and `ca_free_vector()` functions use the `nmSys` functions from `newmalloc.h` to avoid conflicts over the size of the allocated data. + +### Computing Similarity +Finally, to find the cosine similarity between two strings, we can simply take the [dot product](https://en.wikipedia.org/wiki/Dot_product) of their corresponding vectors. Then, we normalize the dot product by dividing by the magnitudes of both vectors multiplied together. Two strings can be compared this way using the `ca_cos_compare()` function. + + +## Levenshtein Similarity +The [Levenshtein](https://en.wikipedia.org/wiki/Levenshtein_distance) distance is defined as the number of insertions, deletions, or substitutions required to make one string exactly like another string. The version implemented in `clusters.c` additionally allows a new operation called a "swap" in which two adjacent characters change places. Transpositions of larger pieces of text are, unfortunately, not handled as well, which is a potential downfall of using Levenshtein edit distance. + +The Levenshtein similarity of two strings can be compared using the `ca_lev_compare()` function. + + +## Clustering +When searching for similar strings in a large amount of data (for example, `1,000,000` strings), comparing every string to every other string can be very computationally expensive. To speed up this process, it is helpful to _cluster_ similar strings together, then only compare strings within similar clusters.
This sacrifices some accuracy to allow large amounts of data to be searched and compared in a feasible amount of time. + +### K-means Clustering +When clustering data using the [k-means](https://en.wikipedia.org/wiki/K-means_clustering) algorithm, data is divided into a predefined number of clusters with the goal of maximizing the average similarity of data points within any given cluster. To quickly summarize the algorithm: +1. Randomly select `k` data points to be the initial centroids of each cluster. +2. For each data point, find the centroid it is most similar to, and assign it to that cluster. +3. For each cluster, find the new centroid by averaging all data points in the cluster. +4. Repeat steps 2 and 3 until the clusters stabilize (i.e. no data point changes clusters). + +The implementation used in `clusters.c` also allows the programmer to specify a maximum number of iterations (called `max_iter` in the code) to prevent this process from running forever. Additionally, successive iterations can give diminishing results or even produce clusters that are slightly worse. To improve performance, the programmer can also specify a minimum improvement threshold (called `min_improvement`). Clusters must become more similar by at least this amount each iteration, otherwise the algorithm ends, even if the maximum number of iterations has not yet been reached. + +The `ca_kmeans()` function can be invoked using [the cosine comparison string vectors](#string-vectors) (see above) to cluster them into similar clusters. + +### K-means++ Clustering +**Not yet implemented** +This method is largely identical to k-means, except that [k-means++](https://en.wikipedia.org/wiki/K-means%2B%2B) assigns the initial centroids using an approximate algorithm designed to avoid some of the poor clustering possible with random assignment. 
+ +### K-medoids Clustering +**Not yet implemented** +This method is also very similar to k-means, except that [k-medoids](https://en.wikipedia.org/wiki/K-medoids) places an additional requirement that all centroids be points in the data. This would theoretically allow for other similarity measures (such as Levenshtein edit distance) to be used for clustering instead of only cosine compare. + +### DB-Scan +**Proposed, not yet implemented or documented** + +### Sliding Clusters +A far more basic method of "clustering" is to simply sort all data alphabetically, then, instead of comparing each string to all other strings, it can be compared to only the next `n` strings. Of course, differences near the start of a string (for example, "fox" vs. "box") will cause those strings to sort far away from each other, leading them to be completely missed. + +Sorting using a similarity measure, such as `ca_cos_compare()` or `ca_lev_compare()` would resolve this issue. However, these comparison functions do not meet the transitivity requirement for sorting, which is that `(A < B) & (B < C) -> (A < C)`. For example, "car" is similar to "boxcar", which is also similar to "box". However, "car" and "box" are not similar at all. + +Additionally, sorting by the cosine vectors (similarly to how we cluster by them when using k-means) was proposed, but further investigation showed that this was also not possible. + +For problems where a sorting algorithm exists which can mitigate the above issues, this solution may prove very promising. However, so far we have not found such a problem, so the other clustering algorithms tend to outperform Sliding Clusters. + + +## Future Implementation + +### K-means Fuzzy Clustering +One of the biggest downsides with k-means is that it creates very arbitrary boundaries between clusters. Elements on either side of these boundaries may be highly similar, but if comparisons only occur within a cluster, these similar entries will be missed. 
The problem becomes more extreme as a higher k value (more clusters) is used, creating more arbitrary boundaries. This drawback is probably the main reason that clustering sacrifices some accuracy over searching every element. + +Running the entire search multiple times may allow some of these missed matches to be found because the initial cluster locations are random. This approach is partially implemented for duplicate searching because the algorithm runs nightly anyway, so a simple upsert (**UP**date existing entries; in**SERT** new entries) slightly reduces this problem. However, this solution is obviously far from ideal. + +If the clustering could be expanded with an additional step that makes clusters larger, adding elements from other clusters to them, this might effectively mitigate the issue. It may also allow developers to use larger numbers of clusters, improving performance as well as accuracy. Further research is needed to verify the effectiveness of this approach before an implementation is written. + +### Implement Missing Algorithms +Several algorithms (such as [k-means++](#k-means-clustering-1), [k-medoids](#k-medoids-clustering), and [DBScan](#db-scan)) above are proposed but lack an implementation. They may be effective and useful; however, to reduce development time, they have not yet been implemented. + +### Upgrade Other Duplicate Detection Systems +When a new record is entered, a quick scan is run to check if it might be a duplicate. There is also a button in the UI for a record that lets you run a duplicate check. These systems could also be upgraded using the new algorithms and strategies developed for general duplicate detection. + +### Known Issues +- The cluster driver often fails to open the structure file if it was modified since the last time the path was opened. Opening a different path (including the root path, even though it does not support queries) fixes this issue. 
This is either a bug in the st_node caching or in the cluster driver's usage of stparse. +- The cluster does not invalidate caches if the underlying data source changes. This bug exists because I wasn't sure how to do this, but I'm pretty sure it's possible. Workaround: Developers should use `exec "cache" "drop_all"` to invalidate caches when data is changed, or use a fresh object system instance. diff --git a/centrallix/Makefile.in b/centrallix/Makefile.in index 7d2b1e23..0d13843d 100644 --- a/centrallix/Makefile.in +++ b/centrallix/Makefile.in @@ -115,6 +115,7 @@ XOBJDRIVERS=objdrv_ux.o \ objdrv_uxprint.o \ objdrv_qytree.o \ objdrv_qypivot.o \ + objdrv_cluster.o \ objdrv_datafile.o \ objdrv_audio.o \ objdrv_link.o \ @@ -133,6 +134,7 @@ XV3OBJDRIVERS= \ objdrv_uxprint_v3.o \ objdrv_qytree.o \ objdrv_qypivot.o \ + objdrv_cluster.o \ objdrv_query.o \ objdrv_datafile.o \ objdrv_audio.o \ @@ -314,6 +316,7 @@ XEXPRMODS=exp_main.o \ exp_compiler.o \ exp_evaluate.o \ exp_functions.o \ + exp_double_metaphone.o \ exp_generator.o EXPRMODS=$(patsubst %,expression/%,$(XEXPRMODS)) diff --git a/centrallix/centrallix.c b/centrallix/centrallix.c index 6467ab2b..75e19d12 100644 --- a/centrallix/centrallix.c +++ b/centrallix/centrallix.c @@ -440,6 +440,7 @@ cxDriverInit() stxInitialize(); /* Structure file driver */ qytInitialize(); /* Query Tree driver */ qypInitialize(); /* Query Pivot driver */ + clusterInitialize(); /* Cluster driver */ qyInitialize(); /* stored query (aka view) driver */ rptInitialize(); /* report writer driver */ uxpInitialize(); /* UNIX printer access driver */ @@ -694,4 +695,3 @@ cxLinkSigningSetup(pStructInf my_config) return 0; } - diff --git a/centrallix/etc/types.cfg b/centrallix/etc/types.cfg index 11ebc3e3..6cbac5ae 100644 --- a/centrallix/etc/types.cfg +++ b/centrallix/etc/types.cfg @@ -51,6 +51,7 @@ "system/symbolic-link" "Symbolic Link" lnk "" "text/plain" "text/css" "CSS File" css "" "text/plain" "system/querypivot" "Query Pivot Object" qyp "" 
"system/structure" +"system/cluster" "Clustering Object" cluster "" "system/structure" "application/json" "JSON data" json "" "text/plain" "text/json" "JSON data" "" "" "application/json" "text/x-json" "JSON data" "" "" "application/json" diff --git a/centrallix/expression/exp_compiler.c b/centrallix/expression/exp_compiler.c index 48f97592..702b31f8 100644 --- a/centrallix/expression/exp_compiler.c +++ b/centrallix/expression/exp_compiler.c @@ -1043,8 +1043,8 @@ expCompileExpression(char* text, pParamObjects objlist, int lxflags, int cmpflag /*** expBindExpression - do late binding of an expression tree to an *** object list. 'domain' specifies the requested bind domain, whether - *** runstatic (EXP_F_RUNSTATIC), runserver (EXP_F_RUNSERVER), or runclient - *** (EXP_F_RUNCLIENT). 'domain' can also be -0-, in which case we rebind + *** runstatic (EXPR_F_RUNSTATIC), runserver (EXPR_F_RUNSERVER), or runclient + *** (EXPR_F_RUNCLIENT). 'domain' can also be -0-, in which case we rebind *** a domainless expression. 
***/ int @@ -1072,16 +1072,10 @@ expBindExpression(pExpression exp, pParamObjects objlist, int flags) break; } } - if (exp->ObjID == -1) - { - cm |= EXPR_MASK_EXTREF; - } - } - else if (exp->ObjID == -2 || exp->ObjID == -3) - { - if (exp->ObjID == -2) cm |= (1<<(objlist->CurrentID)); - if (exp->ObjID == -3) cm |= (1<<(objlist->ParentID)); + cm |= EXPR_MASK_EXTREF; } + else if (exp->ObjID == EXPR_CTL_CURRENT) cm |= (1<<(objlist->CurrentID)); + else if (exp->ObjID == EXPR_CTL_PARENT) cm |= (1<<(objlist->ParentID)); else if (exp->ObjID >= 0) { cm |= (1<<(exp->ObjID)); @@ -1105,4 +1099,3 @@ expBindExpression(pExpression exp, pParamObjects objlist, int flags) return cm; } - diff --git a/centrallix/expression/exp_double_metaphone.c b/centrallix/expression/exp_double_metaphone.c new file mode 100644 index 00000000..8b7c4cd6 --- /dev/null +++ b/centrallix/expression/exp_double_metaphone.c @@ -0,0 +1,1521 @@ +/************************************************************************/ +/* Text-DoubleMetaphone */ +/* Centrallix Core */ +/* */ +/* Copyright 2000, Maurice Aubrey . */ +/* All rights reserved. */ +/* */ +/* This code is copied for redistribution with modification, from the */ +/* gitpan/Text-DoubleMetaphone implementation on GitHub (1), which is */ +/* under the following license. */ +/* */ +/* This code is based heavily on the C++ implementation by Lawrence */ +/* Philips and incorporates several bug fixes courtesy of Kevin */ +/* Atkinson . */ +/* */ +/* This module is free software; you may redistribute it and/or */ +/* modify it under the same terms as Perl itself. */ +/* */ +/* A summary of the relevant content from https://dev.perl.org/licenses */ +/* has been included below for the convenience of the reader. This */ +/* information was collected and saved on September 5th, 2025 and may */ +/* differ from current information. For the most up to date copy of */ +/* this information, please use the link provided above. 
*/ +/* */ +/* Perl5 is Copyright © 1993 and later, by Larry Wall and others. */ +/* */ +/* It is free software; you can redistribute it and/or modify it */ +/* under the terms of either: */ +/* */ +/* a) the GNU General Public License (2) as published by the Free */ +/* Software Foundation (3); either version 1 (2), or (at your */ +/* option) any later version (4), or */ +/* */ +/* b) the "Artistic License" (5). */ +/* */ +/* Citations: */ +/* 1: https://github.com/gitpan/Text-DoubleMetaphone */ +/* 2: https://dev.perl.org/licenses/gpl1.html */ +/* 3: http://www.fsf.org */ +/* 4: http://www.fsf.org/licenses/licenses.html#GNUGPL */ +/* 5: https://dev.perl.org/licenses/artistic.html */ +/* */ +/* Centrallix is published under the GNU General Public License, */ +/* satisfying the above requirement. A summary of this is included */ +/* below for the convenience of the reader. */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA */ +/* 02111-1307 USA */ +/* */ +/* A copy of the GNU General Public License has been included in this */ +/* distribution in the file "COPYING". 
*/ +/* */ +/* Module: exp_double_metaphone.c */ +/* Author: Maurice Aubrey */ +/* Description: This module implements a "sounds like" algorithm by */ +/* Lawrence Philips which he published in the June, 2000 */ +/* issue of C/C++ Users Journal. Double Metaphone is an */ +/* improved version of the original Metaphone algorithm */ +/* written by Philips. This implementation was written by */ +/* Maurice Aubrey for C/C++ with bug fixes provided by */ +/* Kevin Atkinson. It was revised by Israel Fuller to */ +/* better align with the Centrallix coding style and */ +/* standards so that it could be included here. */ +/************************************************************************/ + +/*** Note to future programmers reading this file (by Israel Fuller): + *** + *** This file was copied from a GitHub Repo with proper licensing (in case + *** you didn't read the legal stuff above), so feel free to check it out. + *** + *** As for this code, I've modified it to use styling and memory allocation + *** consistent with the rest of the Centrallix codebase. Also, I have added + *** documentation comments and extensive test cases (at the end of the file), + *** however, these reflect my own (possibly incorrect) understanding, which + *** might not line up with the original author's intent. + *** + *** To be honest, though, trying to make this code as readable as possible + *** was very challenging due to all the messy boolean algebra. If there is + *** ever a professional linguist reading this, please factor out some of the + *** logic into local variables with descriptive names so that the rest of us + *** can read this code without our eyes glazing over. + *** + *** If you have any questions, please feel free to reach out to me or Greg. 
+ *** + *** Original Source: https://github.com/gitpan/Text-meta_double_metaphone + ***/ + +#include +#include +#include +#include +#include +#include + +/*** If running in a testing environment, newmalloc is not + *** available, so we fall back to default C memory allocation. + ***/ +#ifndef TESTING +#include "cxlib/newmalloc.h" +#define META_MALLOC(size) nmSysMalloc(size) +#define META_REALLOC(ptr, size) nmSysRealloc(ptr, size) +#define META_FREE(ptr) nmSysFree(ptr) +#else +#include +#define META_MALLOC(size) malloc(size) +#define META_REALLOC(ptr, size) realloc(ptr, size) +#define META_FREE(ptr) free(ptr) +#endif + +/*** Helper function to handle checking for failed memory allocation + *** Author: Israel Fuller. + *** + *** @param ptr Pointer to the memory that should be allocated. + *** @param fname The name of the function invoked to allocate memory. + *** @param size The amount of memory being allocated. + *** @returns The pointer, for chaining. + ***/ +void* meta_check_allocation(void* ptr, const char* fname, const size_t size) + { + if (ptr == NULL) + { + /** Create the most descriptive error message we can. **/ + char error_buf[BUFSIZ]; + snprintf(error_buf, sizeof(error_buf), "exp_double_metaphone.c: Fail - %s(%lu)", fname, size); + perror(error_buf); + + // Throw error for easier locating in a debugger. + fprintf(stderr, "Program will now crash.\n"); + assert(0); + } + return ptr; + } + +/** Malloc shortcut macros. **/ +#define SAFE_MALLOC(size) \ + ({ \ + const size_t sz = (size); \ + memset(meta_check_allocation(META_MALLOC(sz), "META_MALLOC", sz), 0, sz); \ + }) +#define SAFE_REALLOC(ptr, size) \ + ({ \ + const size_t sz = (size); \ + meta_check_allocation(META_REALLOC(ptr, sz), "META_REALLOC", sz); \ + }) + +typedef struct + { + char* str; + size_t length; + size_t bufsize; + int free_str_on_destroy; + } +MetaString; + +/*** Allocates a new MetaString. + *** + *** @param init_str The initial size of the string. + *** @returns The new MetaString. 
+ ***/ +MetaString* meta_new_string(const char* init_str) + { + MetaString *s; + char empty_string[] = ""; + + s = (MetaString*)SAFE_MALLOC(sizeof(MetaString)); + + if (init_str == NULL) + init_str = empty_string; + + s->length = strlen(init_str); + /** Preallocate a bit more for potential growth. **/ + s->bufsize = s->length + 7u; + + s->str = (char*)SAFE_MALLOC(s->bufsize * sizeof(char)); + + strncpy(s->str, init_str, s->length + 1); + s->free_str_on_destroy = 1; + + return s; + } + +/*** Frees a MetaString. + *** + *** @param s The MetaString. + ***/ +void meta_destroy_string(MetaString* s) + { + if (s == NULL) + return; + + if (s->free_str_on_destroy && s->str != NULL) + META_FREE(s->str); + + META_FREE(s); + } + +/*** Increases a MetaString's buffer size. + *** + *** @param s The MetaString* being modified. + *** @param chars_needed Minimum number of characters to increase buffer size. + ***/ +void meta_increase_buffer(MetaString* s, const size_t chars_needed) + { + s->bufsize += chars_needed + 8u; + s->str = SAFE_REALLOC(s->str, s->bufsize * sizeof(char)); + } + +/*** Convert all characters of a MetaString to uppercase. + *** + *** @param s The MetaString being modified. + ***/ +void meta_make_upper(MetaString* s) + { + for (char* i = s->str; i[0] != '\0'; i++) + *i = (char)toupper(*i); + } + +/*** @param s The MetaString being checked. + *** @param pos The character location to check within the MetaString. + *** @returns 1 if the location is out of bounds for the MetaString, + *** 0 otherwise. + ***/ +bool meta_is_out_of_bounds(MetaString* s, unsigned int pos) + { + return (s->length <= pos); + } + +/*** Checks if a character in a MetaString is a vowel. + *** + *** @param s The MetaString being checked. + *** @param pos The character location to check within the MetaString. 
+ ***/ +bool meta_is_vowel(MetaString* s, unsigned int pos) + { + if (meta_is_out_of_bounds(s, pos)) return 0; + + const char c = *(s->str + pos); + return ((c == 'A') || (c == 'E') || (c == 'I') || + (c == 'O') || (c == 'U') || (c == 'Y')); + } + +/*** Search a MetaString for "W", "K", "CZ", or "WITZ", which indicate that the + *** string is Slavo Germanic. + *** + *** @param s The MetaString to be searched. + *** @returns 1 if the MetaString is Slavo Germanic, or 0 otherwise. + ***/ +bool meta_is_slavo_germanic(MetaString* s) + { + return (strstr(s->str, "W") != NULL) + || (strstr(s->str, "K") != NULL) + || (strstr(s->str, "CZ") != NULL) + || (strstr(s->str, "WITZ") != NULL); + } + +/*** @param s The MetaString being checked. + *** @param pos The character location to check within the MetaString. + *** @returns The character at the position in the MetaString, or + *** '\0' if the position is not in the MetaString. + ***/ +char meta_get_char_at(MetaString* s, unsigned int pos) + { + return (meta_is_out_of_bounds(s, pos)) ? '\0' : ((char) *(s->str + pos)); + } + +/*** Checks whether any of a list of strings appears in the given + *** MetaString at the given start position. + *** + *** @attention - Note that the START value is 0 based. + *** + *** @param s The MetaString being checked. + *** @param start The zero-based position within the MetaString at which + *** each candidate string must begin. + *** @param ... The candidate strings (char*), terminated by an empty string (""). + *** @returns true if any candidate matches the MetaString at the start + *** position, and false otherwise (including when start is out of bounds). + ***/ +bool meta_is_str_at(MetaString* s, unsigned int start, ...) + { + va_list ap; + + /** Out-of-range starts never match. 
This includes unsigned wrap-around in callers that pass offsets such as (current - 2). **/ + if (meta_is_out_of_bounds(s, start)) + return 0; + + const char* pos = (s->str + start); + va_start(ap, start); + + char* test; + do + { + test = va_arg(ap, char*); + if (*test && (strncmp(pos, test, strlen(test)) == 0)) + break; /* Matched; leave the loop so va_end() still runs. */ + } + while (test[0] != '\0'); + + va_end(ap); + + return (test[0] != '\0'); /* Non-empty only when a candidate matched. */ + } + +/*** Adds a string to a MetaString, expanding the MetaString if needed. + *** + *** @param s The MetaString being modified. + *** @param new_str The string being added. + ***/ +void meta_add_str(MetaString* s, const char* new_str) + { + if (new_str == NULL) + return; + + const size_t add_length = strlen(new_str); + if ((s->length + add_length) > (s->bufsize - 1)) + meta_increase_buffer(s, add_length); + + strcat(s->str, new_str); + s->length += add_length; + } + +/*** Computes double metaphone. + *** + *** Example Usage: + *** ```c + *** char* primary_code; + *** char* secondary_code; + *** meta_double_metaphone(input, &primary_code, &secondary_code); + *** ``` + *** + *** @param str The string to compute. + *** @param primary_code A pointer to a buffer where the pointer to a string + *** containing the produced primary code will be stored. + *** @param secondary_code A pointer to a buffer where the pointer to a string + *** containing the produced secondary code will be stored. + ***/ +void meta_double_metaphone(const char* str, char** primary_code, char** secondary_code) + { + size_t length; + if (str == NULL || (length = strlen(str)) == 0u) { + fprintf(stderr, "Warning: Call to meta_double_metaphone() with invalid string.\n"); + + /** Double Metaphone on an invalid string yields two empty strings. **/ + *primary_code = (char*)SAFE_MALLOC(sizeof(char)); + *secondary_code = (char*)SAFE_MALLOC(sizeof(char)); + return; + } + unsigned int current = 0; + unsigned int last = (unsigned int)(length - 1); + + /** Pad original so we can index beyond end. 
**/ + MetaString* original = meta_new_string(str); + meta_make_upper(original); + meta_add_str(original, " "); + + MetaString* primary = meta_new_string(""); + MetaString* secondary = meta_new_string(""); + primary->free_str_on_destroy = 0; + secondary->free_str_on_destroy = 0; + + /** Skip these if they are at start of a word. **/ + if (meta_is_str_at(original, 0, "GN", "KN", "PN", "WR", "PS", "")) + current += 1; + + /** Initial 'X' is pronounced 'Z' e.g. 'Xavier' **/ + const char first_char = meta_get_char_at(original, 0); + if (first_char == 'X') + { + meta_add_str(primary, "S"); /* 'Z' maps to 'S' */ + meta_add_str(secondary, "S"); + current += 1; + } + + /** Precomputing this is useful. **/ + const bool is_slavo_germanic = meta_is_slavo_germanic(original); + + /** Main loop. **/ + while (current < length) + { + const char cur_char = meta_get_char_at(original, current); + const char next_char = meta_get_char_at(original, current + 1); + switch (cur_char) + { + case 'A': + case 'E': + case 'I': + case 'O': + case 'U': + case 'Y': + { + if (current == 0) + { + /** All init vowels now map to 'A'. **/ + meta_add_str(primary, "A"); + meta_add_str(secondary, "A"); + } + current += 1; + break; + } + + case 'B': + { + /** "-mb", e.g", "dumb", already skipped over... **/ + meta_add_str(primary, "P"); + meta_add_str(secondary, "P"); + + current += (next_char == 'B') ? 2 : 1; + break; + } + + case 'C': + { + /** Various germanic. 
**/ + if ( + (current > 1) + && !meta_is_vowel(original, current - 2) + && meta_is_str_at(original, (current - 1), "ACH", "") + && meta_get_char_at(original, current + 2) != 'I' + && ( + meta_get_char_at(original, current + 2) != 'E' + || meta_is_str_at(original, (current - 2), "BACHER", "MACHER", "") + ) + ) + { + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + current += 2; + break; + } + + /** Special case 'caesar' **/ + if (current == 0 && meta_is_str_at(original, current, "CAESAR", "")) + { + meta_add_str(primary, "S"); + meta_add_str(secondary, "S"); + current += 2; + break; + } + + /** Italian 'chianti' **/ + if (meta_is_str_at(original, current, "CHIA", "")) + { + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + current += 2; + break; + } + + if (meta_is_str_at(original, current, "CH", "")) + { + /** Find 'michael' **/ + if (current > 0 && meta_is_str_at(original, current, "CHAE", "")) + { + meta_add_str(primary, "K"); + meta_add_str(secondary, "X"); + current += 2; + break; + } + + /** Greek roots e.g. 'chemistry', 'chorus' **/ + if ( + current == 0 + && meta_is_str_at(original, (current + 1), "HOR", "HYM", "HIA", "HEM", "HARAC", "HARIS", "") + && !meta_is_str_at(original, 0, "CHORE", "") + ) + { + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + current += 2; + break; + } + + /** Germanic, greek, or otherwise 'ch' for 'kh' sound. 
*/ + if ( + meta_is_str_at(original, 0, "SCH", "VAN ", "VON ", "") + /** 'architect but not 'arch', 'orchestra', 'orchid' **/ + || meta_is_str_at(original, (current - 2), "ORCHES", "ARCHIT", "ORCHID", "") + || meta_is_str_at(original, (current + 2), "T", "S", "") + || ( + (current == 0 || meta_is_str_at(original, (current - 1), "A", "O", "U", "E", "")) + /** e.g., 'wachtler', 'wechsler', but not 'tichner' **/ + && meta_is_str_at(original, (current + 2), "L", "R", "N", "M", "B", "H", "F", "V", "W", " ", "") + ) + ) + { + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + } + else + { + if (current > 0) + { + if (meta_is_str_at(original, 0, "MC", "")) + { + /* e.g., "McHugh" */ + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + } + else + { + meta_add_str(primary, "X"); + meta_add_str(secondary, "K"); + } + } + else + { + meta_add_str(primary, "X"); + meta_add_str(secondary, "X"); + } + } + current += 2; + break; + } + + /** e.g, 'czerny' **/ + if (meta_is_str_at(original, current, "CZ", "") + && !meta_is_str_at(original, (current - 2), "WICZ", "")) + { + meta_add_str(primary, "S"); + meta_add_str(secondary, "X"); + current += 2; + break; + } + + /** e.g., 'focaccia' **/ + if (meta_is_str_at(original, (current + 1), "CIA", "")) + { + meta_add_str(primary, "X"); + meta_add_str(secondary, "X"); + current += 3; + break; + } + + /** Double 'C' rule. **/ + if ( + meta_is_str_at(original, current, "CC", "") + && !(current == 1 && first_char == 'M') /* McClellan exception. 
*/ + ) + { + /** 'bellocchio' but not 'bacchus' **/ + if ( + meta_is_str_at(original, (current + 2), "I", "E", "H", "") + && !meta_is_str_at(original, (current + 2), "HU", "") + ) + { + /** 'accident', 'accede' 'succeed' **/ + if ( + (current == 1 && meta_get_char_at(original, current - 1) == 'A') + || meta_is_str_at(original, (current - 1), "UCCEE", "UCCES", "") + ) + { + meta_add_str(primary, "KS"); + meta_add_str(secondary, "KS"); + /** 'bacci', 'bertucci', other italian **/ + } + else + { + meta_add_str(primary, "X"); + meta_add_str(secondary, "X"); + } + current += 3; + break; + } + else + { /** Pierce's rule **/ + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + current += 2; + break; + } + } + + if (meta_is_str_at(original, current, "CK", "CG", "CQ", "")) + { + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + current += 2; + break; + } + + if (meta_is_str_at(original, current, "CI", "CE", "CY", "")) + { + /* Italian vs. English */ + if (meta_is_str_at(original, current, "CIO", "CIE", "CIA", "")) + { + meta_add_str(primary, "S"); + meta_add_str(secondary, "X"); + } + else + { + meta_add_str(primary, "S"); + meta_add_str(secondary, "S"); + } + current += 2; + break; + } + + /** else **/ + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + + /** Name sent in 'mac caffrey', 'mac gregor **/ + if (meta_is_str_at(original, (current + 1), " C", " Q", " G", "")) + current += 3; + else if (meta_is_str_at(original, (current + 1), "C", "K", "Q", "") + && !meta_is_str_at(original, (current + 1), "CE", "CI", "")) + current += 2; + else + current += 1; + break; + } + + case 'D': + { + if (meta_is_str_at(original, current, "DG", "")) + { + if (meta_is_str_at(original, (current + 2), "I", "E", "Y", "")) + { + /** e.g. 'edge' **/ + meta_add_str(primary, "J"); + meta_add_str(secondary, "J"); + current += 3; + break; + } + else + { + /** e.g. 
'edgar' **/ + meta_add_str(primary, "TK"); + meta_add_str(secondary, "TK"); + current += 2; + break; + } + } + + if (meta_is_str_at(original, current, "DT", "DD", "")) + { + meta_add_str(primary, "T"); + meta_add_str(secondary, "T"); + current += 2; + break; + } + + /** else **/ + meta_add_str(primary, "T"); + meta_add_str(secondary, "T"); + current += 1; + break; + } + + case 'F': + { + current += (next_char == 'F') ? 2 : 1; + meta_add_str(primary, "F"); + meta_add_str(secondary, "F"); + break; + } + + case 'G': + { + if (next_char == 'H') + { + /** 'Vghee' */ + if (current > 0 && !meta_is_vowel(original, (current - 1))) + { + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + current += 2; + break; + } + + if (current < 3) + { + /** 'ghislane', 'ghiradelli' **/ + if (current == 0) + { + if (meta_get_char_at(original, (current + 2)) == 'I') + { + meta_add_str(primary, "J"); + meta_add_str(secondary, "J"); + } + else + { + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + } + current += 2; + break; + } + } + + if ( + /** Parker's rule (with some further refinements) - e.g., 'hugh' **/ + (current > 1 && meta_is_str_at(original, (current - 2), "B", "H", "D", "")) + /** e.g., 'bough' **/ + || (current > 2 && meta_is_str_at(original, (current - 3), "B", "H", "D", "")) + /** e.g., 'broughton' **/ + || (current > 3 && meta_is_str_at(original, (current - 4), "B", "H", "")) + ) + { + current += 2; + break; + } + else + { + /** e.g., 'laugh', 'McLaughlin', 'cough', 'gough', 'rough', 'tough' **/ + if ( + current > 2 + && meta_get_char_at(original, (current - 1)) == 'U' + && meta_is_str_at(original, (current - 3), "C", "G", "L", "R", "T", "") + ) + { + meta_add_str(primary, "F"); + meta_add_str(secondary, "F"); + } + else if (current > 0 && meta_get_char_at(original, (current - 1)) != 'I') + { + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + } + + current += 2; + break; + } + } + + if (next_char == 'N') + { + if (current == 1 && 
!is_slavo_germanic && meta_is_vowel(original, 0)) + { + meta_add_str(primary, "KN"); + meta_add_str(secondary, "N"); + } + else + /** not e.g. 'cagney' **/ + if ( + next_char != 'Y' + && !is_slavo_germanic + && !meta_is_str_at(original, (current + 2), "EY", "") + ) + { + meta_add_str(primary, "N"); + meta_add_str(secondary, "KN"); + } + else + { + meta_add_str(primary, "KN"); + meta_add_str(secondary, "KN"); + } + current += 2; + break; + } + + /** 'tagliaro' **/ + if ( + !is_slavo_germanic + && meta_is_str_at(original, (current + 1), "LI", "") + ) + { + meta_add_str(primary, "KL"); + meta_add_str(secondary, "L"); + current += 2; + break; + } + + /** -ges-,-gep-,-gel-, -gie- at beginning **/ + if ( + current == 0 + && ( + next_char == 'Y' + || meta_is_str_at( + original, (current + 1), + "ES", "EP", "EB", "EL", "EY", "IB", + "IL", "IN", "IE", "EI", "ER", "" + ) + ) + ) + { + meta_add_str(primary, "K"); + meta_add_str(secondary, "J"); + current += 2; + break; + } + + /** -ger-, -gy- **/ + if ( + (next_char == 'Y' || meta_is_str_at(original, (current + 1), "ER", "")) + /** Exceptions. **/ + && !meta_is_str_at(original, 0, "DANGER", "RANGER", "MANGER", "") + && !meta_is_str_at(original, (current - 1), "E", "I", "RGY", "OGY", "") + ) + { + meta_add_str(primary, "K"); + meta_add_str(secondary, "J"); + current += 2; + break; + } + + /** Italian e.g, 'biaggi' **/ + if ( + meta_is_str_at(original, (current + 1), "E", "I", "Y", "") + || meta_is_str_at(original, (current - 1), "AGGI", "OGGI", "") + ) + { + /** Obvious germanic. **/ + if (meta_is_str_at(original, 0, "SCH", "VAN ", "VON ", "") + || meta_is_str_at(original, (current + 1), "ET", "")) + { + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + } + else + { + /** Always soft, if french ending. 
**/ + if (meta_is_str_at(original, (current + 1), "IER ", "")) + { + meta_add_str(primary, "J"); + meta_add_str(secondary, "J"); + } + else + { + meta_add_str(primary, "J"); + meta_add_str(secondary, "K"); + } + } + current += 2; + break; + } + + current += (next_char == 'G') ? 2 : 1; + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + break; + } + + case 'H': + { + /** Only keep if first & before vowel or between 2 vowels. **/ + if ( + (current == 0 || meta_is_vowel(original, (current - 1))) + && meta_is_vowel(original, current + 1) + ) + { + meta_add_str(primary, "H"); + meta_add_str(secondary, "H"); + current += 2; + } + else /* also takes care of 'HH' */ + current += 1; + break; + } + + case 'J': + { + /** Obvious spanish, 'jose', 'san jacinto' **/ + const bool has_jose_next = meta_is_str_at(original, current, "JOSE", ""); + const bool starts_with_san = meta_is_str_at(original, 0, "SAN ", ""); + if (has_jose_next || starts_with_san) + { + if ( + starts_with_san + /** I don't know what this condition means. **/ + || (current == 0 && meta_get_char_at(original, current + 4) == ' ') + ) + { + meta_add_str(primary, "H"); + meta_add_str(secondary, "H"); + } + else + { + meta_add_str(primary, "J"); + meta_add_str(secondary, "H"); + } + current += 1; + break; + } + + if (current == 0 && !has_jose_next) + { + meta_add_str(primary, "J"); /* Yankelovich/Jankelowicz */ + meta_add_str(secondary, "A"); + } + else + { + /** spanish pron. of e.g. 
'bajador' **/ + if ( + !is_slavo_germanic + && (next_char == 'A' || next_char == 'O') + && meta_is_vowel(original, (current - 1)) + ) + { + meta_add_str(primary, "J"); + meta_add_str(secondary, "H"); + } + else + { + if (current == last) + { + meta_add_str(primary, "J"); + meta_add_str(secondary, ""); + } + else + { + if ( + !meta_is_str_at(original, (current + 1), "L", "T", "K", "S", "N", "M", "B", "Z", "") + && !meta_is_str_at(original, (current - 1), "S", "K", "L", "") + ) + { + meta_add_str(primary, "J"); + meta_add_str(secondary, "J"); + } + } + } + } + + current += (next_char == 'J') ? 2 : 1; + break; + } + + case 'K': + { + current += (next_char == 'K') ? 2 : 1; + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + break; + } + + case 'L': + { + if (next_char == 'L') + { + /** Spanish e.g. 'cabrillo', 'gallegos' **/ + if ( + ( + current == length - 3 + && meta_is_str_at(original, (current - 1), "ILLO", "ILLA", "ALLE", "") + ) + || ( + meta_is_str_at(original, (current - 1), "ALLE", "") + && ( + meta_is_str_at(original, (last - 1), "AS", "OS", "") + || meta_is_str_at(original, last, "A", "O", "") + ) + ) + ) + { + meta_add_str(primary, "L"); + meta_add_str(secondary, ""); + current += 2; + break; + } + current += 2; + } + else + current += 1; + meta_add_str(primary, "L"); + meta_add_str(secondary, "L"); + break; + } + + case 'M': + { + current += ( + ( + meta_is_str_at(original, (current - 1), "UMB", "") + && (current + 1 == last || meta_is_str_at(original, (current + 2), "ER", "")) + ) + /** 'dumb','thumb' **/ + || next_char == 'M' + ) ? 2 : 1; + meta_add_str(primary, "M"); + meta_add_str(secondary, "M"); + break; + } + + case 'N': + { + current += (next_char == 'N') ? 
2 : 1; + meta_add_str(primary, "N"); + meta_add_str(secondary, "N"); + break; + } + + case 'P': + { + if (next_char == 'H') + { + meta_add_str(primary, "F"); + meta_add_str(secondary, "F"); + current += 2; + break; + } + + /** Also account for "campbell", "raspberry" **/ + current += (meta_is_str_at(original, (current + 1), "P", "B", "")) ? 2 : 1; + meta_add_str(primary, "P"); + meta_add_str(secondary, "P"); + break; + } + + case 'Q': + { + current += (next_char == 'Q') ? 2 : 1; + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + break; + } + + case 'R': + { + /** French e.g. 'rogier', but exclude 'hochmeier' **/ + const bool no_primary = ( + !is_slavo_germanic + && current == last + && meta_is_str_at(original, (current - 2), "IE", "") + && !meta_is_str_at(original, (current - 4), "ME", "MA", "") + ); + + meta_add_str(primary, (no_primary) ? "" : "R"); + meta_add_str(secondary, "R"); + current += (next_char == 'R') ? 2 : 1; + break; + } + + case 'S': + { + /** Special cases 'island', 'isle', 'carlisle', 'carlysle' **/ + if (meta_is_str_at(original, (current - 1), "ISL", "YSL", "")) + { + current += 1; + break; + } + + /** Special case 'sugar-' **/ + if (current == 0 && meta_is_str_at(original, current, "SUGAR", "")) + { + meta_add_str(primary, "X"); + meta_add_str(secondary, "S"); + current += 1; + break; + } + + if (meta_is_str_at(original, current, "SH", "")) + { + const bool germanic = meta_is_str_at(original, (current + 1), "HEIM", "HOEK", "HOLM", "HOLZ", ""); + const char* sound = (germanic) ? "S" : "X"; + meta_add_str(primary, sound); + meta_add_str(secondary, sound); + current += 2; + break; + } + + /** Italian & Armenian. **/ + if (meta_is_str_at(original, current, "SIO", "SIA", "SIAN", "")) + { + meta_add_str(primary, "S"); + meta_add_str(secondary, (is_slavo_germanic) ? "S" : "X"); + current += 3; + break; + } + + /** german & anglicisations, e.g. 
'smith' match 'schmidt', 'snider' match 'schneider' **/ + /** also, -sz- in slavic language although in hungarian it is pronounced 's' **/ + if (current == 0 && meta_is_str_at(original, (current + 1), "M", "N", "L", "W", "")) + { + meta_add_str(primary, "S"); + meta_add_str(secondary, "X"); + current += 1; + break; + } + if (meta_is_str_at(original, (current + 1), "Z", "")) + { + meta_add_str(primary, "S"); + meta_add_str(secondary, "X"); + current += 2; + break; + } + + if (meta_is_str_at(original, current, "SC", "")) + { + /** Schlesinger's rule. **/ + if (meta_get_char_at(original, current + 2) == 'H') + { + /** Dutch origin, e.g. 'school', 'schooner' **/ + if (meta_is_str_at(original, (current + 3), "OO", "ER", "EN", "UY", "ED", "EM", "")) + { + /** 'schermerhorn', 'schenker' **/ + const bool x_sound = meta_is_str_at(original, (current + 3), "ER", "EN", ""); + meta_add_str(primary, (x_sound) ? "X" : "SK"); + meta_add_str(secondary, "SK"); + current += 3; + break; + } + else + { + const bool s_sound = ( + current == 0 + && !meta_is_vowel(original, 3) + && meta_get_char_at(original, 3) != 'W' + ); + meta_add_str(primary, "X"); + meta_add_str(secondary, (s_sound) ? "S" : "X"); + current += 3; + break; + } + } + + /** Default case. **/ + const char* sound = (meta_is_str_at(original, (current + 2), "E", "I", "Y", "")) ? "S" : "SK"; + meta_add_str(primary, sound); + meta_add_str(secondary, sound); + current += 3; + break; + } + + /** French e.g. 'resnais', 'artois' **/ + const bool no_primary = (current == last && meta_is_str_at(original, (current - 2), "AI", "OI", "")); + meta_add_str(primary, (no_primary) ? "" : "S"); + meta_add_str(secondary, "S"); + current += (meta_is_str_at(original, (current + 1), "S", "Z", "")) ? 
2 : 1; + break; + } + + case 'T': + { + if (meta_is_str_at(original, current, "TIA", "TCH", "TION", "")) + { + meta_add_str(primary, "X"); + meta_add_str(secondary, "X"); + current += 3; + break; + } + + if (meta_is_str_at(original, current, "TH", "TTH", "")) + { + /** Special case 'thomas', 'thames' or germanic. **/ + if ( + meta_is_str_at(original, (current + 2), "OM", "AM", "") + || meta_is_str_at(original, 0, "SCH", "VAN ", "VON ", "") + ) + meta_add_str(primary, "T"); + else + meta_add_str(primary, "0"); /* Yes, zero. */ + meta_add_str(secondary, "T"); + current += 2; + break; + } + + meta_add_str(primary, "T"); + meta_add_str(secondary, "T"); + current += (meta_is_str_at(original, (current + 1), "T", "D", "")) ? 2 : 1; + break; + } + + case 'V': + { + meta_add_str(primary, "F"); + meta_add_str(secondary, "F"); + current += (next_char == 'V') ? 2 : 1; + break; + } + + case 'W': + { + /** Can also be in middle of word. **/ + if (meta_is_str_at(original, current, "WR", "")) + { + meta_add_str(primary, "R"); + meta_add_str(secondary, "R"); + current += 2; + break; + } + + const bool next_is_vowel = meta_is_vowel(original, current + 1); + if (current == 0 && (next_is_vowel || meta_is_str_at(original, current, "WH", ""))) + { + /** Wasserman should match Vasserman. **/ + meta_add_str(primary, "A"); + meta_add_str(secondary, (next_is_vowel) ? "F" : "A"); + } + + /** Arnow should match Arnoff. **/ + if ((current == last && meta_is_vowel(original, current - 1)) + || meta_is_str_at(original, (current - 1), "EWSKI", "EWSKY", "OWSKI", "OWSKY", "") + || meta_is_str_at(original, 0, "SCH", "") + ) + { + meta_add_str(primary, ""); + meta_add_str(secondary, "F"); + current += 1; + break; + } + + /** Polish e.g. 'filipowicz' **/ + if (meta_is_str_at(original, current, "WICZ", "WITZ", "")) + { + meta_add_str(primary, "TS"); + meta_add_str(secondary, "FX"); + current += 4; + break; + } + + /** Else skip it. **/ + current += 1; + break; + } + + case 'X': + { + /** French e.g. 
breaux **/ + const bool silent = ( + current == last + && ( + meta_is_str_at(original, (current - 2), "AU", "OU", "") + || meta_is_str_at(original, (current - 3), "IAU", "EAU", "") + ) + ); + if (!silent) + { + meta_add_str(primary, "KS"); + meta_add_str(secondary, "KS"); + } + + current += (meta_is_str_at(original, (current + 1), "C", "X", "")) ? 2 : 1; + break; + } + + case 'Z': + { + /** Chinese pinyin e.g. 'zhao' **/ + if (next_char == 'H') + { + meta_add_str(primary, "J"); + meta_add_str(secondary, "J"); + current += 2; + break; + } + + const bool has_t_sound = ( + meta_is_str_at(original, (current + 1), "ZO", "ZI", "ZA", "") + || (is_slavo_germanic && current > 0 && meta_get_char_at(original, (current - 1)) != 'T') + ); + meta_add_str(primary, "S"); + meta_add_str(secondary, (has_t_sound) ? "TS" : "S"); + current += (next_char == 'Z') ? 2 : 1; + break; + } + + default: + current += 1; + } + } + + *primary_code = primary->str; + *secondary_code = secondary->str; + + meta_destroy_string(original); + meta_destroy_string(primary); + meta_destroy_string(secondary); + } + +#ifdef TESTING +/*** Built in test cases. + *** + *** These tests have been integrated into the Centrallix testing environment, + *** where they can be run using `export TONLY=exp_fn_double_metaphone_00`, + *** followed by make test, in the Centrallix directory. + *** + *** The can also be run here by executing the following commands in the + *** centrallix/expression directory, which aditionally generates a coverage + *** report. These tests cover all parts of the double metaphone algorithm, + *** although some of the error cases in various helper functions (such as + *** meta_destroy_string(null)) are not covered by testing. + *** + *** Commands: + *** gcc exp_double_metaphone.c -o exp_double_metaphone.o -I .. 
-DTESTING -fprofile-arcs -ftest-coverage -O0 + *** ./exp_double_metaphone.o + *** gcov exp_double_metaphone.c + ***/ + +unsigned int num_tests_passed = 0u, num_tests_failed = 0u; + +void test(const char* input, const char* expected_primary, const char* expected_secondary) { + char* codes[2]; + + /** Run DoubleMetaphone() and extract results. **/ + char* actual_primary; + char* actual_secondary; + meta_double_metaphone( + input, + memset(&actual_primary, 0, sizeof(actual_primary)), + memset(&actual_secondary, 0, sizeof(actual_secondary)) + ); + + /** Test for correct value. **/ + if (!strcmp(expected_primary, actual_primary) && + !strcmp(expected_secondary, actual_secondary)) + num_tests_passed++; + else + { + printf( + "\nTEST FAILED: \"%s\"\n" + "Expected: %s %s\n" + "Actual: %s %s\n", + input, + expected_primary, expected_secondary, + actual_primary, actual_secondary + ); + num_tests_failed++; + } + } + +// Special thanks to the following websites for double checking the correct results: +// 1: https://words.github.io/double-metaphone +// 2: https://mainegenealogy.net/metaphone_converter.asp +// 3: https://en.toolpage.org/tool/metaphone +void run_tests(void) { + printf("\nRunning tests...\n"); + + /** Test that always fails. **/ + // test("This", "test", "fails."); + + /** Invalid string tests, by Israel. **/ + fprintf(stderr, "There should be two warnings after this line.\n"); + test(NULL, "", ""); + test("", "", ""); + + /** Basic tests, by Israel. **/ + test("Test", "TST", "TST"); + test("Basic", "PSK", "PSK"); + test("Centrallix", "SNTRLKS", "SNTRLKS"); + test("Lawrence", "LRNS", "LRNS"); + test("Philips", "FLPS", "FLPS"); + test("Acceptingness", "AKSPTNNS", "AKSPTNKNS"); + test("Supercalifragilisticexpialidocious", "SPRKLFRJLSTSKSPLTSS", "SPRKLFRKLSTSKSPLTXS"); + test("Suoicodilaipxecitsiligarfilacrepus", "SKTLPKSSTSLKRFLKRPS", "SKTLPKSSTSLKRFLKRPS"); + + /** Match tests from code comments above. 
**/ + test("Smith", "SM0", "XMT"); + test("Schmidt", "XMT", "SMT"); + test("Snider", "SNTR", "XNTR"); + test("Schneider", "XNTR", "SNTR"); + test("Arnow", "ARN", "ARNF"); + test("Arnoff", "ARNF", "ARNF"); + + /** Tests from examples in code comments above. **/ + test("Accede", "AKST", "AKST"); + test("Accident", "AKSTNT", "AKSTNT"); + test("Actually", "AKTL", "AKTL"); + test("Arch", "ARX", "ARK"); + test("Artois", "ART", "ARTS"); + test("Bacchus", "PKS", "PKS"); + test("Bacci", "PX", "PX"); + test("Bajador", "PJTR", "PHTR"); + test("Bellocchio", "PLX", "PLX"); + test("Bertucci", "PRTX", "PRTX"); + test("Biaggi", "PJ", "PK"); + test("Bough", "P", "P"); + test("Breaux", "PR", "PR"); + test("Broughton", "PRTN", "PRTN"); + test("Cabrillo", "KPRL", "KPR"); + test("Caesar", "SSR", "SSR"); + test("Cagney", "KKN", "KKN"); + test("Campbell", "KMPL", "KMPL"); + test("Carlisle", "KRLL", "KRLL"); + test("Carlysle", "KRLL", "KRLL"); + test("Chemistry", "KMSTR", "KMSTR"); + test("Chianti", "KNT", "KNT"); + test("Chorus", "KRS", "KRS"); + test("Cough", "KF", "KF"); + test("Czerny", "SRN", "XRN"); + test("Dumb", "TM", "TM"); + test("Edgar", "ATKR", "ATKR"); + test("Edge", "AJ", "AJ"); + test("Filipowicz", "FLPTS", "FLPFX"); + test("Focaccia", "FKX", "FKX"); + test("Gallegos", "KLKS", "KKS"); + test("Germanic", "KRMNK", "JRMNK"); + test("Ghiradelli", "JRTL", "JRTL"); + test("Ghislane", "JLN", "JLN"); + test("Gospel", "KSPL", "KSPL"); + test("Gough", "KF", "KF"); + test("Greek", "KRK", "KRK"); + test("Hochmeier", "HKMR", "HKMR"); + test("Hugh", "H", "H"); + test("Island", "ALNT", "ALNT"); + test("Isle", "AL", "AL"); + test("Italian", "ATLN", "ATLN"); + test("Jankelowicz", "JNKLTS", "ANKLFX"); + test("Jose", "HS", "HS"); + test("Laugh", "LF", "LF"); + test("Mac Caffrey", "MKFR", "MKFR"); + test("Mac Gregor", "MKRKR", "MKRKR"); + test("Manager", "MNKR", "MNJR"); + test("McHugh", "MK", "MK"); + test("McLaughlin", "MKLFLN", "MKLFLN"); + test("Michael", "MKL", "MXL"); + test("Middle", 
"MTL", "MTL"); + test("Orchestra", "ARKSTR", "ARKSTR"); + test("Orchid", "ARKT", "ARKT"); + test("Pinyin", "PNN", "PNN"); + test("Raspberry", "RSPR", "RSPR"); + test("Resnais", "RSN", "RSNS"); + test("Rogier", "RJ", "RJR"); + test("Rough", "RF", "RF"); + test("Salvador", "SLFTR", "SLFTR"); + test("San jacinto", "SNHSNT", "SNHSNT"); + test("Schenker", "XNKR", "SKNKR"); + test("Schermerhorn", "XRMRRN", "SKRMRRN"); + test("Schlesinger", "XLSNKR", "SLSNJR"); + test("School", "SKL", "SKL"); + test("Schooner", "SKNR", "SKNR"); + test("Succeed", "SKST", "SKST"); + test("Sugar", "XKR", "SKR"); + test("Sugary", "XKR", "SKR"); + test("Tagliaro", "TKLR", "TLR"); + test("Thames", "TMS", "TMS"); + test("Thomas", "TMS", "TMS"); + test("Thumb", "0M", "TM"); + test("Tichner", "TXNR", "TKNR"); + test("Tough", "TF", "TF"); + test("Vghee", "FK", "FK"); + test("Wachtler", "AKTLR", "FKTLR"); + test("Wechsler", "AKSLR", "FKSLR"); + test("Word", "ART", "FRT"); + test("Xavier", "SF", "SFR"); + test("Yankelovich", "ANKLFX", "ANKLFK"); + test("Zhao", "J", "J"); + + /** Intereesting Edge Case: "McClellan" **/ + /*** Note: Sources (1) and (3) both include a double K ("MKKLLN"), but the + *** original code on GitHub and mainegenealogy.net do not. I chose "MKLLN" + *** to be correct because I personally do not pronounce the second c. + ***/ + test("McClellan", "MKLLN", "MKLLN"); + + /** Maurice Aubrey's Tests. 
**/ + /** Source: https://github.com/gitpan/Text-DoubleMetaphone/blob/master/t/words.txt **/ + test("maurice", "MRS", "MRS"); + test("aubrey", "APR", "APR"); + test("cambrillo", "KMPRL", "KMPR"); + test("heidi", "HT", "HT"); + test("katherine", "K0RN", "KTRN"); + test("catherine", "K0RN", "KTRN"); + test("richard", "RXRT", "RKRT"); + test("bob", "PP", "PP"); + test("eric", "ARK", "ARK"); + test("geoff", "JF", "KF"); + test("dave", "TF", "TF"); + test("ray", "R", "R"); + test("steven", "STFN", "STFN"); + test("bryce", "PRS", "PRS"); + test("randy", "RNT", "RNT"); + test("bryan", "PRN", "PRN"); + test("brian", "PRN", "PRN"); + test("otto", "AT", "AT"); + test("auto", "AT", "AT"); + + /** GPT-5 Coverage Tests. **/ + /*** GPT-5 mini (Preview) running in GitHub Copilot suggested the words + *** after analizing a generated coverage report, and I (Israel) used + *** them to write the tests below. I kept the AI's reasoning for tests, + *** while removing tests that did not contribute any coverage, but after + *** a few reprompts, the AI started just giving words without reasoning. + *** I guess we were both getting pretty tired. + ***/ + test("Abbott", "APT", "APT"); /* double-B ("BB") handling. */ + test("Back", "PK", "PK"); /* "CK"/"CG"/"CQ" branch. */ + test("Bacher", "PKR", "PKR"); /* matches "...BACHER" / ACH special-case. */ + test("Charles", "XRLS", "XRLS"); /* initial "CH" -> the branch that maps to "X"/"X" at start. */ + test("Ghana", "KN", "KN"); /* initial "GH" special-start handling. */ + test("Gnome", "NM", "NM"); /* "GN" sequence handling. */ + test("Raj", "RJ", "R"); /* J at end (exercise J-last behavior). */ + test("Quentin", "KNTN", "KNTN"); /* Q case (Q -> K mapping). */ + test("Who", "A", "A"); /* "WH" at start handling. */ + test("Shoemaker", "XMKR", "XMKR"); /* "SH" general mapping paths. */ + test("Sian", "SN", "XN"); /* "SIO"/"SIA"/"SIAN" branch. */ + test("Scold", "SKLT", "SKLT"); /* "SC" default / "SK" vs other SC subcases. 
*/ + test("Station", "STXN", "STXN"); /* "TION" -> X mapping. */ + test("Match", "MX", "MX"); /* "TCH"/"TIA" -> X mapping. */ + test("Pizza", "PS", "PTS"); /* double-Z ("ZZ") handling. */ + test("Agnes", "AKNS", "ANS"); /* "GN" at index 1 (GN handling that yields KN / N). */ + test("Science", "SNS", "SNS"); /* "SC" followed by I (SC + I/E/Y branch). */ + test("Van Gogh", "FNKK", "FNKK"); + test("Josef", "JSF", "HSF"); + test("Object", "APJKT", "APJKT"); + test("Sholz", "SLS", "SLS"); + test("Scharf", "XRF", "XRF"); + test("Kasia", "KS", "KS"); + test("Van Geller", "FNKLR", "FNKLR"); + + const unsigned int total_tests = num_tests_passed + num_tests_failed; + printf("\nTests completed!\n"); + printf(" > Failed: %u\n", num_tests_failed); + printf(" > Skipped: %u\n", 0u); /* Implementation removed. */ + printf(" > Passed: %u/%u\n", num_tests_passed, total_tests); +} + +int main(void) { + run_tests(); + return 0; +} + +/** Prevent scope leak. **/ +#undef META_FREE +#undef META_MALLOC +#undef META_REALLOC +#undef SAFE_MALLOC +#undef SAFE_REALLOC + +#endif diff --git a/centrallix/expression/exp_functions.c b/centrallix/expression/exp_functions.c index 6425114d..b2e3e84a 100644 --- a/centrallix/expression/exp_functions.c +++ b/centrallix/expression/exp_functions.c @@ -1,27 +1,3 @@ -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include "obj.h" -#include "cxlib/mtask.h" -#include "cxlib/xarray.h" -#include "cxlib/xhash.h" -#include "cxlib/mtlexer.h" -#include "expression.h" -#include "cxlib/mtsession.h" -#include "cxss/cxss.h" -#include -#include -#include -#include -#include - - /************************************************************************/ /* Centrallix Application Server System */ /* Centrallix Core */ @@ -65,6 +41,63 @@ /* that issue in exp_evaluate.c */ /************************************************************************/ +#define _GNU_SOURCE +#include +#include +#include +#include +#include 
+#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cxlib/clusters.h" +#include "cxlib/mtask.h" +#include "cxlib/mtlexer.h" +#include "cxlib/mtsession.h" +#include "cxlib/newmalloc.h" +#include "cxlib/util.h" +#include "cxlib/xarray.h" +#include "cxlib/xhash.h" +#include "cxss/cxss.h" +#include "expression.h" +#include "obj.h" + + +/** TODO: Greg - I think this should be moved to datatypes. **/ +/** Should maybe replace duplocate functionality elsewhere. **/ +static char* ci_TypeToStr(const int type) + { + switch (type) + { + case DATA_T_UNAVAILABLE: return "Unknown"; + case DATA_T_INTEGER: return "Integer"; + case DATA_T_STRING: return "String"; + case DATA_T_DOUBLE: return "Double"; + case DATA_T_DATETIME: return "DateTime"; + case DATA_T_INTVEC: return "IntVector"; + case DATA_T_STRINGVEC: return "StringVector"; + case DATA_T_MONEY: return "Money"; + case DATA_T_ARRAY: return "Array"; + case DATA_T_CODE: return "Code"; + case DATA_T_BINARY: return "Binary"; + } + + /** Invalid type. **/ + mssErrorf(1, "Cluster", "Invalid type %d.\n", type); + return "Invalid"; /* Shall not parse to a valid type in ci_TypeFromStr(). */ + } /****** Evaluator functions follow for expEvalFunction ******/ @@ -1111,7 +1144,7 @@ int exp_fn_reverse(pExpression tree, pParamObjects objlist, pExpression i0, pExp return 0; } - +/** Leading zero trim. */ int exp_fn_lztrim(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) { char* ptr; @@ -1220,6 +1253,31 @@ int exp_fn_rtrim(pExpression tree, pParamObjects objlist, pExpression i0, pExpre } +int exp_fn_trim(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) + { + int ret; + + /** Invoke left trim. 
**/ + ret = exp_fn_ltrim(tree, objlist, i0, i1, i2); + if (ret != 0) + { + mssErrorf(0, "EXP", "Failed to left trim (error code: %d).", ret); + return ret; + } + + /** Invoke right trim. **/ + ret = exp_fn_rtrim(tree, objlist, i0, i1, i2); + if (ret != 0) + { + mssErrorf(0, "EXP", "Failed to right trim (error code: %d).", ret); + return ret; + } + + /** Success. **/ + return 0; + } + + int exp_fn_right(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) { int n,i; @@ -2356,17 +2414,37 @@ int exp_fn_truncate(pExpression tree, pParamObjects objlist, pExpression i0, pEx /*** constrain(value, min, max) ***/ int exp_fn_constrain(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) { - if (!i0 || !i1 || !i2 || (i0->DataType != i1->DataType) || i0->DataType != i2->DataType || !(i0->DataType == DATA_T_INTEGER || i0->DataType == DATA_T_MONEY || i0->DataType == DATA_T_DOUBLE)) - { - mssError(1,"EXP","constrain() requires three numeric parameters of the same data type"); - return -1; - } + /** Skip null value. **/ tree->DataType = i0->DataType; if ((i0->Flags & EXPR_F_NULL)) { tree->Flags |= EXPR_F_NULL; return 0; } + + /** Verify parameters. 
**/ + if (i0 == NULL || i1 == NULL || i2 == NULL) + { + mssError(1, "EXP", "constrain() expects three parameters."); + return -1; + } + if (i0->DataType != DATA_T_INTEGER && i0->DataType != DATA_T_DOUBLE && i0->DataType != DATA_T_MONEY) + { + mssError(1, "EXP", + "constrain() expects three numeric parameters: %s is not numeric.", + ci_TypeToStr(i0->DataType) + ); + if (i0->DataType == DATA_T_STRING) printf("Value: '%s'\n", i0->String); + return -1; + } + if (i0->DataType != i1->DataType || i1->DataType != i2->DataType) + { + mssError(1, "EXP", + "constrain() expects three numeric parameters of the same data type but got types %s, %s, and %s.", + ci_TypeToStr(i0->DataType), ci_TypeToStr(i1->DataType), ci_TypeToStr(i2->DataType) + ); + return -1; + } /* check min */ if (!(i1->Flags & EXPR_F_NULL)) @@ -3269,6 +3347,101 @@ int exp_fn_log10(pExpression tree, pParamObjects objlist, pExpression i0, pExpre } +int exp_fn_log_natural(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) + { + double n; + + if (!i0) + { + mssError(1, "EXP", "ln() requires a number as its first parameter"); + goto error; + } + if (i0->Flags & EXPR_F_NULL) + { + tree->DataType = DATA_T_DOUBLE; + tree->Flags |= EXPR_F_NULL; + return 0; + } + switch(i0->DataType) + { + case DATA_T_INTEGER: + n = i0->Integer; + break; + case DATA_T_DOUBLE: + n = i0->Types.Double; + break; + case DATA_T_MONEY: + n = objDataToDouble(DATA_T_MONEY, &(i0->Types.Money)); + break; + default: + mssError(1, "EXP", "ln() requires a number as its first parameter"); + goto error; + } + if (n < 0) + { + mssError(1, "EXP", "ln(): cannot compute the logarithm of a negative number"); + goto error; + } + tree->DataType = DATA_T_DOUBLE; + tree->Types.Double = log(n); + return 0; + + error: + return -1; + } + + +int exp_fn_log_base_n(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) + { + double n, p; + + if (!i0 || !i1) + { + mssError(1, "EXP", "logn() 
requires numbers as its first and second parameters"); + goto error; + } + if ((i0->Flags & EXPR_F_NULL) || (i1->Flags & EXPR_F_NULL)) + { + tree->DataType = DATA_T_DOUBLE; + tree->Flags |= EXPR_F_NULL; + return 0; + } + switch(i0->DataType) + { + case DATA_T_INTEGER: + n = i0->Integer; + break; + case DATA_T_DOUBLE: + n = i0->Types.Double; + break; + case DATA_T_MONEY: + n = objDataToDouble(DATA_T_MONEY, &(i0->Types.Money)); + break; + default: + mssError(1, "EXP", "logn() requires a number as its first parameter"); + goto error; + } + switch(i1->DataType) + { + case DATA_T_INTEGER: + p = i1->Integer; + break; + case DATA_T_DOUBLE: + p = i1->Types.Double; + break; + default: + mssError(1, "EXP", "logn() requires an integer or double as its second parameter"); + goto error; + } + tree->DataType = DATA_T_DOUBLE; + tree->Types.Double = log(n) / log(p); + return 0; + + error: + return -1; + } + + int exp_fn_power(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) { double n, p; @@ -3978,368 +4151,232 @@ int exp_fn_nth(pExpression tree, pParamObjects objlist, pExpression i0, pExpress return 0; } -/* See centrallix-sysdoc/string_comparison.md for more information. */ -int exp_fn_levenshtein(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) +static int exp_fn_verify_schema( + const char* fn_name, + const int* param_types, + const int num_params, + pExpression tree, + pParamObjects obj_list) { - - if (!i0 || !i1) + /** Verify object list and session. **/ + if (obj_list == NULL) { - mssError(1,"EXP","levenshtein() requires two parameters"); - return -1; + mssErrorf(1, "EXP", "%s(\?\?\?) no object list?", fn_name); + return -1; } - - if ((i0->Flags & EXPR_F_NULL) || (i1->Flags & EXPR_F_NULL)) - { - tree->DataType = DATA_T_INTEGER; - tree->Flags |= EXPR_F_NULL; - return 0; + ASSERTMAGIC(obj_list->Session, MGK_OBJSESSION); + + /** Verify expression tree. 
**/ + ASSERTMAGIC(tree, MGK_EXPRESSION); + + /** Verify parameter number. **/ + const int num_params_actual = tree->Children.nItems; + if (num_params != num_params_actual) + { + mssErrorf(1, "EXP", + "%s(?) expects %u param%s, got %d param%s.", + fn_name, num_params, (num_params > 1) ? "s" : "", num_params_actual, (num_params_actual > 1) ? "s" : "" + ); + return -1; } - - if ((i0->DataType != DATA_T_STRING) || (i1->DataType != DATA_T_STRING)) + + /** Verify parameter datatypes. **/ + for (int i = 0; i < num_params; i++) { - mssError(1,"EXP","levenshtein() requires two string parameters"); - return -1; + const pExpression arg = tree->Children.Items[i]; + ASSERTMAGIC(arg, MGK_EXPRESSION); + + /** Skip null values. **/ + if (arg->Flags & EXPR_F_NULL) continue; + + /** Extract datatypes. **/ + const int expected_datatype = param_types[i]; + const int actual_datatype = arg->DataType; + + /** Verify datatypes. **/ + if (expected_datatype != actual_datatype) + { + mssErrorf(1, "EXP", + "%s(...) 
param #%d/%d expects type %s (%d) but got type %s (%d).", + fn_name, i + 1, num_params, ci_TypeToStr(expected_datatype), expected_datatype, ci_TypeToStr(actual_datatype), actual_datatype + ); + return -1; + } } - - // for all i and j, d[i,j] will hold the Levenshtein distance between - // the first i characters of s and the first j characters of t - int length1 = strlen(i0->String); - int length2 = strlen(i1->String); - //int levMatrix[length1+1][length2+1]; - int (*levMatrix)[length1+1][length2+1] = nmSysMalloc(sizeof(*levMatrix)); - int i; - int j; - //set each element in d to zero - for (i = 0; i < length1; i++) - { - for (j = 0; j < length2; j++) - { - (*levMatrix)[i][j] = 0; - } - } - // source prefixes can be transformed into empty string by - // dropping all characters - for (i = 0; i <= length1; i++) - { - (*levMatrix)[i][0] = i; - } - - // target prefixes can be reached from empty source prefix - // by inserting every character - for (j = 0; j <= length2; j++) - { - (*levMatrix)[0][j] = j; - } - - for (i = 1; i <= length1; i++) - { - for (j = 1; j <= length2; j++) - { - if (i0->String[i-1] == i1->String[j-1]) - { - (*levMatrix)[i][j] = (*levMatrix)[i-1][j-1]; - } - else - { - int value1 = (*levMatrix)[i - 1][j] + 1; - int value2 = (*levMatrix)[i][j-1] + 1; - int value3 = (*levMatrix)[i-1][j-1] + 1; - (*levMatrix)[i][j] = (value1 < value2) ? - ((value1 < value3) ? value1 : value3) : - (value2 < value3) ? value2 : value3; - } - } - } - tree->DataType = DATA_T_INTEGER; - tree->Integer = (*levMatrix)[length1][length2]; - nmSysFree(levMatrix); + /** Pass. **/ return 0; } -/* See centrallix-sysdoc/string_comparison.md for more information. 
*/ -int exp_fn_lev_compare(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) - { - - if (!i0 || !i1) - { - mssError(1,"EXP","lev_compare() requires two or three parameters"); - return -1; - } - if ((i0->Flags & EXPR_F_NULL) || (i1->Flags & EXPR_F_NULL) || (i2 && (i2->Flags & EXPR_F_NULL))) +int exp_fn_metaphone(pExpression tree, pParamObjects obj_list) + { + const char fn_name[] = "metaphone"; + + /** Verify function schema. **/ + if (exp_fn_verify_schema(fn_name, (int[]){ DATA_T_STRING }, 1, tree, obj_list) != 0) { - tree->DataType = DATA_T_DOUBLE; - tree->Flags |= EXPR_F_NULL; - return 0; + mssErrorf(0, "EXP", "%s(?) Call does not match function schema.", fn_name); + return -1; } - - if ((i0->DataType != DATA_T_STRING) || (i1->DataType != DATA_T_STRING) || (i2 && i2->DataType != DATA_T_INTEGER)) + + /** Extract string param. **/ + pExpression maybe_str = check_ptr(tree->Children.Items[0]); + if (maybe_str->Flags & EXPR_F_NULL) { - mssError(1,"EXP","lev_compare() requires two string and one optional integer parameters"); - return -1; + tree->Flags |= EXPR_F_NULL; + tree->DataType = DATA_T_STRING; + return 0; } - - exp_fn_levenshtein(tree, objlist, i0, i1, i2); - //!!! I am not checking for errors here, because IN THEORY we have two strings... if we don't, big uh-oh. - int lev_dist = tree->Integer; - - int length1 = strlen(i0->String); - int length2 = strlen(i1->String); - - double clamped_dist = 1.0; - - if (length1 == 0 || length2 == 0) //empty string + const char* str = check_ptr(maybe_str->String); + const size_t str_len = strlen(str); + if (str_len == 0u) { - clamped_dist = 0.5; - } - else //normal case - { - int max_len = (length1 > length2) ? length1 : length2; - clamped_dist = ((double) lev_dist) / max_len; - - if (abs(length1-length2) == lev_dist) //only inserts. Maybe substring. - { - clamped_dist /= 2; - } - - //use max_field_width if it was provided as a sensible value. If not, don't use it. 
- double max_field_width = i2?(i2->Integer):0; - if (max_field_width && max_field_width >= max_len) { - double mod = (lev_dist + max_field_width * 3/4) / max_field_width; - if (mod < 1) { //don't make clamped_dist bigger - clamped_dist *= mod; - } - } - } - - - tree->DataType = DATA_T_DOUBLE; - tree->Types.Double = 1.0 - clamped_dist; + tree->String = ""; + tree->DataType = DATA_T_STRING; return 0; -} - -// This is the size of the vector table. It is also used in calculating the table indices. -const int EXP_VECTOR_TABLE_SIZE = 251; - -/* - * hash_char_pair - * This method creates an vector table index based a given character pair. The characters are represented - * as their ASCII code points. - * - * Parameters: - * num1 : first ASCII code point (double) - * num2 : second ASCII code point (double) - * - * Returns: - * vector table index (integer) - */ -int exp_fn_i_hash_char_pair(double num1, double num2) - { - int func_result = round(((num1 * num1 * num1) + (num2 * num2 * num2)) * ((num1+1)/(num2+1))) -1; - return func_result % EXP_VECTOR_TABLE_SIZE; + } + + /** Compute DoubleMetaphone. **/ + char* primary = NULL; + char* secondary = NULL; + meta_double_metaphone(str, &primary, &secondary); + + /** Process result. **/ + const size_t result_length = strlen(primary) + 1u + strlen(secondary) + 1u; + char* result = check_ptr(nmSysMalloc(result_length * sizeof(char*))); + if (result == NULL) return -1; + sprintf(result, "%s%c%s", primary, CA_BOUNDARY_CHAR, secondary); + + /** Return the result. **/ + tree->String = result; + tree->DataType = DATA_T_STRING; + return 0; } -/* - * exp_fn_i_frequency_table - * This method creates a vector frequency table based on a string of characters. 
- * - * Parameters: - * table : integer pointer to vector frequency table (unsigned short) - * term : the string of characters (char*) - * - * Returns: - * 0 - */ -int exp_fn_i_frequency_table(unsigned short *table, char *term) +/*** Computes cosine or Levenshtein similarity between two strings. These two + *** tasks have a large amount of overlapping logic (mostly error checking), + *** so doing them with one function greatly reduces code duplocation. + *** + *** @param tree The tree resulting from this function. + *** @param obj_list The evaluation "scope", including available variables. + *** @param fn_name Either `cos_compare()` or `lev_compare()`. + *** @returns 0 for success, -1 for failure. + ***/ +static int exp_fn_compare(pExpression tree, pParamObjects obj_list, const char* fn_name) { - int i; - // Initialize hash table with 0 values - for (i = 0; i < EXP_VECTOR_TABLE_SIZE; i++) + /** Verify function schema. **/ + if (exp_fn_verify_schema(fn_name, (int[]){ DATA_T_STRING, DATA_T_STRING }, 2, tree, obj_list) != 0) { - table[i] = 0; + mssErrorf(0, "EXP", "%s(?) Call does not match function schema.", fn_name); + return -1; } - - int j = -1; - for(i = 0; i < strlen(term) + 1; i++) + + /** Extract strings. 
**/ + pExpression maybe_str1 = check_ptr(tree->Children.Items[0]); + pExpression maybe_str2 = check_ptr(tree->Children.Items[1]); + if (maybe_str1->Flags & EXPR_F_NULL || maybe_str2->Flags & EXPR_F_NULL) { - // If latter character is punctuation or whitespace, skip it - if (ispunct(term[i]) || isspace(term[i])) - { - continue; - } - - double temp1 = 0.0; - double temp2 = 0.0; - - // If previous character is null - if (j == -1) - { - temp1 = 96; - } - - // Else character is not null - else - { - temp1 = (int)tolower(term[j]); - } - - // If latter character is null - if (i == strlen(term)) + tree->Flags |= EXPR_F_NULL; + tree->DataType = DATA_T_DOUBLE; + return 0; + } + char* str1 = check_ptr(maybe_str1->String); + char* str2 = check_ptr(maybe_str2->String); + + /** Handle either cos_compare() or lev_compare(). **/ + if (fn_name[0] == 'c') + { /* cos_compare() */ + int ret; + + /** Build vectors. **/ + const pVector v1 = check_ptr(ca_build_vector(str1)); + const pVector v2 = check_ptr(ca_build_vector(str2)); + if (v1 == NULL || v2 == NULL) { - temp2 = 96; + mssErrorf(1, "EXP", + "%s(\"%s\", \"%s\") - Failed to build vectors.", + fn_name, str1, str2 + ); + ret = -1; } - - // Else character is not null else { - temp2 = (int)tolower(term[i]); - } - - // Else character is not null // If either character is a number, reassign the code point - if (temp1 >= 48 && temp1 <= 57) - { - temp1 += 75; + /** Compute the similarity. **/ + tree->Types.Double = ca_cos_compare(v1, v2); + tree->DataType = DATA_T_DOUBLE; + ret = 0; } - - if (temp2 >= 48 && temp2 <= 57) + + /** Clean up. 
**/ + if (v1 != NULL) ca_free_vector(v1); + if (v2 != NULL) ca_free_vector(v2); + return ret; + } + else + { /* lev_compare() */ + double lev_sim = check_double(ca_lev_compare(str1, str2)); + if (isnan(lev_sim)) { - temp2 += 75; + mssErrorf(1, "EXP", "%s(\"%s\", \"%s\") Failed to compute levenstein edit distance.", fn_name, str1, str2); + return -1; } - - // Hash the character pair into an index - int index = exp_fn_i_hash_char_pair(temp1, temp2); - - // Increment Frequency Table value by number from 0 to 13 - table[index] += ((unsigned short)temp1 + (unsigned short)temp2) % 13 + 1; - - // Move j up to latter character before incrementing i - j = i; - + + /** Return the computed result. **/ + tree->Types.Double = lev_sim; + tree->DataType = DATA_T_DOUBLE; + return 0; } - - return 0; - + return -1; } -/* - * exp_fn_i_dot_product - * This method calculautes the dot product of two vectors. - * - * Parameters: - * dot_product : the place where the result is stored (double) - * r_freq_table1 : the first vector (unsigned short) - * r_freq_table2 : the second vector (unsigned short) - * - * Returns: - * 0 - */ -int exp_fn_i_dot_product(double *dot_product, unsigned short *r_freq_table1, unsigned short *r_freq_table2) + +int exp_fn_cos_compare(pExpression tree, pParamObjects obj_list) { - int i; - for (i = 0; i < EXP_VECTOR_TABLE_SIZE; i++) - { - *dot_product = *dot_product + ((double)r_freq_table1[i] * (double)r_freq_table2[i]); - } - return 0; + return exp_fn_compare(tree, obj_list, "cos_compare"); } - -/* - * exp_fn_i_magnitude - * This method calculates the magnitude of a vector - * - * Parameters: - * magnitude : the place where the result is stored (double) - * r_freq_table : the vector (unsigned short) - */ -int exp_fn_i_magnitude(double *magnitude, unsigned short *r_freq_table) +int exp_fn_lev_compare(pExpression tree, pParamObjects obj_list) { - int i; - for (i = 0; i < EXP_VECTOR_TABLE_SIZE; i++) - { - *magnitude = *magnitude + ((double)r_freq_table[i] * (double)r_freq_table[i]); - }
- *magnitude = sqrt(*magnitude); - return 0; + return exp_fn_compare(tree, obj_list, "lev_compare"); } -/* - * exp_fn_cos_compare - * This method calculates the cosine similarity of two vector frequency tables - * See centrallix-sysdoc/string_comparison.md for more information. - * - * Parameters: - * tree : structure where output is stored - * objlist: - * i0 : first data entry (pExpression) - * i1 : second data entry (pExpression) - * i2 : - * - * Returns: - * 0 - */ -int exp_fn_cos_compare(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) + +int exp_fn_levenshtein(pExpression tree, pParamObjects obj_list) { - // Ensure function receives two non-null parameters - if (!i0 || !i1) + const char fn_name[] = "levenshtein"; + + /** Verify function schema. **/ + if (exp_fn_verify_schema(fn_name, (int[]){ DATA_T_STRING, DATA_T_STRING }, 2, tree, obj_list) != 0) { - mssError(1,"EXP","cos_compare() requires two parameter."); + mssErrorf(0, "EXP", "%s(?) Call does not match function schema.", fn_name); return -1; } - - // Ensure value passed in both parameters is not null - if ((i0->Flags & EXPR_F_NULL) || (i1->Flags & EXPR_F_NULL)) + + /** Extract strings. 
**/ + pExpression maybe_str1 = check_ptr(tree->Children.Items[0]); + pExpression maybe_str2 = check_ptr(tree->Children.Items[1]); + if (maybe_str1->Flags & EXPR_F_NULL || maybe_str2->Flags & EXPR_F_NULL) { - tree->DataType = DATA_T_DOUBLE; tree->Flags |= EXPR_F_NULL; + tree->DataType = DATA_T_INTEGER; return 0; } - - // Ensure both parameters contain string values - if ((i0->DataType != DATA_T_STRING) || (i1->DataType != DATA_T_STRING)) - { - mssError(1,"EXP","cos_compare() requires two string parameters."); - return -1; - } - - //If the two strings are identical, don't bother running cosine compare - if (strcmp(i0->String, i1->String) == 0) - { - tree->DataType = DATA_T_DOUBLE; - tree->Types.Double = 1.0; - return 0; - } - - // Allocate frequency tables (arrays of integers) for each term - unsigned short *table1 = nmMalloc(EXP_VECTOR_TABLE_SIZE * sizeof(unsigned short)); - unsigned short *table2 = nmMalloc(EXP_VECTOR_TABLE_SIZE * sizeof(unsigned short)); - - if (table1 == NULL || table2 == NULL) + char* str1 = check_ptr(maybe_str1->String); + char* str2 = check_ptr(maybe_str2->String); + + /** Compute edit distance. **/ + /** Length 0 is provided for both strings so that the function will compute it for us. 
**/ + int edit_dist = ca_edit_dist(str1, str2, 0lu, 0lu); + if (!check_neg(edit_dist)) { - mssError(1,"EXP","Memory allocation failed."); + mssErrorf(1, "EXP", "%s(\"%s\", \"%s\") Failed to compute edit distance.\n", fn_name, str1, str2); return -1; } - - // Calculate frequency tables for each term - exp_fn_i_frequency_table(table1, i0->String); - exp_fn_i_frequency_table(table2, i1->String); - - // Calculate dot product - double dot_product = 0; - exp_fn_i_dot_product(&dot_product, table1, table2); - - // Calculate magnitudes of each relative frequency vector - double magnitude1 = 0; - double magnitude2 = 0; - exp_fn_i_magnitude(&magnitude1, table1); - exp_fn_i_magnitude(&magnitude2, table2); - tree->DataType = DATA_T_DOUBLE; - tree->Types.Double = dot_product / (magnitude1 * magnitude2); - nmFree(table1, EXP_VECTOR_TABLE_SIZE * sizeof(unsigned short)); - nmFree(table2, EXP_VECTOR_TABLE_SIZE * sizeof(unsigned short)); - + /** Return the computed distance. **/ + tree->Integer = edit_dist; + tree->DataType = DATA_T_INTEGER; return 0; } @@ -4351,7 +4388,7 @@ int exp_fn_cos_compare(pExpression tree, pParamObjects objlist, pExpression i0, * Parameters: * pExpression tree: * pParamObjects: - * pExpression passowrd: The password, passed as a pExpression + * pExpression password: The password, passed as a pExpression * pExpression salt: The salt, passed as a pExpression * * returns: @@ -4466,25 +4503,42 @@ int exp_fn_argon2id(pExpression tree, pParamObjects objlist, pExpression passwor int exp_internal_DefineFunctions() { - - /** Function list for EXPR_N_FUNCTION nodes **/ - xhAdd(&EXP.Functions, "getdate", (char*)exp_fn_getdate); + /** Initialize library. **/ + ca_init(); + + /** Function list for EXPR_N_FUNCTION nodes. **/ + + /** General. 
**/ xhAdd(&EXP.Functions, "user_name", (char*)exp_fn_user_name); xhAdd(&EXP.Functions, "convert", (char*)exp_fn_convert); xhAdd(&EXP.Functions, "wordify", (char*)exp_fn_wordify); xhAdd(&EXP.Functions, "abs", (char*)exp_fn_abs); xhAdd(&EXP.Functions, "ascii", (char*)exp_fn_ascii); xhAdd(&EXP.Functions, "condition", (char*)exp_fn_condition); - xhAdd(&EXP.Functions, "charindex", (char*)exp_fn_charindex); - xhAdd(&EXP.Functions, "upper", (char*)exp_fn_upper); - xhAdd(&EXP.Functions, "lower", (char*)exp_fn_lower); - xhAdd(&EXP.Functions, "mixed", (char*)exp_fn_mixed); - xhAdd(&EXP.Functions, "char_length", (char*)exp_fn_char_length); - xhAdd(&EXP.Functions, "datepart", (char*)exp_fn_datepart); xhAdd(&EXP.Functions, "isnull", (char*)exp_fn_isnull); + xhAdd(&EXP.Functions, "eval", (char*)exp_fn_eval); + xhAdd(&EXP.Functions, "truncate", (char*)exp_fn_truncate); + xhAdd(&EXP.Functions, "constrain", (char*)exp_fn_constrain); + xhAdd(&EXP.Functions, "has_endorsement", (char*)exp_fn_has_endorsement); + xhAdd(&EXP.Functions, "rand", (char*)exp_fn_rand); + xhAdd(&EXP.Functions, "nullif", (char*)exp_fn_nullif); + xhAdd(&EXP.Functions, "hash", (char*)exp_fn_hash); + xhAdd(&EXP.Functions, "hmac", (char*)exp_fn_hmac); + xhAdd(&EXP.Functions, "pbkdf2", (char*)exp_fn_pbkdf2); + xhAdd(&EXP.Functions, "octet_length", (char*)exp_fn_octet_length); + xhAdd(&EXP.Functions, "argon2id",(char*)exp_fn_argon2id); + + /** Dates. **/ + xhAdd(&EXP.Functions, "getdate", (char*)exp_fn_getdate); + xhAdd(&EXP.Functions, "datepart", (char*)exp_fn_datepart); + xhAdd(&EXP.Functions, "dateadd", (char*)exp_fn_dateadd); + xhAdd(&EXP.Functions, "datediff", (char*)exp_fn_datediff); + + /** Strings. 
**/ xhAdd(&EXP.Functions, "ltrim", (char*)exp_fn_ltrim); xhAdd(&EXP.Functions, "lztrim", (char*)exp_fn_lztrim); xhAdd(&EXP.Functions, "rtrim", (char*)exp_fn_rtrim); + xhAdd(&EXP.Functions, "trim", (char*)exp_fn_trim); xhAdd(&EXP.Functions, "substring", (char*)exp_fn_substring); xhAdd(&EXP.Functions, "right", (char*)exp_fn_right); xhAdd(&EXP.Functions, "ralign", (char*)exp_fn_ralign); @@ -4494,12 +4548,22 @@ int exp_internal_DefineFunctions() xhAdd(&EXP.Functions, "escape", (char*)exp_fn_escape); xhAdd(&EXP.Functions, "quote", (char*)exp_fn_quote); xhAdd(&EXP.Functions, "substitute", (char*)exp_fn_substitute); - xhAdd(&EXP.Functions, "eval", (char*)exp_fn_eval); + xhAdd(&EXP.Functions, "upper", (char*)exp_fn_upper); + xhAdd(&EXP.Functions, "lower", (char*)exp_fn_lower); + xhAdd(&EXP.Functions, "mixed", (char*)exp_fn_mixed); + xhAdd(&EXP.Functions, "char_length", (char*)exp_fn_char_length); + xhAdd(&EXP.Functions, "charindex", (char*)exp_fn_charindex); + xhAdd(&EXP.Functions, "dateformat", (char*)exp_fn_dateformat); + xhAdd(&EXP.Functions, "moneyformat", (char*)exp_fn_moneyformat); + + /** Numbering systems (e.g. base 16 aka. hex, base 8 aka. octal, etc.). **/ + xhAdd(&EXP.Functions, "to_base64", (char*)exp_fn_to_base64); + xhAdd(&EXP.Functions, "from_base64", (char*)exp_fn_from_base64); + xhAdd(&EXP.Functions, "to_hex", (char*)exp_fn_to_hex); + xhAdd(&EXP.Functions, "from_hex", (char*)exp_fn_from_hex); + + /** Math. 
**/ xhAdd(&EXP.Functions, "round", (char*)exp_fn_round); - xhAdd(&EXP.Functions, "dateadd", (char*)exp_fn_dateadd); - xhAdd(&EXP.Functions, "datediff", (char*)exp_fn_datediff); - xhAdd(&EXP.Functions, "truncate", (char*)exp_fn_truncate); - xhAdd(&EXP.Functions, "constrain", (char*)exp_fn_constrain); xhAdd(&EXP.Functions, "sin", (char*)exp_fn_sin); xhAdd(&EXP.Functions, "cos", (char*)exp_fn_cos); xhAdd(&EXP.Functions, "tan", (char*)exp_fn_tan); @@ -4511,32 +4575,23 @@ int exp_internal_DefineFunctions() xhAdd(&EXP.Functions, "square", (char*)exp_fn_square); xhAdd(&EXP.Functions, "degrees", (char*)exp_fn_degrees); xhAdd(&EXP.Functions, "radians", (char*)exp_fn_radians); - xhAdd(&EXP.Functions, "has_endorsement", (char*)exp_fn_has_endorsement); - xhAdd(&EXP.Functions, "rand", (char*)exp_fn_rand); - xhAdd(&EXP.Functions, "nullif", (char*)exp_fn_nullif); - xhAdd(&EXP.Functions, "dateformat", (char*)exp_fn_dateformat); - xhAdd(&EXP.Functions, "moneyformat", (char*)exp_fn_moneyformat); - xhAdd(&EXP.Functions, "hash", (char*)exp_fn_hash); - xhAdd(&EXP.Functions, "hmac", (char*)exp_fn_hmac); xhAdd(&EXP.Functions, "log10", (char*)exp_fn_log10); + xhAdd(&EXP.Functions, "ln", (char*)exp_fn_log_natural); + xhAdd(&EXP.Functions, "logn", (char*)exp_fn_log_base_n); xhAdd(&EXP.Functions, "power", (char*)exp_fn_power); - xhAdd(&EXP.Functions, "pbkdf2", (char*)exp_fn_pbkdf2); - xhAdd(&EXP.Functions, "levenshtein", (char*)exp_fn_levenshtein); - xhAdd(&EXP.Functions, "lev_compare", (char*)exp_fn_lev_compare); + + /** Duplicate detection. 
**/ + xhAdd(&EXP.Functions, "metaphone", (char*)exp_fn_metaphone); xhAdd(&EXP.Functions, "cos_compare", (char*)exp_fn_cos_compare); - xhAdd(&EXP.Functions, "to_base64", (char*)exp_fn_to_base64); - xhAdd(&EXP.Functions, "from_base64", (char*)exp_fn_from_base64); - xhAdd(&EXP.Functions, "to_hex", (char*)exp_fn_to_hex); - xhAdd(&EXP.Functions, "from_hex", (char*)exp_fn_from_hex); - xhAdd(&EXP.Functions, "octet_length", (char*)exp_fn_octet_length); - xhAdd(&EXP.Functions, "argon2id",(char*)exp_fn_argon2id); - - /** Windowing **/ + xhAdd(&EXP.Functions, "lev_compare", (char*)exp_fn_lev_compare); + xhAdd(&EXP.Functions, "levenshtein", (char*)exp_fn_levenshtein); + + /** Windowing. **/ xhAdd(&EXP.Functions, "row_number", (char*)exp_fn_row_number); xhAdd(&EXP.Functions, "dense_rank", (char*)exp_fn_dense_rank); xhAdd(&EXP.Functions, "lag", (char*)exp_fn_lag); - - /** Aggregate **/ + + /** Aggregate. **/ xhAdd(&EXP.Functions, "count", (char*)exp_fn_count); xhAdd(&EXP.Functions, "avg", (char*)exp_fn_avg); xhAdd(&EXP.Functions, "sum", (char*)exp_fn_sum); @@ -4545,9 +4600,10 @@ int exp_internal_DefineFunctions() xhAdd(&EXP.Functions, "first", (char*)exp_fn_first); xhAdd(&EXP.Functions, "last", (char*)exp_fn_last); xhAdd(&EXP.Functions, "nth", (char*)exp_fn_nth); - - /** Reverse functions **/ + + + /** Reverse functions. 
**/ xhAdd(&EXP.ReverseFunctions, "isnull", (char*)exp_fn_reverse_isnull); - + return 0; } diff --git a/centrallix/include/cxss/policy.h b/centrallix/include/cxss/policy.h index aeee11ce..6f9ca7d8 100644 --- a/centrallix/include/cxss/policy.h +++ b/centrallix/include/cxss/policy.h @@ -2,6 +2,7 @@ #define _CXSS_POLICY_H #include "cxss/cxss.h" +#include "obj.h" /************************************************************************/ /* Centrallix Application Server System */ @@ -89,4 +90,3 @@ typedef struct _CXSSPOL CxssPolicy, *pCxssPolicy; #endif /* defined _CXSS_POLICY_H */ - diff --git a/centrallix/include/expression.h b/centrallix/include/expression.h index 8d506f72..3b334606 100644 --- a/centrallix/include/expression.h +++ b/centrallix/include/expression.h @@ -307,6 +307,7 @@ int exp_internal_SetupControl(pExpression exp); pExpControl exp_internal_LinkControl(pExpControl ctl); int exp_internal_UnlinkControl(pExpControl ctl); +void meta_double_metaphone(const char* str, char** primary_code, char** secondary_code); /*** Evaluator functions ***/ int expEvalIsNull(pExpression tree, pParamObjects objlist); diff --git a/centrallix/include/stparse.h b/centrallix/include/stparse.h index b19a77cc..2f7e4163 100644 --- a/centrallix/include/stparse.h +++ b/centrallix/include/stparse.h @@ -46,7 +46,7 @@ typedef struct _SI int Magic; int LinkCnt; char* Name; /* name of attrib or group */ - char* UsrType; /* type of group, null if attrib */ + char* UsrType; /* type of group (e.g. 
"system/object"), null if attrib */ pExpression Value; /* value; EXPR_N_LIST if several listed */ struct _SI* Parent; /* Parent inf, null if toplevel */ struct _SI** SubInf; /* List of attrs/groups included */ diff --git a/centrallix/multiquery/multiquery.c b/centrallix/multiquery/multiquery.c index 89736275..069186e8 100644 --- a/centrallix/multiquery/multiquery.c +++ b/centrallix/multiquery/multiquery.c @@ -2086,6 +2086,7 @@ mq_internal_SyntaxParse(pLxSession lxs, pQueryStatement stmt, int allow_empty, p mssError(1,"MQ","Expected equals after EXEC parameter"); mlxNoteError(lxs); xsFree(xs); + xs = NULL; break; } @@ -2098,6 +2099,7 @@ mq_internal_SyntaxParse(pLxSession lxs, pQueryStatement stmt, int allow_empty, p mssError(1,"MQ","Error in EXEC parameter"); mlxNoteError(lxs); xsFree(xs); + xs = NULL; xsFree(param); break; } @@ -2108,6 +2110,7 @@ mq_internal_SyntaxParse(pLxSession lxs, pQueryStatement stmt, int allow_empty, p mssError(1,"MQ","Could not evaluate EXEC parameter"); mlxNoteError(lxs); xsFree(xs); + xs = NULL; xsFree(param); break; } @@ -2120,7 +2123,8 @@ mq_internal_SyntaxParse(pLxSession lxs, pQueryStatement stmt, int allow_empty, p } } - strtcpy(new_qs->Source, xs->String, sizeof(new_qs->Source)); + if (xs != NULL) + strtcpy(new_qs->Source, xs->String, sizeof(new_qs->Source)); next_state = LookForClause; } else @@ -4774,5 +4778,3 @@ mqInitialize() return 0; } - - diff --git a/centrallix/objectsystem/obj_query.c b/centrallix/objectsystem/obj_query.c index 9b64f241..c4dff40a 100644 --- a/centrallix/objectsystem/obj_query.c +++ b/centrallix/objectsystem/obj_query.c @@ -414,7 +414,6 @@ objQueryFetch(pObjQuery this, int mode) { pObject obj = NULL; void* obj_data; - char* name; char buf[OBJSYS_MAX_PATH + 32]; pObjQuerySortItem sort_item; int rval; @@ -529,14 +528,6 @@ objQueryFetch(pObjQuery this, int mode) goto error; } obj->Data = obj_data; - - this->Obj->Driver->GetAttrValue(obj_data, "name", DATA_T_STRING, &name, NULL); - if (strlen(name) + 
strlen(this->Obj->Pathname->Pathbuf) + 2 > OBJSYS_MAX_PATH) - { - mssError(1,"OSML","Filename in query result exceeded internal limits"); - OSMLDEBUG(OBJ_DEBUG_F_APITRACE, " null\n"); - goto error; - } /** If we need to check it, do so now. **/ if (!(this->Flags & OBJ_QY_F_FULLQUERY) && this->Tree) @@ -778,4 +769,3 @@ objGetQueryIdentityPath(pObjQuery this, char* pathbuf, int maxlen) return 0; } - diff --git a/centrallix/osdrivers/objdrv_cluster.c b/centrallix/osdrivers/objdrv_cluster.c new file mode 100644 index 00000000..a41ffbd2 --- /dev/null +++ b/centrallix/osdrivers/objdrv_cluster.c @@ -0,0 +1,4504 @@ +/************************************************************************/ +/* Centrallix Application Server System */ +/* Centrallix Core */ +/* */ +/* Copyright (C) 1998-2012 LightSys Technology Services, Inc. */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA */ +/* 02111-1307 USA */ +/* */ +/* A copy of the GNU General Public License has been included in this */ +/* distribution in the file "COPYING". */ +/* */ +/* Module: objdrv_cluster.c */ +/* Author: Israel Fuller */ +/* Creation: September 17, 2025 */ +/* Description: Cluster object driver. 
*/ +/************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cxlib/clusters.h" +#include "cxlib/mtsession.h" +#include "cxlib/newmalloc.h" +#include "cxlib/util.h" +#include "cxlib/xarray.h" +#include "cxlib/xhash.h" +#include "expression.h" +#include "obj.h" +#include "param.h" +#include "st_node.h" +#include "stparse.h" + +/*** File notes: + *** This file uses comment anchors, provided by the Comment Anchors VSCode + *** extension from Starlane Studios. This allows developers with the extension + *** to control click the "LINK " comments to navigate to the coresponding + *** "ANCHOR[id=]" comment. (Note: Invalid or broken links will default to + *** the first line of the file.) + *** + *** For example, this link should take you to the function signatures: + *** LINK #functions + *** + *** Any developers without this extension can safely ignore these comments, + *** although please try not to break them. :) + *** + *** Comment Anchors VSCode Extension: + *** https://marketplace.visualstudio.com/items?itemName=ExodiusStudios.comment-anchors + ***/ + +/** Defaults for unspecified optional attributes. **/ +#define DEFAULT_MIN_IMPROVEMENT 0.0001 +#define DEFAULT_MAX_ITERATIONS 64u + +/** ================ Stuff That Should Be Somewhere Else ================ **/ +/** ANCHOR[id=temp] **/ + +/** TODO: Greg - I think this should be moved to mtsession. **/ +/*** I caused at least 10 bugs so far trying to pass format specifiers to + *** mssError without realizing that it didn't support them. Eventually, I + *** got fed up enough with the whole thing to write the following function. + ***/ +/*** Displays error text to the user. Does not print a stack trace. 
Does not + *** exit the program, allowing for the calling function to fail, generating + *** an error cascade which may be useful to the user since a stack trace is + *** not readily available. + *** + *** @param clr Whether to clear the current error stack. As a rule of thumb, + *** if you are the first one to detect the error, clear the stack so that + *** other unrelated messages are not shown. If you are detecting an error + *** from another function that may also call an mssError() function, do + *** not clear the stack. + *** @param module The name or abbreviation of the module in which this + *** function is being called, to help developers narrow down the location + *** of the error. + *** @param format The format text for the error, which accepts any format + *** specifier that would be accepted by printf(). + *** @param ... Variables matching format specifiers in the format. + *** @returns Nothing, always succeeds. + ***/ +void mssErrorf(int clr, char* module, const char* format, ...) + { + /** Prevent interlacing with stdout flushing at a weird time. **/ + check(fflush(stdout)); /* Failure ignored. */ + + /** Insert convenient newline before error stack begins. **/ + if (clr == 1) fprintf(stderr, "\n"); + + /** Process the format with all the same rules as printf(). **/ + char buf[BUFSIZ]; + va_list args; + va_start(args, format); + const int num_chars = vsnprintf(buf, sizeof(buf), format, args); + va_end(args); + + /** Error check vsnprintf; a return value >= the buffer size means the output was truncated. **/ + if (num_chars < 0) + { + perror("vsnprintf() failed"); + fprintf(stderr, "FAIL: mssErrorf(%d, \"%s\", \"%s\", ...)\n", clr, module, format); + return; + } + if (num_chars >= BUFSIZ) + fprintf(stderr, "Warning: Error truncated (length %d > buffer size %d).\n", num_chars, BUFSIZ); + + /** Print the error. **/ + const int ret = mssError(clr, module, "%s", buf); + + /** Not sure why you have to error check the error function... 
**/ + if (ret != 0) fprintf(stderr, "FAIL %d: mssError(%d, \"%s\", \"%%s\", \"%s\")\n", ret, clr, module, buf); + } + + +/** TODO: Greg - I think this should be moved to datatypes. **/ +/** Should maybe replace current type parsing in the presentation hints. **/ +/*** Parse the given string into a datatype. The case of the first character + *** is ignored, but all other characters must be capitalized correctly. + *** + *** @attention - This function is optimized to prevent performance hits + *** situations where it may need to be called many thousands of times. + *** + *** @param str The string to be parsed to a datatype. + *** @returns The datatype. + *** + *** LINK ../../centrallix-lib/include/datatypes.h:72 + ***/ +static int ci_TypeFromStr(const char* str) + { + /** All valid types are non-null strings, at least 2 characters long. **/ + if (str == NULL || str[0] == '\0' || str[1] == '\0') return -1; + + /** Check type. **/ + switch (str[0]) + { + case 'A': case 'a': + if (strcmp(str+1, "Array"+1) == 0) return DATA_T_ARRAY; + if (strcmp(str+1, "Any"+1) == 0) return DATA_T_ANY; + break; + + case 'B': case 'b': + if (strcmp(str+1, "Binary"+1) == 0) return DATA_T_BINARY; + break; + + case 'C': case 'c': + if (strcmp(str+1, "Code"+1) == 0) return DATA_T_CODE; + break; + + case 'D': case 'd': + if (strcmp(str+1, "Double"+1) == 0) return DATA_T_DOUBLE; + if (strcmp(str+1, "DateTime"+1) == 0) return DATA_T_DATETIME; + break; + + case 'I': case 'i': + if (strcmp(str+1, "Integer"+1) == 0) return DATA_T_INTEGER; + if (strcmp(str+1, "IntVector"+1) == 0) return DATA_T_INTVEC; + break; + + case 'M': case 'm': + if (strcmp(str+1, "Money"+1) == 0) return DATA_T_MONEY; + break; + + case 'S': case 's': + if (strcmp(str+1, "String"+1) == 0) return DATA_T_STRING; + if (strcmp(str+1, "StringVector"+1) == 0) return DATA_T_STRINGVEC; + break; + + case 'U': case 'u': + if (strcmp(str+1, "Unknown"+1) == 0) return DATA_T_UNAVAILABLE; + if (strcmp(str+1, "Unavailable"+1) == 0) return 
DATA_T_UNAVAILABLE; + break; + } + + /** Invalid type. **/ + return -1; + } + +/** TODO: Greg - I think this should be moved to datatypes. **/ +/** Should maybe replace this functionality where it appears elsewhere. **/ +static char* ci_TypeToStr(const int type) + { + switch (type) + { + case DATA_T_UNAVAILABLE: return "Unknown"; + case DATA_T_INTEGER: return "Integer"; + case DATA_T_STRING: return "String"; + case DATA_T_DOUBLE: return "Double"; + case DATA_T_DATETIME: return "DateTime"; + case DATA_T_INTVEC: return "IntVector"; + case DATA_T_STRINGVEC: return "StringVector"; + case DATA_T_MONEY: return "Money"; + case DATA_T_ARRAY: return "Array"; + case DATA_T_CODE: return "Code"; + case DATA_T_BINARY: return "Binary"; + } + + /** Invalid type. **/ + mssErrorf(1, "Cluster", "Invalid type %d.\n", type); + return "Invalid"; /* Shall not parse to a valid type in ci_TypeFromStr(). */ + } + +/** TODO: Greg - I think this should be moved to xarray. **/ +/*** Trims an xArray, returning a new array (with nmSysMalloc). + *** + *** @param arr The array to be trimmed. + *** @param array_handling 0: No clean up. + *** 1: DeInit arr. + *** 2: Free arr. + *** *: Any other value prints a warning and does nothing. + *** @returns The new array, or NULL if nmSysMalloc() returns NULL. + ***/ +static void** ci_xaToTrimmedArray(pXArray arr, int array_handling) + { + const size_t arr_size = arr->nItems * sizeof(void*); + void** result = check_ptr(nmSysMalloc(arr_size)); + if (result == NULL) return NULL; + memcpy(result, arr->Items, arr_size); + + /** Handle the array. **/ + switch (array_handling) + { + case 0: break; + case 1: check(xaDeInit(arr)); arr->nAlloc = 0; break; /* Failure ignored. */ + case 2: check(xaFree(arr)); break; /* Failure ignored. */ + default: + /** Uh oh, there might be a memory leak... 
**/ + fprintf(stderr, + "Warning: ci_xaToTrimmedArray(%p, %d) - Unknown value (%d) for array_handling.\n", + arr, array_handling, array_handling + ); + break; + } + + return result; + } + +/** I got tired of forgetting how to do these. **/ +#define ci_file_name(obj) \ + ({ \ + __typeof__ (obj) _obj = (obj); \ + obj_internal_PathPart(_obj->Pathname, _obj->SubPtr - 1, 1); \ + }) +#define ci_file_path(obj) \ + ({ \ + __typeof__ (obj) _obj = (obj); \ + obj_internal_PathPart(_obj->Pathname, 0, _obj->SubPtr); \ + }) + + +/** ================ Enum Declarations ================ **/ +/** ANCHOR[id=enums] **/ + +/** Enum representing a clustering algorithm. **/ +typedef unsigned char ClusterAlgorithm; +#define ALGORITHM_NULL (ClusterAlgorithm)0u +#define ALGORITHM_NONE (ClusterAlgorithm)1u +#define ALGORITHM_SLIDING_WINDOW (ClusterAlgorithm)2u +#define ALGORITHM_KMEANS (ClusterAlgorithm)3u +#define ALGORITHM_KMEANS_PLUS_PLUS (ClusterAlgorithm)4u +#define ALGORITHM_KMEDOIDS (ClusterAlgorithm)5u +#define ALGORITHM_DB_SCAN (ClusterAlgorithm)6u + +#define nClusteringAlgorithms 7u +ClusterAlgorithm ALL_CLUSTERING_ALGORITHMS[nClusteringAlgorithms] = + { + ALGORITHM_NULL, + ALGORITHM_NONE, + ALGORITHM_SLIDING_WINDOW, + ALGORITHM_KMEANS, + ALGORITHM_KMEANS_PLUS_PLUS, + ALGORITHM_KMEDOIDS, + ALGORITHM_DB_SCAN, + }; + +/** Converts a clustering algorithm to its string name. **/ +char* ci_ClusteringAlgorithmToString(ClusterAlgorithm clustering_algorithm) + { + switch (clustering_algorithm) + { + case ALGORITHM_NULL: return "NULL algorithm"; + case ALGORITHM_NONE: return "none"; + case ALGORITHM_SLIDING_WINDOW: return "sliding-window"; + case ALGORITHM_KMEANS: return "k-means"; + case ALGORITHM_KMEANS_PLUS_PLUS: return "k-means++"; + case ALGORITHM_KMEDOIDS: return "k-medoids"; + case ALGORITHM_DB_SCAN: return "db-scan"; + default: return "Unknown algorithm"; + } + } + +/** Enum representing a similarity measurement algorithm. 
**/ +typedef unsigned char SimilarityMeasure; +#define SIMILARITY_NULL (SimilarityMeasure)0u +#define SIMILARITY_COSINE (SimilarityMeasure)1u +#define SIMILARITY_LEVENSHTEIN (SimilarityMeasure)2u + +#define nSimilarityMeasures 3u +SimilarityMeasure ALL_SIMILARITY_MEASURES[nSimilarityMeasures] = + { + SIMILARITY_NULL, + SIMILARITY_COSINE, + SIMILARITY_LEVENSHTEIN, + }; + +/** Converts a similarity measure to its string name. **/ +char* ci_SimilarityMeasureToString(SimilarityMeasure similarity_measure) + { + switch (similarity_measure) + { + case SIMILARITY_NULL: return "NULL similarity measure"; + case SIMILARITY_COSINE: return "cosine"; + case SIMILARITY_LEVENSHTEIN: return "levenshtein"; + default: return "Unknown similarity measure"; + } + } + +/*** Enum representing the type of data targetted by the driver, + *** set based on the path given when the driver is used to open + *** a cluster file. + *** + *** `0u` is reserved for a possible `NULL` value in the future. + *** However, there is currently no allowed `NULL` TargetType. + ***/ +typedef unsigned char TargetType; +#define TARGET_ROOT (TargetType)1u +#define TARGET_CLUSTER (TargetType)2u +#define TARGET_SEARCH (TargetType)3u +#define TARGET_CLUSTER_ENTRY (TargetType)4u +#define TARGET_SEARCH_ENTRY (TargetType)5u + +/** Attribute name lists by TargetType. 
**/ +#define END_OF_ARRAY NULL +char* const ATTR_ROOT[] = + { + "source", + "attr_name", + "date_created", + "date_computed", + END_OF_ARRAY, + }; +char* const ATTR_CLUSTER[] = + { + "algorithm", + "similarity_measure", + "num_clusters", + "min_improvement", + "max_iterations", + "date_created", + "date_computed", + END_OF_ARRAY, + }; +char* const ATTR_SEARCH[] = + { + "source", + "threshold", + "similarity_measure", + "date_created", + "date_computed", + END_OF_ARRAY, + }; +char* const ATTR_CLUSTER_ENTRY[] = + { + "items", + "date_created", + "date_computed", + END_OF_ARRAY, + }; +char* const ATTR_SEARCH_ENTRY[] = + { + "key1", + "key2", + "sim", + "date_created", + "date_computed", + END_OF_ARRAY, + }; + +/** Method name list. **/ +char* const METHOD_NAME[] = + { + "cache", + "stat", + END_OF_ARRAY, + }; + + +/** ================ Struct Declarations ================ **/ +/** ANCHOR[id=structs] **/ + +/*** Represents the data source which may have data already fetched. + *** + *** Memory Stats: + *** - Padding: 4 bytes + *** - Total size: 80 bytes + *** + *** @skip --> Attribute Data. + *** @param Name The source name, specified in the .cluster file. + *** @param Key The key associated with this object in the SourceDataCache. + *** @param SourcePath The path to the data source from which to retrieve data. + *** @param KeyAttr The name of the attribute to use when getting keys from + *** the SourcePath. + *** @param NameAttr The name of the attribute to use when getting data from + *** the SourcePath. + *** + *** @skip --> Computed data. + *** @param Keys The keys for each data string received from the + *** database, allowing them to be lined up again when queried. + *** @param Strings The data strings to be clustered and searched, or NULL if + *** they have not been fetched from the source. + *** @param Vectors The cosine comparison vectors from the fetched data, or + *** NULL if they haven't been computed. 
Note that vectors are no longer + *** needed once all clusters and searches have been computed, so they are + *** automatically freed in that case to save memory. + *** @param nVectors The number of vectors and data strings. + *** + *** @skip --> Time. + *** @param DateCreated The date and time that this object was created and initialized. + *** @param DateComputed The date and time that the computed attributes were computed. + ***/ +typedef struct _SOURCE + { + char* Name; + char* Key; + char* SourcePath; + char* KeyAttr; + char* NameAttr; + char** Keys; + char** Strings; + pVector* Vectors; + unsigned int nVectors; + DateTime DateCreated; + DateTime DateComputed; + } + SourceData, *pSourceData; + + +/*** Computed data for a single cluster. + *** + *** Memory Stats: + *** - Padding: 4 bytes + *** - Total size: 24 bytes + *** + *** @param Size The number of items in the cluster. + *** @param Strings The string values of each item. + *** @param Vectors The cosine vectors for each item. + ***/ +typedef struct + { + unsigned int Size; + char** Strings; + pVector* Vectors; + } + Cluster, *pCluster; + + +/*** Data for each cluster. Only attribute data is checked for caching. + *** + *** Memory Stats: + *** - Padding: 2 bytes + *** - Total size: 96 bytes + *** + *** @skip --> Attribute Data. + *** @param Name The cluster name, specified in the .cluster file. + *** @param Key The key associated with this object in the ClusterDataCache. + *** @param ClusterAlgorithm The clustering algorithm to be used. + *** @param SimilarityMeasure The similarity measure used to compare items. + *** @param nClusters The number of clusters. 1 if algorithm = none. + *** @param MinImprovement The minimum amount of improvement that must be met + *** each clustering iteration. If there is less improvement, the algorithm + *** will stop. The "max" in a .cluster file is represented by -inf. + *** @param MaxIterations The maximum number of iterations that a clustering + *** algorithm can run for. 
Note: Sliding window uses this attribute to store
+ ***     the window_size.
+ ***
+ *** @skip --> Relationship Data.
+ *** @param nSubClusters The number of subclusters of this cluster.
+ *** @param SubClusters A pClusterData array, NULL if nSubClusters == 0.
+ *** @param Parent This cluster's parent. NULL if it is not a subcluster.
+ *** @param SourceData Pointer to the source data that this cluster uses.
+ ***
+ *** @skip --> Computed data.
+ *** @param Clusters An array of length num_clusters, NULL if the clusters
+ ***     have not yet been computed.
+ *** @param Sims An array of num_vectors elements, where index i stores the
+ ***     similarity of vector i to its assigned cluster. This attribute is NULL
+ ***     if the clusters have not yet been computed.
+ ***
+ *** @skip --> Time.
+ *** @param DateCreated The date and time that this object was created and initialized.
+ *** @param DateComputed The date and time that the computed attributes were computed.
+ ***/
+typedef struct _CLUSTER
+    {
+    char* Name;
+    char* Key;
+    ClusterAlgorithm ClusterAlgorithm;
+    SimilarityMeasure SimilarityMeasure;
+    unsigned int nClusters;
+    double MinImprovement;
+    unsigned int MaxIterations;
+    unsigned int nSubClusters;
+    struct _CLUSTER** SubClusters;
+    struct _CLUSTER* Parent;
+    pSourceData SourceData;
+    Cluster* Clusters;
+    double* Sims;
+    DateTime DateCreated;
+    DateTime DateComputed;
+    }
+    ClusterData, *pClusterData;
+
+
+/*** Data for each search.
+ ***
+ *** Memory Stats:
+ ***   - Padding: 3 bytes
+ ***   - Total size: 64 bytes
+ ***
+ *** @skip --> Attribute Data.
+ *** @param Name The search name, specified in the .cluster file.
+ *** @param Key The key associated with this object in the SearchDataCache.
+ *** @param SourceCluster The cluster from which this search is to be derived
+ ***     (named by the "source" attribute of the search group).
+ *** @param SimilarityMeasure The similarity measure used to compare items.
+ *** @param Threshold The minimum similarity threshold for elements to be
+ ***     included in the results of the search.
+ ***
+ *** @skip --> Computed data.
+ *** @param Dups An array holding the dups found by the search, or NULL if the
+ ***     search has not been computed.
+ *** @param nDups The number of dups found.
+ ***
+ *** @skip --> Time.
+ *** @param DateCreated The date and time that this object was created and initialized.
+ *** @param DateComputed The date and time that the computed attributes were computed.
+ ***/
+typedef struct _SEARCH
+    {
+    char* Name;
+    char* Key;
+    pClusterData SourceCluster;
+    double Threshold;
+    pDup* Dups;
+    unsigned int nDups;
+    SimilarityMeasure SimilarityMeasure;
+    DateTime DateCreated;
+    DateTime DateComputed;
+    }
+    SearchData, *pSearchData;
+
+
+/*** Node instance data.
+ ***
+ *** Memory Stats:
+ ***   - Padding: 4 bytes
+ ***   - Total size: 64 bytes
+ ***
+ *** @note When a .cluster file is opened, there will be only one node for that
+ *** file. However, in the course of the query, many driver instance structs
+ *** may be created by functions like clusterQueryFetch(), and closed by the
+ *** object system using clusterClose().
+ ***
+ *** @param Parent The parent object used to open this NodeData instance.
+ *** @param SourceData Data from the provided source.
+ *** @param Params A pParam array storing the params in the .cluster file.
+ *** @param nParams The number of specified params.
+ *** @param ParamList Functions as a "scope" for resolving values during parsing.
+ *** @param ClusterDatas A pClusterData array storing the clusters in the .cluster file.
+ ***     Will be NULL if nClusterDatas = 0.
+ *** @param nClusterDatas The number of specified clusters.
+ *** @param SearchDatas A pSearchData array storing the searches in the .cluster file.
+ *** @param nSearchDatas The number of specified searches.
+ ***/
+typedef struct _NODE
+    {
+    pObject Parent;
+    pParam* Params;
+    pParamObjects ParamList;
+    pSourceData SourceData;
+    pClusterData* ClusterDatas;
+    pSearchData* SearchDatas;
+    unsigned int nParams;
+    unsigned int nClusterDatas;
+    unsigned int nSearchDatas;
+    }
+    NodeData, *pNodeData;
+
+/*** Driver instance data.
+ ***
+ *** Memory Stats:
+ ***   - Padding: 1 byte
+ ***   - Total size: 24 bytes
+ ***
+ *** This struct can be thought of like a "pointer" to specific data accessible
+ *** through the stored pNodeData struct. This struct also communicates whether
+ *** that data is guaranteed to have been computed.
+ ***
+ *** For example, if target type is the root, a cluster, or a search, no data
+ *** is guaranteed to be computed. These three types can be returned from
+ *** clusterOpen(), based on the provided path.
+ ***
+ *** Alternatively, a cluster entry or search entry can be targeted by calling
+ *** fetch on a query pointing to a driver instance that targets a cluster or
+ *** search (respectively). These two entry target types ensure that the data
+ *** they indicate has been computed, so the GetAttrType() and GetAttrValue()
+ *** functions do not need to check this repeatedly each time they are called.
+ ***
+ *** @param NodeData The associated node data struct. There can be many driver
+ ***     instances pointing to one NodeData at a time, but each driver instance
+ ***     always points to a single NodeData struct.
+ *** @param TargetType The type of data targeted (see above).
+ *** @param TargetData If target type is:
+ *** ```csv
+ ***     Root: A pointer to the SourceData struct.
+ ***     Cluster or ClusterEntry: A pointer to the targeted cluster.
+ ***     Search or SearchEntry: A pointer to the targeted search.
+ *** ```
+ *** @param TargetIndex Not described in the original documentation; presumably
+ ***     the index of the targeted entry within TargetData (TODO confirm).
+ *** @param TargetAttrIndex An index into an attribute list (for GetNextAttr()).
+ *** @param TargetMethodIndex An index into a method list (for GetNextMethod()).
+ ***/
+typedef struct _DRIVER
+    {
+    pNodeData NodeData;
+    void* TargetData;
+    unsigned int TargetIndex;
+    unsigned char TargetAttrIndex;
+    unsigned char TargetMethodIndex;
+    TargetType TargetType;
+    }
+    DriverData, *pDriverData;
+
+/*** Query instance data.
+ ***
+ *** Memory Stats:
+ ***   - Padding: 4 bytes
+ ***   - Total size: 16 bytes
+ ***
+ *** @param DriverData The associated driver instance being queried.
+ *** @param RowIndex The selected row of the data targeted by the driver.
+ ***/
+typedef struct
+    {
+    pDriverData DriverData;
+    unsigned int RowIndex;
+    }
+    ClusterQuery, *pClusterQuery;
+
+
+/** Global storage for caches. One hash table per kind of parsed object;
+ ** the ci_Parse*Data() functions look entries up and add them using the
+ ** cache key string they build for each object. **/
+struct
+    {
+    XHashTable SourceDataCache;
+    XHashTable ClusterDataCache;
+    XHashTable SearchDataCache;
+    }
+    ClusterDriverCaches;
+
+/** Global call counters. The field names mirror driver entry points;
+ ** presumably each is incremented by the matching function (the code that
+ ** updates them is not part of this section - TODO confirm). **/
+struct
+    {
+    unsigned long long OpenCalls;
+    unsigned long long OpenQueryCalls;
+    unsigned long long FetchCalls;
+    unsigned long long CloseCalls;
+    unsigned long long GetTypeCalls;
+    unsigned long long GetValCalls;
+    unsigned long long GetValCalls_name;
+    unsigned long long GetValCalls_key1;
+    unsigned long long GetValCalls_key2;
+    unsigned long long GetValCalls_sim;
+    } ClusterStatistics;
+
+
+/** ================ Function Declarations ================ **/
+/** ANCHOR[id=functions] **/
+
+/** Note: ci stands for "cluster_internal". **/
+
+/** Parsing Functions.
**/ +// LINK #parsing +static void ci_GiveHint(const char* hint); +static bool ci_TryHint(char* value, char** valid_values, const unsigned int n_valid_values); +static int ci_ParseAttribute(pStructInf inf, char* attr_name, int datatype, pObjData data, pParamObjects param_list, bool required, bool print_type_error); +static ClusterAlgorithm ci_ParseClusteringAlgorithm(pStructInf cluster_inf, pParamObjects param_list); +static SimilarityMeasure ci_ParseSimilarityMeasure(pStructInf cluster_inf, pParamObjects param_list); +static pSourceData ci_ParseSourceData(pStructInf inf, pParamObjects param_list, char* path); +static pClusterData ci_ParseClusterData(pStructInf inf, pNodeData node_data); +static pSearchData ci_ParseSearchData(pStructInf inf, pNodeData node_data); +static pNodeData ci_ParseNodeData(pStructInf inf, pObject obj); + +/** Freeing Functions. **/ +// LINK #freeing +static void ci_FreeSourceData(pSourceData source_data); +static void ci_FreeClusterData(pClusterData cluster_data, bool recursive); +static void ci_FreeSearchData(pSearchData search_data); +static void ci_FreeNodeData(pNodeData node_data); +static void ci_ClearCaches(void); + +/** Deep Size Computation Functions. **/ +// LINK #sizing +static unsigned int ci_SizeOfSourceData(pSourceData source_data); +static unsigned int ci_SizeOfClusterData(pClusterData cluster_data, bool recursive); +static unsigned int ci_SizeOfSearchData(pSearchData search_data); + +/** Computation Functions. (Ensure data is computed.) **/ +// LINK #computation +static int ci_ComputeSourceData(pSourceData source_data, pObjSession session); +static int ci_ComputeClusterData(pClusterData cluster_data, pNodeData node_data); +static int ci_ComputeSearchData(pSearchData search_data, pNodeData node_data); + +/** Parameter Functions. 
**/ +// LINK #params +static int ci_GetParamType(void* inf_v, const char* attr_name); +static int ci_GetParamValue(void* inf_v, char* attr_name, int datatype, pObjData val); +static int ci_SetParamValue(void* inf_v, char* attr_name, int datatype, pObjData val); + +/** Driver Functions. **/ +// LINK #driver +void* clusterOpen(pObject parent, int mask, pContentType systype, char* usr_type, pObjTrxTree* oxt); +int clusterClose(void* inf_v, pObjTrxTree* oxt); +void* clusterOpenQuery(void* inf_v, pObjQuery query, pObjTrxTree* oxt); +void* clusterQueryFetch(void* qy_v, pObject obj, int mode, pObjTrxTree* oxt); +int clusterQueryClose(void* qy_v, pObjTrxTree* oxt); +int clusterGetAttrType(void* inf_v, char* attr_name, pObjTrxTree* oxt); +int clusterGetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val, pObjTrxTree* oxt); +pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pObjTrxTree* oxt); +char* clusterGetFirstAttr(void* inf_v, pObjTrxTree* oxt); +char* clusterGetNextAttr(void* inf_v, pObjTrxTree* oxt); +int clusterInfo(void* inf_v, pObjectInfo info); + +/** Method Execution Functions. **/ +// LINK #method +char* clusterGetFirstMethod(void* inf_v, pObjTrxTree* oxt); +char* clusterGetNextMethod(void* inf_v, pObjTrxTree* oxt); +static int ci_PrintEntry(pXHashEntry entry, void* arg); +static void ci_CacheFreeSourceData(pXHashEntry entry, void* path); +static void ci_CacheFreeCluster(pXHashEntry entry, void* path); +static void ci_CacheFreeSearch(pXHashEntry entry, void* path); +int clusterExecuteMethod(void* inf_v, char* method_name, pObjData param, pObjTrxTree* oxt); + +/** Unimplemented DriverFunctions. 
**/ +// LINK #unimplemented +int clusterCreate(pObject obj, int mask, pContentType systype, char* usrtype, pObjTrxTree* oxt); +int clusterDelete(pObject obj, pObjTrxTree* oxt); +int clusterDeleteObj(void* inf_v, pObjTrxTree* oxt); +int clusterRead(void* inf_v, char* buffer, int max_cnt, int offset, int flags, pObjTrxTree* oxt); +int clusterWrite(void* inf_v, char* buffer, int cnt, int offset, int flags, pObjTrxTree* oxt); +int clusterSetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val, pObjTrxTree* oxt); +int clusterAddAttr(void* inf_v, char* attr_name, int type, pObjData val, pObjTrxTree* oxt); +void* clusterOpenAttr(void* inf_v, char* attr_name, int mode, pObjTrxTree* oxt); +int clusterCommit(void* inf_v, pObjTrxTree *oxt); + +/** ================ Parsing Functions ================ **/ +/** ANCHOR[id=parsing] **/ +// LINK #functions + +/** Format a hint to give to the user. **/ +static void ci_GiveHint(const char* hint) + { + fprintf(stderr, " > Hint: Did you mean \"%s\"?\n", hint); + } + + +/*** Given the user a hint when they specify an invalid string for an attribute + *** where we know the list of valid strings. The hint is only displayed if + *** their string is close enough to a valid string. + *** + *** @param value The value the user gave. + *** @param valid_values The valid values that could be what they meant. + *** @param n_valid_values The number of valid values. Specify 0 to detect + *** length on a null terminated array of values. + *** @returns Whether a hint was given. + ***/ +static bool ci_TryHint(char* value, char** valid_values, const unsigned int n_valid_values) + { + char* guess = ca_most_similar(value, (void**)valid_values, n_valid_values, ca_lev_compare, 0.25); + if (guess == NULL) return false; /* No hint. */ + + /** Issue hint. **/ + ci_GiveHint(guess); + return true; + } + + +// LINK #functions +/*** Returns 0 for success and -1 on failure. 
Promises that mssError() will be + *** invoked on failure, so the caller need not specify their own error message. + *** Returns 1 if attribute is available, printing an error if the attribute was + *** marked as required. + *** + *** @attention - Promises that a failure invokes mssError() at least once. + *** + *** TODO: Greg - Review carefully. I think this code is the reason that runserver() + *** is NOT REQUIRED for dynamic attributes in the cluster driver. I had to debug + *** and rewrite this for ages and it uses several functions I don't 100% understand. + ***/ +static int ci_ParseAttribute( + pStructInf inf, + char* attr_name, + int datatype, + pObjData data, + pParamObjects param_list, + bool required, + bool print_type_error) + { + int ret; + + /** Get attribute inf. **/ + pStructInf attr_info = stLookup(inf, attr_name); + if (attr_info == NULL) + { + if (required) mssErrorf(1, "Cluster", "'%s' must be specified for clustering.", attr_name); + return 1; + } + ASSERTMAGIC(attr_info, MGK_STRUCTINF); + + /** Allocate expression. **/ + pExpression exp = check_ptr(stGetExpression(attr_info, 0)); + if (exp == NULL) goto err; + + /** Bind parameters. **/ + /** TODO: Greg - What does this return? How do I know if it fails? **/ + expBindExpression(exp, param_list, EXPR_F_RUNSERVER); + + /** Evaluate expression. **/ + ret = expEvalTree(exp, param_list); + if (ret != 0) + { + mssErrorf(0, "Cluster", "Expression evaluation failed (error code %d).", ret); + goto err; + } + + /** Check for data type mismatch. **/ + if (datatype != exp->DataType) + { + mssErrorf(1, "Cluster", + "Expected ['%s' : %s], but got type %s.", + attr_name, ci_TypeToStr(datatype), ci_TypeToStr(exp->DataType) + ); + goto err; + } + + /** Get the data out of the expression. 
**/ + ret = expExpressionToPod(exp, datatype, data); + if (ret != 0) + { + mssErrorf(1, "Cluster", + "Failed to get ['%s' : %s] using expression \"%s\" (error code %d).", + attr_name, ci_TypeToStr(datatype), exp->Name, ret + ); + goto err; + } + + /** Success. **/ + return 0; + + err: + mssErrorf(0, "Cluster", + "Failed to parse attribute \"%s\" from group \"%s\"", + attr_name, inf->Name + ); + return -1; + } + + +// LINK #functions +/*** Parses a ClusteringAlgorithm from the algorithm attribute in the pStructInf + *** representing some structure with that attribute in a parsed structure file. + *** + *** @attention - Promises that a failure invokes mssError() at least once. + *** + *** @param inf A parsed pStructInf. + *** @param param_list The param objects that function as a kind of "scope" for + *** evaluating parameter variables in the structure file. + *** @returns The data algorithm, or ALGORITHM_NULL on failure. + ***/ +static ClusterAlgorithm ci_ParseClusteringAlgorithm(pStructInf inf, pParamObjects param_list) + { + /** Get the algorithm attribute. **/ + char* algorithm; + if (ci_ParseAttribute(inf, "algorithm", DATA_T_STRING, POD(&algorithm), param_list, true, true) != 0) + { + mssErrorf(0, "Cluster", "Failed to parse attribute 'algorithm' in group \"%s\".", inf->Name); + return ALGORITHM_NULL; + } + + /** Parse known clustering algorithms. **/ + if (!strcasecmp(algorithm, "none")) return ALGORITHM_NONE; + if (!strcasecmp(algorithm, "sliding-window")) return ALGORITHM_SLIDING_WINDOW; + if (!strcasecmp(algorithm, "k-means")) return ALGORITHM_KMEANS; + if (!strcasecmp(algorithm, "k-means++")) return ALGORITHM_KMEANS_PLUS_PLUS; + if (!strcasecmp(algorithm, "k-medoids")) return ALGORITHM_KMEDOIDS; + if (!strcasecmp(algorithm, "db-scan")) return ALGORITHM_DB_SCAN; + + /** Unknown value for clustering algorithm. **/ + mssErrorf(1, "Cluster", "Unknown \"clustering algorithm\": %s", algorithm); + + /** Attempt to give a hint. 
**/ + char* all_names[nClusteringAlgorithms] = {NULL}; + for (unsigned int i = 0u; i < nClusteringAlgorithms; i++) + all_names[i] = ci_ClusteringAlgorithmToString(ALL_CLUSTERING_ALGORITHMS[i]); + if (ci_TryHint(algorithm, all_names, nClusteringAlgorithms)); + else if (strcasecmp(algorithm, "sliding") == 0) ci_GiveHint(ci_ClusteringAlgorithmToString(ALGORITHM_SLIDING_WINDOW)); + else if (strcasecmp(algorithm, "window") == 0) ci_GiveHint(ci_ClusteringAlgorithmToString(ALGORITHM_SLIDING_WINDOW)); + else if (strcasecmp(algorithm, "null") == 0) ci_GiveHint(ci_ClusteringAlgorithmToString(ALGORITHM_NONE)); + else if (strcasecmp(algorithm, "nothing") == 0) ci_GiveHint(ci_ClusteringAlgorithmToString(ALGORITHM_NONE)); + + /** Fail. **/ + return ALGORITHM_NULL; + } + + +// LINK #functions +/*** Parses a SimilarityMeasure from the similarity_measure attribute in the given + *** pStructInf parameter, which represents some structure with that attribute + *** in a parsed structure file. + *** + *** @attention - Promises that a failure invokes mssError() at least once. + *** + *** @param inf A parsed pStructInf. + *** @param param_list The param objects that function as a kind of "scope" for + *** evaluating parameter variables in the structure file. + *** @returns The similarity measure, or SIMILARITY_NULL on failure. + ***/ +static SimilarityMeasure ci_ParseSimilarityMeasure(pStructInf inf, pParamObjects param_list) + { + /** Get the similarity_measure attribute. **/ + char* measure; + if (ci_ParseAttribute(inf, "similarity_measure", DATA_T_STRING, POD(&measure), param_list, true, true) != 0) + { + mssErrorf(0, "Cluster", "Failed to parse attribute 'similarity_measure' in group \"%s\".", inf->Name); + return SIMILARITY_NULL; + } + + /** Parse known clustering algorithms. **/ + if (!strcasecmp(measure, "cosine")) return SIMILARITY_COSINE; + if (!strcasecmp(measure, "levenshtein")) return SIMILARITY_LEVENSHTEIN; + + /** Unknown similarity measure. 
**/ + mssErrorf(1, "Cluster", "Unknown \"similarity measure\": %s", measure); + + /** Attempt to give a hint. **/ + char* all_names[nSimilarityMeasures] = {NULL}; + for (unsigned int i = 0u; i < nSimilarityMeasures; i++) + all_names[i] = ci_SimilarityMeasureToString(ALL_SIMILARITY_MEASURES[i]); + if (ci_TryHint(measure, all_names, nSimilarityMeasures)); + else if (strcasecmp(measure, "cos") == 0) ci_GiveHint(ci_SimilarityMeasureToString(SIMILARITY_COSINE)); + else if (strcasecmp(measure, "lev") == 0) ci_GiveHint(ci_SimilarityMeasureToString(SIMILARITY_LEVENSHTEIN)); + else if (strcasecmp(measure, "edit-dist") == 0) ci_GiveHint(ci_SimilarityMeasureToString(SIMILARITY_LEVENSHTEIN)); + else if (strcasecmp(measure, "edit-distance") == 0) ci_GiveHint(ci_SimilarityMeasureToString(SIMILARITY_LEVENSHTEIN)); + + /** Fail. **/ + return SIMILARITY_NULL; + } + + +// LINK #functions +/*** Allocates a new pSourceData struct from a parsed pStructInf representing + *** a .cluster structure file. + *** + *** @attention - Warning: Caching in use. + *** @attention - Promises that a failure invokes mssError() at least once. + *** + *** @param inf A parsed pStructInf for a .cluster structure file. + *** @param param_list The param objects that function as a kind of "scope" for + *** evaluating parameter variables in the structure file. + *** @param path The file path to the parsed structure file, used to generate + *** cache entry keys. + *** @returns A new pSourceData struct on success, or NULL on failure. + ***/ +static pSourceData ci_ParseSourceData(pStructInf inf, pParamObjects param_list, char* path) + { + char* buf = NULL; + + /** Allocate SourceData. **/ + pSourceData source_data = check_ptr(nmMalloc(sizeof(SourceData))); + if (source_data == NULL) goto err_free; + memset(source_data, 0, sizeof(SourceData)); + + /** Initialize obvious values for SourceData. 
**/ + source_data->Name = check_ptr(nmSysStrdup(inf->Name)); + if (source_data->Name == NULL) goto err_free; + if (!check(objCurrentDate(&source_data->DateCreated))) goto err_free; + + /** Get source. **/ + if (ci_ParseAttribute(inf, "source", DATA_T_STRING, POD(&buf), param_list, true, true) != 0) goto err_free; + source_data->SourcePath = check_ptr(nmSysStrdup(buf)); + if (source_data->SourcePath == NULL) goto err_free; + + /** Get the attribute name to use when querying keys from the source. **/ + if (ci_ParseAttribute(inf, "key_attr", DATA_T_STRING, POD(&buf), param_list, true, true) != 0) goto err_free; + source_data->KeyAttr = check_ptr(nmSysStrdup(buf)); + if (source_data->KeyAttr == NULL) goto err_free; + + /** Get the attribute name to use for querying data from the source. **/ + if (ci_ParseAttribute(inf, "data_attr", DATA_T_STRING, POD(&buf), param_list, true, true) != 0) goto err_free; + source_data->NameAttr = check_ptr(nmSysStrdup(buf)); + if (source_data->NameAttr == NULL) goto err_free; + + /** Create cache entry key. **/ + const size_t len = strlen(path) + strlen(source_data->SourcePath) + strlen(source_data->KeyAttr) + strlen(source_data->NameAttr) + 5lu; + source_data->Key = check_ptr(nmSysMalloc(len * sizeof(char))); + if (source_data->Key == NULL) goto err_free; + snprintf(source_data->Key, len, "%s?%s->%s:%s", path, source_data->SourcePath, source_data->KeyAttr, source_data->NameAttr); + + /** Check for a cached version. **/ + pSourceData source_maybe = (pSourceData)xhLookup(&ClusterDriverCaches.SourceDataCache, source_data->Key); + if (source_maybe != NULL) + { + /** Cache hit. **/ + + /** Cause an immediate invalid read if cache was incorrectly freed. **/ + + /** Free data we don't need. **/ + nmSysFree(source_data->Key); + ci_FreeSourceData(source_data); + + /** Return the cached source data. **/ + return source_maybe; + } + + /** Cache miss: Add the new object to the cache for next time. 
**/ + if (!check(xhAdd(&ClusterDriverCaches.SourceDataCache, source_data->Key, (void*)source_data))) + goto err_free; + + /** Success. **/ + return source_data; + + /** Error handling. **/ + err_free: + if (source_data->Key != NULL) nmSysFree(source_data->Key); + if (source_data != NULL) ci_FreeSourceData(source_data); + + mssErrorf(0, "Cluster", + "Failed to parse source data from group \"%s\" in file: %s", + inf->Name, path + ); + return NULL; + } + + +// LINK #functions +/*** Allocates a new pClusterData struct from a parsed pStructInf. + *** + *** @attention - Warning: Caching in use. + *** @attention - Promises that mssError() will be invoked on failure, so the + *** caller is not required to specify their own error message. + *** + *** @param inf A parsed pStructInf for a cluster group in a structure file. + *** @param param_list The param objects that function as a kind of "scope" for + *** evaluating parameter variables in the structure file. + *** @param source_data The pSourceData that clusters are to be built from, also + *** used to generate cache entry keys. + *** @returns A new pClusterData struct on success, or NULL on failure. + ***/ +static pClusterData ci_ParseClusterData(pStructInf inf, pNodeData node_data) + { + int result; + + /** Extract values. **/ + pParamObjects param_list = node_data->ParamList; + pSourceData source_data = node_data->SourceData; + + /** Allocate space for data struct. **/ + pClusterData cluster_data = check_ptr(nmMalloc(sizeof(ClusterData))); + if (cluster_data == NULL) goto err; + memset(cluster_data, 0, sizeof(ClusterData)); + + /** Basic Properties. **/ + cluster_data->Name = check_ptr(nmSysStrdup(inf->Name)); + if (cluster_data->Name == NULL) goto err_free_cluster; + cluster_data->SourceData = check_ptr(source_data); + if (cluster_data->SourceData == NULL) goto err_free_cluster; + if (!check(objCurrentDate(&cluster_data->DateCreated))) goto err_free_cluster; + + /** Get algorithm. 
**/ + cluster_data->ClusterAlgorithm = ci_ParseClusteringAlgorithm(inf, param_list); + if (cluster_data->ClusterAlgorithm == ALGORITHM_NULL) goto err_free_cluster; + + /** Handle no clustering case. **/ + if (cluster_data->ClusterAlgorithm == ALGORITHM_NONE) + { + cluster_data->nClusters = 1u; + goto parsing_done; + } + + /** Get similarity_measure. **/ + cluster_data->SimilarityMeasure = ci_ParseSimilarityMeasure(inf, param_list); + if (cluster_data->SimilarityMeasure == SIMILARITY_NULL) goto err_free_cluster; + + /** Handle sliding window case. **/ + if (cluster_data->ClusterAlgorithm == ALGORITHM_SLIDING_WINDOW) + { + /** Sliding window doesn't allocate any clusters. **/ + cluster_data->nClusters = 0u; + + /** Get window_size. **/ + int window_size; + if (ci_ParseAttribute(inf, "window_size", DATA_T_INTEGER, POD(&window_size), param_list, true, true) != 0) + goto err_free_cluster; + if (window_size < 1) + { + mssErrorf(1, "Cluster", "Invalid value for [window_size : uint > 0]: %d", window_size); + goto err_free_cluster; + } + + /** Store value. **/ + cluster_data->MaxIterations = (unsigned int)window_size; + goto parsing_done; + } + + /** Get num_clusters. **/ + int num_clusters; + if (ci_ParseAttribute(inf, "num_clusters", DATA_T_INTEGER, POD(&num_clusters), param_list, true, true) != 0) + goto err_free_cluster; + if (num_clusters < 2) + { + mssErrorf(1, "Cluster", "Invalid value for [num_clusters : uint > 1]: %d", num_clusters); + if (num_clusters == 1) fprintf(stderr, "HINT: Use algorithm=\"none\" to disable clustering.\n"); + goto err_free_cluster; + } + cluster_data->nClusters = (unsigned int)num_clusters; + + /** Get min_improvement. 
**/ + double improvement; + result = ci_ParseAttribute(inf, "min_improvement", DATA_T_DOUBLE, POD(&improvement), param_list, false, false); + if (result == 1) cluster_data->MinImprovement = DEFAULT_MIN_IMPROVEMENT; + else if (result == 0) + { + if (improvement <= 0.0 || 1.0 <= improvement) + { + mssErrorf(1, "Cluster", "Invalid value for [min_improvement : 0.0 < x < 1.0 | \"none\"]: %g", improvement); + goto err_free_cluster; + } + + /** Successfully got value. **/ + cluster_data->MinImprovement = improvement; + } + else if (result == -1) + { + char* str; + result = ci_ParseAttribute(inf, "min_improvement", DATA_T_STRING, POD(&str), param_list, false, true); + if (result != 0) goto err_free_cluster; + if (strcasecmp(str, "none") != 0) + { + mssErrorf(1, "Cluster", "Invalid value for [min_improvement : 0.0 < x < 1.0 | \"none\"]: %s", str); + goto err_free_cluster; + } + + /** Successfully got none. **/ + cluster_data->MinImprovement = -INFINITY; + } + + /** Get max_iterations. **/ + int max_iterations; + result = ci_ParseAttribute(inf, "max_iterations", DATA_T_INTEGER, POD(&max_iterations), param_list, false, true); + if (result == -1) goto err_free_cluster; + if (result == 0) + { + if (max_iterations < 1) + { + mssErrorf(1, "Cluster", "Invalid value for [max_iterations : uint]: %d", max_iterations); + goto err_free_cluster; + } + cluster_data->MaxIterations = (unsigned int)max_iterations; + } + else cluster_data->MaxIterations = DEFAULT_MAX_ITERATIONS; + + /** Search for sub-clusters. **/ + XArray sub_clusters; + if (!check(xaInit(&sub_clusters, 4u))) goto err_free_cluster; + for (unsigned int i = 0u; i < inf->nSubInf; i++) + { + pStructInf sub_inf = check_ptr(inf->SubInf[i]); + ASSERTMAGIC(sub_inf, MGK_STRUCTINF); + char* name = sub_inf->Name; + + /** Handle various struct types. **/ + const int struct_type = stStructType(sub_inf); + switch (struct_type) + { + case ST_T_ATTRIB: + { + /** Valid attribute names. 
**/ + char* attrs[] = { + "algorithm", + "similarity_measure", + "num_clusters", + "min_improvement", + "max_iterations", + "window_size", + }; + const unsigned int nattrs = sizeof(attrs) / sizeof(char*); + + /** Ignore valid attribute names. **/ + bool is_valid = false; + for (unsigned int i = 0u; i < nattrs; i++) + { + if (strcmp(name, attrs[i]) == 0) + { + is_valid = true; + break; + } + } + if (is_valid) continue; /* Next inf. */ + + /** Give the user a warning, and attempt to give them a hint. **/ + fprintf(stderr, "Warning: Unknown attribute '%s' in cluster \"%s\".\n", name, inf->Name); + if (ci_TryHint(name, attrs, nattrs)); + else if (strcasecmp(name, "k") == 0) ci_GiveHint("num_clusters"); + else if (strcasecmp(name, "threshold") == 0) ci_GiveHint("min_improvement"); + + break; + } + + case ST_T_SUBGROUP: + { + /** Select array by group type. **/ + char* group_type = check_ptr(sub_inf->UsrType); + if (group_type == NULL) goto err_free_subclusters; + if (strcmp(group_type, "cluster/cluster") != 0) + { + mssErrorf(1, "Cluster", + "Warning: Unknown group [\"%s\" : \"%s\"] in cluster \"%s\".\n", + name, group_type, inf->Name + ); + continue; + } + + /** Subcluster found. **/ + pClusterData sub_cluster = ci_ParseClusterData(sub_inf, node_data); + if (sub_cluster == NULL) goto err_free_subclusters; + sub_cluster->Parent = cluster_data; + if (!check_neg(xaAddItem(&sub_clusters, sub_cluster))) goto err_free_subclusters; + + break; + } + + default: + { + mssErrorf(1, "Cluster", + "Warning: Unknown struct type %d in cluster \"%s\".", + struct_type, inf->Name + ); + goto err_free_subclusters; + } + } + } + cluster_data->nSubClusters = sub_clusters.nItems; + cluster_data->SubClusters = (pClusterData*)ci_xaToTrimmedArray(&sub_clusters, 1); + + /** Create the cache key. 
**/ + parsing_done:; + char* key; + switch (cluster_data->ClusterAlgorithm) + { + case ALGORITHM_NONE: + { + const size_t len = strlen(source_data->Key) + strlen(cluster_data->Name) + 8lu; + key = nmSysMalloc(len * sizeof(char)); + snprintf(key, len, "%s/%s?%u", + source_data->Key, + cluster_data->Name, + ALGORITHM_NONE + ); + break; + } + + case ALGORITHM_SLIDING_WINDOW: + { + const size_t len = strlen(source_data->Key) + strlen(cluster_data->Name) + 16lu; + key = nmSysMalloc(len * sizeof(char)); + snprintf(key, len, "%s/%s?%u&%u&%u", + source_data->Key, + cluster_data->Name, + ALGORITHM_SLIDING_WINDOW, + cluster_data->SimilarityMeasure, + cluster_data->MaxIterations + ); + break; + } + + default: + { + const size_t len = strlen(source_data->Key) + strlen(cluster_data->Name) + 32lu; + key = nmSysMalloc(len * sizeof(char)); + snprintf(key, len, "%s/%s?%u&%u&%u&%g&%u", + source_data->Key, + cluster_data->Name, + cluster_data->ClusterAlgorithm, + cluster_data->SimilarityMeasure, + cluster_data->nClusters, + cluster_data->MinImprovement, + cluster_data->MaxIterations + ); + break; + } + } + cluster_data->Key = key; + + /** Check for a cached version. **/ + pClusterData cluster_maybe = (pClusterData)xhLookup(&ClusterDriverCaches.ClusterDataCache, key); + if (cluster_maybe != NULL) + { /* Cache hit. */ + /** Free the parsed cluster that we no longer need. */ + ci_FreeClusterData(cluster_data, false); + nmSysFree(key); + + /** Return the cached cluster. **/ + return cluster_maybe; + } + + /** Cache miss. **/ + if (!check(xhAdd(&ClusterDriverCaches.ClusterDataCache, key, (void*)cluster_data))) goto err_free_key; + return cluster_data; + + /** Error cleanup. **/ + err_free_key: + nmSysFree(key); + + err_free_subclusters: + for (unsigned int i = 0u; i < sub_clusters.nItems; i++) + ci_FreeClusterData(sub_clusters.Items[i], true); + check(xaDeInit(&sub_clusters)); /* Failure ignored. 
*/ + + err_free_cluster: + ci_FreeClusterData(cluster_data, false); + + err: + mssErrorf(0, "Cluster", "Failed to parse cluster from group \"%s\".", inf->Name); + return NULL; + } + + +// LINK #functions +/*** Allocates a new pSearchData struct from a parsed pStructInf. + *** + *** @attention - Warning: Caching in use. + *** @attention - Promises that mssError() will be invoked on failure, so the + *** caller is not required to specify their own error message. + *** + *** @param inf A parsed pStructInf for a search group in a structure file. + *** @param param_list The param objects that function as a kind of "scope" for + *** evaluating parameter variables in the structure file. + *** @param node_data The pNodeData, used to get the param list and to look up + *** the cluster pointed to by the source attribute. + *** @returns A new pSearchData struct on success, or NULL on failure. + ***/ +static pSearchData ci_ParseSearchData(pStructInf inf, pNodeData node_data) + { + /** Allocate space for search struct. **/ + pSearchData search_data = check_ptr(nmMalloc(sizeof(SearchData))); + if (search_data == NULL) goto err; + memset(search_data, 0, sizeof(SearchData)); + + /** Get basic information. **/ + search_data->Name = check_ptr(nmSysStrdup(inf->Name)); + if (search_data->Name == NULL) goto err_free_search; + if (!check(objCurrentDate(&search_data->DateCreated))) goto err_free_search; + + /** Get source cluster. **/ + char* source_cluster_name; + if (ci_ParseAttribute(inf, "source", DATA_T_STRING, POD(&source_cluster_name), node_data->ParamList, true, true) != 0) return NULL; + for (unsigned int i = 0; i < node_data->nClusterDatas; i++) + { + pClusterData cluster_data = node_data->ClusterDatas[i]; + if (strcmp(source_cluster_name, cluster_data->Name) == 0) + { + /** SourceCluster found. **/ + search_data->SourceCluster = cluster_data; + break; + } + + /** Note: Subclusters should probably be parsed here, if they were implemented. 
**/ + } + + /** Did we find the requested source? **/ + if (search_data->SourceCluster == NULL) + { + /** Print error. **/ + mssErrorf(1, "Cluster", "Could not find cluster \"%s\" for search \"%s\".", source_cluster_name, search_data->Name); + + /** Attempt to give a hint. **/ + char* cluster_names[node_data->nClusterDatas]; + for (unsigned int i = 0; i < node_data->nClusterDatas; i++) + cluster_names[i] = node_data->ClusterDatas[i]->Name; + ci_TryHint(source_cluster_name, cluster_names, node_data->nClusterDatas); + + /** Fail. **/ + goto err_free_search; + } + + /** Get threshold attribute. **/ + if (ci_ParseAttribute(inf, "threshold", DATA_T_DOUBLE, POD(&search_data->Threshold), node_data->ParamList, true, true) != 0) goto err_free_search; + if (search_data->Threshold <= 0.0 || 1.0 <= search_data->Threshold) + { + mssErrorf(1, "Cluster", + "Invalid value for [threshold : 0.0 < x < 1.0 | \"none\"]: %g", + search_data->Threshold + ); + goto err_free_search; + } + + /** Get similarity measure. **/ + search_data->SimilarityMeasure = ci_ParseSimilarityMeasure(inf, node_data->ParamList); + if (search_data->SimilarityMeasure == SIMILARITY_NULL) goto err_free_search; + + /** Check for additional data to warn the user about. **/ + for (unsigned int i = 0u; i < inf->nSubInf; i++) + { + pStructInf sub_inf = check_ptr(inf->SubInf[i]); + ASSERTMAGIC(sub_inf, MGK_STRUCTINF); + char* name = sub_inf->Name; + + /** Handle various struct types. **/ + const int struct_type = stStructType(sub_inf); + switch (struct_type) + { + case ST_T_ATTRIB: + { + /** Valid attribute names. **/ + char* attrs[] = { + "source", + "threshold", + "similarity_measure", + }; + const unsigned int nattrs = sizeof(attrs) / sizeof(char*); + + /** Ignore valid attribute names. **/ + bool is_valid = false; + for (unsigned int i = 0u; i < nattrs; i++) + { + if (strcmp(name, attrs[i]) == 0) + { + is_valid = true; + break; + } + } + if (is_valid) continue; /* Next inf. 
*/ + + /** Give the user a warning, and attempt to give them a hint. **/ + fprintf(stderr, "Warning: Unknown attribute '%s' in search \"%s\".\n", name, inf->Name); + ci_TryHint(name, attrs, nattrs); + + break; + } + + case ST_T_SUBGROUP: + { + /** The spec does not specify any valid sub-groups for searches. **/ + char* group_type = check_ptr(sub_inf->UsrType); + if (group_type == NULL) goto err_free_search; + fprintf(stderr, + "Warning: Unknown group [\"%s\" : \"%s\"] in search \"%s\".\n", + name, group_type, inf->Name + ); + break; + } + + default: + { + mssErrorf(1, "Cluster", + "Warning: Unknown struct type %d in search \"%s\".", + struct_type, inf->Name + ); + goto err_free_search; + } + } + } + + /** Create cache entry key. **/ + char* source_key = search_data->SourceCluster->Key; + const size_t len = strlen(source_key) + strlen(search_data->Name) + 16lu; + char* key = check_ptr(nmSysMalloc(len * sizeof(char))); + if (key == NULL) goto err_free_search; + snprintf(key, len, "%s/%s?%g&%u", + source_key, + search_data->Name, + search_data->Threshold, + search_data->SimilarityMeasure + ); + pXHashTable search_cache = &ClusterDriverCaches.SearchDataCache; + + /** Check for a cached version. **/ + pSearchData search_maybe = (pSearchData)xhLookup(search_cache, key); + if (search_maybe != NULL) + { /* Cache hit. */ + + /** Free the parsed search that we no longer need. **/ + ci_FreeSearchData(search_data); + nmSysFree(key); + + /** Return the cached search. **/ + return search_maybe; + } + + /** Cache miss. **/ + check(xhAdd(search_cache, key, (void*)search_data)); + return search_data; + + /** Error cleanup. **/ + err_free_search: + ci_FreeSearchData(search_data); + + err: + mssErrorf(0, "Cluster", "Failed to parse SearchData from group \"%s\".", inf->Name); + return NULL; + } + + +// LINK #functions +/*** Allocates a new pNodeData struct from a parsed pStructInf. 
+ *** + *** @attention - Does not use caching directly, but uses subfunctions to + *** handle caching of substructures. + *** @attention - Promises that mssError() will be invoked on failure, so the + *** caller is not required to specify their own error message. + *** + *** @param inf A parsed pStructInf for the top level group in a .cluster + *** structure file. + *** @param parent The parent object struct. + *** @returns A new pNodeData struct on success, or NULL on failure. + ***/ +static pNodeData ci_ParseNodeData(pStructInf inf, pObject parent) + { + int ret; + char* path = check_ptr(ci_file_path(parent)); + if (path == NULL) goto err; + + /** Allocate node struct data. **/ + pNodeData node_data = check_ptr(nmMalloc(sizeof(NodeData))); + if (node_data == NULL) goto err; + memset(node_data, 0, sizeof(NodeData)); + node_data->Parent = parent; + + /** Set up param list. **/ + node_data->ParamList = check_ptr(expCreateParamList()); + if (node_data->ParamList == NULL) goto err; + node_data->ParamList->Session = check_ptr(parent->Session); + if (node_data->ParamList->Session == NULL) goto err; + ret = expAddParamToList(node_data->ParamList, "parameters", (void*)node_data, 0); + if (ret != 0) + { + mssErrorf(0, "Cluster", "Failed to add parameters to the param list scope (error code %d).", ret); + goto err_free_node; + } + + /** Set the param functions, defined later in the file. **/ + ret = expSetParamFunctions( + node_data->ParamList, + "parameters", + ci_GetParamType, + ci_GetParamValue, + ci_SetParamValue + ); + if (ret != 0) + { + mssErrorf(0, "Cluster", "Failed to set param functions (error code %d).", ret); + goto err_free_node; + } + + /** Detect relevant groups. 
**/ + XArray param_infs, cluster_infs, search_infs; + memset(¶m_infs, 0, sizeof(XArray)); + memset(&cluster_infs, 0, sizeof(XArray)); + memset(&search_infs, 0, sizeof(XArray)); + if (!check(xaInit(¶m_infs, 8))) goto err_free_arrs; + if (!check(xaInit(&cluster_infs, 8))) goto err_free_arrs; + if (!check(xaInit(&search_infs, 8))) goto err_free_arrs; + for (unsigned int i = 0u; i < inf->nSubInf; i++) + { + pStructInf sub_inf = check_ptr(inf->SubInf[i]); + ASSERTMAGIC(sub_inf, MGK_STRUCTINF); + char* name = sub_inf->Name; + + /** Handle various struct types. **/ + const int struct_type = stStructType(sub_inf); + switch (struct_type) + { + case ST_T_ATTRIB: + { + /** Valid attribute names. **/ + char* attrs[] = { + "source", + "key_attr", + "data_attr", + }; + const unsigned int nattrs = sizeof(attrs) / sizeof(char*); + + /** Ignore valid attribute names. **/ + bool is_valid = false; + for (unsigned int i = 0u; i < nattrs; i++) + { + if (strcmp(name, attrs[i]) == 0) + { + is_valid = true; + break; + } + } + if (is_valid) continue; /* Next inf. */ + + /** Give the user a warning, and attempt to give them a hint. **/ + fprintf(stderr, "Warning: Unknown attribute '%s' in cluster node \"%s\".\n", name, inf->Name); + ci_TryHint(name, attrs, nattrs); + + break; + } + + case ST_T_SUBGROUP: + { + /** The spec does not specify any valid sub-groups for searches. **/ + char* group_type = check_ptr(sub_inf->UsrType); + if (group_type == NULL) goto err_free_arrs; + if (strcmp(group_type, "cluster/parameter") == 0) + { + if (!check_neg(xaAddItem(¶m_infs, sub_inf))) + goto err_free_arrs; + } + else if (strcmp(group_type, "cluster/cluster") == 0) + { + if (!check_neg(xaAddItem(&cluster_infs, sub_inf))) + goto err_free_arrs; + } + else if (strcmp(group_type, "cluster/search") == 0) + { + if (!check_neg(xaAddItem(&search_infs, sub_inf))) + goto err_free_arrs; + } + else + { + /** Give the user a warning, and attempt to give them a hint. 
**/ + fprintf(stderr, + "Warning: Unknown group type \"%s\" on group \"%s\".\n", + group_type, sub_inf->Name + ); + ci_TryHint(group_type, (char*[]){ + "cluster/parameter", + "cluster/cluster", + "cluster/search", + NULL, + }, 0u); + } + break; + } + + default: + { + mssErrorf(1, "Cluster", + "Warning: Unknown struct type %d in search \"%s\".", + struct_type, inf->Name + ); + goto err_free_arrs; + } + } + } + + /** Extract OpenCtl for use below. **/ + bool has_provided_params = parent != NULL + && parent->Pathname != NULL + && parent->Pathname->OpenCtl != NULL + && parent->Pathname->OpenCtl[parent->SubPtr - 1] != NULL + && parent->Pathname->OpenCtl[parent->SubPtr - 1]->nSubInf > 0 + && parent->Pathname->OpenCtl[parent->SubPtr - 1]->SubInf != NULL; + int num_provided_params = (has_provided_params) ? parent->Pathname->OpenCtl[parent->SubPtr - 1]->nSubInf : 0; + pStruct* provided_params = (has_provided_params) ? parent->Pathname->OpenCtl[parent->SubPtr - 1]->SubInf : NULL; + + /** Iterate over each param in the structure file. **/ + node_data->nParams = param_infs.nItems; + const size_t params_size = node_data->nParams * sizeof(pParam); + node_data->Params = check_ptr(nmSysMalloc(params_size)); + if (node_data->Params == NULL) goto err_free_arrs; + memset(node_data->Params, 0, params_size); + for (unsigned int i = 0u; i < node_data->nParams; i++) + { + pParam param = paramCreateFromInf(param_infs.Items[i]); + if (param == NULL) + { + mssErrorf(0, "Cluster", + "Failed to create param from inf for param #%u: %s", + i, ((pStructInf)param_infs.Items[i])->Name + ); + goto err_free_arrs; + } + node_data->Params[i] = param; + + /** Check each provided param to see if the user provided value. **/ + for (unsigned int j = 0u; j < num_provided_params; j++) + { + pStruct provided_param = check_ptr(provided_params[j]); /* Failure ignored. */ + + /** If this provided param value isn't for the param, ignore it. 
**/ + if (strcmp(provided_param->Name, param->Name) != 0) continue; + + /** Matched! The user is providing a value for this param. **/ + ret = paramSetValueFromInfNe(param, provided_param, 0, node_data->ParamList, node_data->ParamList->Session); + if (ret != 0) + { + mssErrorf(0, "Cluster", + "Failed to set param value from struct info.\n" + " > Param #%u: %s\n" + " > Provided Param #%u: %n\n" + " > Error code: %d", + i, param->Name, + j, provided_param->Name, + ret + ); + goto err_free_arrs; + } + + /** Provided value successfully handled, we're done. **/ + break; + } + + /** Invoke param hints parsing. **/ + ret = paramEvalHints(param, node_data->ParamList, node_data->ParamList->Session); + if (ret != 0) + { + mssErrorf(0, "Cluster", + "Failed to evaluate parameter hints for parameter \"%s\" (error code %d).", + param->Name, ret + ); + goto err_free_arrs; + } + } + check(xaDeInit(¶m_infs)); /* Failure ignored. */ + param_infs.nAlloc = 0; + + /** Iterate over provided parameters and warn the user if they specified a parameter that does not exist. **/ + for (unsigned int i = 0u; i < num_provided_params; i++) + { + pStruct provided_param = check_ptr(provided_params[i]); /* Failure ignored. */ + char* provided_name = provided_param->Name; + + /** Look to see if this provided param actually exists for this driver instance. **/ + for (unsigned int j = 0u; j < node_data->nParams; j++) + if (strcmp(provided_name, node_data->Params[j]->Name) == 0) + goto next_provided_param; + + /** This param doesn't exist, warn the user and attempt to give them a hint. 
**/ + fprintf(stderr, "Warning: Unknown provided parameter '%s' for cluster file: %s.\n", provided_name, ci_file_name(parent)); + char** param_names = check_ptr(nmSysMalloc(node_data->nParams * sizeof(char*))); + for (unsigned int j = 0u; j < node_data->nParams; j++) + param_names[j] = node_data->Params[j]->Name; + ci_TryHint(provided_name, param_names, node_data->nParams); + nmSysFree(param_names); + + next_provided_param:; + } + + /** Parse source data. **/ + node_data->SourceData = ci_ParseSourceData(inf, node_data->ParamList, path); + if (node_data->SourceData == NULL) goto err_free_arrs; + + /** Parse each cluster. **/ + node_data->nClusterDatas = cluster_infs.nItems; + if (node_data->nClusterDatas > 0) + { + const size_t clusters_size = node_data->nClusterDatas * sizeof(pClusterData); + node_data->ClusterDatas = check_ptr(nmSysMalloc(clusters_size)); + if (node_data->ClusterDatas == NULL) goto err_free_arrs; + memset(node_data->ClusterDatas, 0, clusters_size); + for (unsigned int i = 0u; i < node_data->nClusterDatas; i++) + { + node_data->ClusterDatas[i] = ci_ParseClusterData(cluster_infs.Items[i], node_data); + if (node_data->ClusterDatas[i] == NULL) goto err_free_arrs; + } + } + else node_data->ClusterDatas = NULL; + check(xaDeInit(&cluster_infs)); /* Failure ignored. */ + cluster_infs.nAlloc = 0; + + /** Parse each search. 
**/ + node_data->nSearchDatas = search_infs.nItems; + if (node_data->nSearchDatas > 0) + { + const size_t searches_size = node_data->nSearchDatas * sizeof(pSearchData); + node_data->SearchDatas = check_ptr(nmSysMalloc(searches_size)); + if (node_data->SearchDatas == NULL) goto err_free_arrs; + memset(node_data->SearchDatas, 0, searches_size); + for (unsigned int i = 0u; i < node_data->nSearchDatas; i++) + { + node_data->SearchDatas[i] = ci_ParseSearchData(search_infs.Items[i], node_data); + if (node_data->SearchDatas[i] == NULL) goto err_free_arrs; + } + } + else node_data->SearchDatas = NULL; + check(xaDeInit(&search_infs)); /* Failure ignored. */ + search_infs.nAlloc = 0; + + /** Success. **/ + return node_data; + + err_free_arrs: + if (param_infs.nAlloc != 0) check(xaDeInit(¶m_infs)); /* Failure ignored. */ + if (cluster_infs.nAlloc != 0) check(xaDeInit(&cluster_infs)); /* Failure ignored. */ + if (search_infs.nAlloc != 0) check(xaDeInit(&search_infs)); /* Failure ignored. */ + + err_free_node: + ci_FreeNodeData(node_data); + + err: + mssErrorf(0, "Cluster", "Failed to parse node from group \"%s\" in file: %s", inf->Name, path); + return NULL; + } + + +/** ================ Freeing Functions ================ **/ +/** ANCHOR[id=freeing] **/ +// LINK #functions + +/** @param source_data A pSourceData struct, freed by this function. **/ +static void ci_FreeSourceData(pSourceData source_data) + { + /** Guard segfault. **/ + if (source_data == NULL) + { + fprintf(stderr, "Warning: Call to ci_FreeSourceData(NULL);\n"); + return; + } + + /** Free top level attributes, if they exist. 
**/ + if (source_data->Name != NULL) + { + nmSysFree(source_data->Name); + source_data->Name = NULL; + } + if (source_data->SourcePath != NULL) + { + nmSysFree(source_data->SourcePath); + source_data->SourcePath = NULL; + } + if (source_data->KeyAttr != NULL) + { + nmSysFree(source_data->KeyAttr); + source_data->KeyAttr = NULL; + } + if (source_data->NameAttr != NULL) + { + nmSysFree(source_data->NameAttr); + source_data->NameAttr = NULL; + } + + /** Free fetched data, if it exists. **/ + if (source_data->Strings != NULL) + { + for (unsigned int i = 0u; i < source_data->nVectors; i++) + { + nmSysFree(source_data->Strings[i]); + source_data->Strings[i] = NULL; + } + nmSysFree(source_data->Strings); + source_data->Strings = NULL; + } + + /** Free computed vectors, if they exist. **/ + if (source_data->Vectors != NULL) + { + for (unsigned int i = 0u; i < source_data->nVectors; i++) + { + ca_free_vector(source_data->Vectors[i]); + source_data->Vectors[i] = NULL; + } + nmSysFree(source_data->Vectors); + source_data->Vectors = NULL; + } + + /** Free the source data struct. **/ + nmFree(source_data, sizeof(SourceData)); + source_data = NULL; + } + + +// LINK #functions +/*** Free pClusterData struct with an option to recursively free subclusters. + *** + *** @param cluster_data The cluster data struct to free. + *** @param recursive Whether to recursively free subclusters. + ***/ +static void ci_FreeClusterData(pClusterData cluster_data, bool recursive) + { + /** Guard segfault. **/ + if (cluster_data == NULL) + { + fprintf(stderr, "Warning: Call to ci_FreeClusterData(NULL, %s);\n", (recursive) ? "true" : "false"); + return; + } + + /** Free attribute data. **/ + if (cluster_data->Name != NULL) + { + nmSysFree(cluster_data->Name); + cluster_data->Name = NULL; + } + + /** Free computed data, if it exists. 
**/ + if (cluster_data->Clusters != NULL) + { + for (unsigned int i = 0u; i < cluster_data->nClusters; i++) + { + pCluster cluster = &cluster_data->Clusters[i]; + nmSysFree(cluster->Strings); + nmSysFree(cluster->Vectors); + cluster->Strings = NULL; + cluster->Vectors = NULL; + } + nmSysFree(cluster_data->Clusters); + nmSysFree(cluster_data->Sims); + cluster_data->Clusters = NULL; + cluster_data->Sims = NULL; + } + + /** Free subclusters recursively. **/ + if (cluster_data->SubClusters != NULL) + { + if (recursive) + { + for (unsigned int i = 0u; i < cluster_data->nSubClusters; i++) + { + ci_FreeClusterData(cluster_data->SubClusters[i], recursive); + cluster_data->SubClusters[i] = NULL; + } + } + nmSysFree(cluster_data->SubClusters); + cluster_data->SubClusters = NULL; + } + + /** Free the cluster data struct. **/ + nmFree(cluster_data, sizeof(ClusterData)); + cluster_data = NULL; + } + + +// LINK #functions +/** @param search_data A pSearchData struct, freed by this function. **/ +static void ci_FreeSearchData(pSearchData search_data) + { + /** Guard segfault. **/ + if (search_data == NULL) + { + fprintf(stderr, "Warning: Call to ci_FreeSearchData(NULL);\n"); + return; + } + + /** Free attribute data. **/ + if (search_data->Name != NULL) + { + nmSysFree(search_data->Name); + search_data->Name = NULL; + } + + /** Free computed data. **/ + if (search_data->Dups != NULL) + { + for (unsigned int i = 0; i < search_data->nDups; i++) + { + nmFree(search_data->Dups[i], sizeof(Dup)); + search_data->Dups[i] = NULL; + } + nmSysFree(search_data->Dups); + search_data->Dups = NULL; + } + + /** Free the search data struct. **/ + nmFree(search_data, sizeof(SearchData)); + search_data = NULL; + } + + +// LINK #functions +/** @param node_data A pNodeData struct, freed by this function. **/ +static void ci_FreeNodeData(pNodeData node_data) + { + /** Guard segfault. 
**/ + if (node_data == NULL) + { + fprintf(stderr, "Warning: Call to ci_FreeNodeData(NULL);\n"); + return; + } + + /** Free parsed params, if they exist. **/ + if (node_data->Params != NULL) + { + for (unsigned int i = 0u; i < node_data->nParams; i++) + { + if (node_data->Params[i] == NULL) break; + paramFree(node_data->Params[i]); + node_data->Params[i] = NULL; + } + nmSysFree(node_data->Params); + node_data->Params = NULL; + } + if (node_data->ParamList != NULL) + { + expFreeParamList(node_data->ParamList); + node_data->ParamList = NULL; + } + + /** Free parsed clusters, if they exist. **/ + if (node_data->ClusterDatas != NULL) + { + /*** This data is cached, so we should NOT free it! The caching system + *** is responsible for the memory. We only need to free the array + *** holding our pointers to said cached memory. + ***/ + nmSysFree(node_data->ClusterDatas); + node_data->ClusterDatas = NULL; + } + + /** Free parsed searches, if they exist. **/ + if (node_data->SearchDatas != NULL) + { + /*** This data is cached, so we should NOT free it! The caching system + *** is responsible for the memory. We only need to free the array + *** holding our pointers to said cached memory. + ***/ + nmSysFree(node_data->SearchDatas); + node_data->SearchDatas = NULL; + } + + /** Free data source, if one exists. **/ + /*** Note: SourceData is freed last since other free functions may need to + *** access information from this structure when freeing data. + *** (For example, nVector which is used to determine the size of the + *** label struct in each cluster.) + ***/ + if (node_data->SourceData != NULL) + { + /*** This data is cached, so we should NOT free it! The caching system + *** is responsible for the memory. We only need to free the array + *** holding our pointers to said cached memory. + ***/ + node_data->SourceData = NULL; + } + + /** Free the node data. 
**/ + nmFree(node_data, sizeof(NodeData)); + node_data = NULL; + } + +/** Frees all data in caches for all cluster driver instances. **/ +static void ci_ClearCaches(void) + { + /*** Free caches in reverse of the order they are created in case + *** cached data relies on its source during the freeing process. + ***/ + check(xhClearKeySafe(&ClusterDriverCaches.SearchDataCache, ci_CacheFreeSearch, NULL)); /* Failure ignored. */ + check(xhClearKeySafe(&ClusterDriverCaches.ClusterDataCache, ci_CacheFreeCluster, NULL)); /* Failure ignored. */ + check(xhClearKeySafe(&ClusterDriverCaches.SourceDataCache, ci_CacheFreeSourceData, NULL)); /* Failure ignored. */ + } + + +/** ================ Deep Size Computation Functions ================ **/ +/** ANCHOR[id=sizing] **/ +// LINK #functions + +/*** Returns the deep size of a SourceData struct, including the size of all + *** allocated substructures. As far as I can tell, this is probably only + *** useful for cache management and debugging. + *** + *** Note that Key is ignored because it is a pointer to data managed by the + *** caching systems, so it is not technically part of the struct. + *** + *** @param source_data The source data struct to be queried. + *** @returns The size in bytes of the struct and all internal allocated data. + ***/ +static unsigned int ci_SizeOfSourceData(pSourceData source_data) + { + /** Guard segfault. 
**/ + if (source_data == NULL) + { + fprintf(stderr, "Warning: Call to ci_SizeOfSourceData(NULL);\n"); + return 0u; + } + + unsigned int size = 0u; + if (source_data->Name != NULL) size += strlen(source_data->Name) * sizeof(char); + if (source_data->SourcePath != NULL) size += strlen(source_data->SourcePath) * sizeof(char); + if (source_data->KeyAttr != NULL) size += strlen(source_data->KeyAttr) * sizeof(char); + if (source_data->NameAttr != NULL) size += strlen(source_data->NameAttr) * sizeof(char); + if (source_data->Strings != NULL) + { + for (unsigned int i = 0u; i < source_data->nVectors; i++) + size += strlen(source_data->Strings[i]) * sizeof(char); + size += source_data->nVectors * sizeof(char*); + } + if (source_data->Vectors != NULL) + { + for (unsigned int i = 0u; i < source_data->nVectors; i++) + size += ca_sparse_len(source_data->Vectors[i]) * sizeof(int); + size += source_data->nVectors * sizeof(pVector); + } + size += sizeof(SourceData); + return size; + } + + +// LINK #functions +/*** Returns the deep size of a ClusterData struct, including the size of all + *** allocated substructures. As far as I can tell, this is probably only + *** useful for cache management and debugging. + *** + *** Note that Key is ignored because it is a pointer to data managed by the + *** caching systems, so it is not technically part of the struct. + *** + *** @param cluster_data The cluster data struct to be queried. + *** @param recursive Whether to recursively free subclusters. + *** @returns The size in bytes of the struct and all internal allocated data. + ***/ +static unsigned int ci_SizeOfClusterData(pClusterData cluster_data, bool recursive) + { + /** Guard segfault. **/ + if (cluster_data == NULL) + { + fprintf(stderr, "Warning: Call to ci_SizeOfClusterData(NULL, %s);\n", (recursive) ? 
"true" : "false"); + return 0u; + } + + unsigned int size = 0u; + if (cluster_data->Name != NULL) size += strlen(cluster_data->Name) * sizeof(char); + if (cluster_data->Clusters != NULL) + { + const unsigned int nVectors = cluster_data->SourceData->nVectors; + for (unsigned int i = 0u; i < cluster_data->nClusters; i++) + size += cluster_data->Clusters[i].Size * (sizeof(char*) + sizeof(pVector)); + size += nVectors * (sizeof(Cluster) + sizeof(double)); + } + if (cluster_data->SubClusters != NULL) + { + if (recursive) + { + for (unsigned int i = 0u; i < cluster_data->nSubClusters; i++) + size += ci_SizeOfClusterData(cluster_data->SubClusters[i], recursive); + } + size += cluster_data->nSubClusters * sizeof(void*); + } + size += sizeof(ClusterData); + return size; + } + + +// LINK #functions +/*** Returns the deep size of a SearchData struct, including the size of all + *** allocated substructures. As far as I can tell, this is probably only + *** useful for cache management and debugging. + *** + *** Note that Key is ignored because it is a pointer to data managed by the + *** caching systems, so it is not technically part of the struct. + *** + *** @param search_data The search data struct to be queried. + *** @returns The size in bytes of the struct and all internal allocated data. + ***/ +static unsigned int ci_SizeOfSearchData(pSearchData search_data) + { + /** Guard segfault. 
**/ + if (search_data == NULL) + { + fprintf(stderr, "Warning: Call to ci_SizeOfSearchData(NULL);\n"); + return 0u; + } + + unsigned int size = 0u; + if (search_data->Name != NULL) size += strlen(search_data->Name) * sizeof(char); + if (search_data->Dups != NULL) size += search_data->nDups * (sizeof(void*) + sizeof(Dup)); + size += sizeof(SearchData); + return size; + } + + +/** ================ Computation Functions ================ **/ +/** ANCHOR[id=computation] **/ +// LINK #functions + +/*** Ensures that the source_data->Data has been fetched from the data source + *** and that source_data->nVectors has been computed from the fetched data. + *** + *** @attention - Promises that mssError() will be invoked on failure, so the + *** caller is not required to specify their own error message. + *** + *** @param source_data The pSourceData affected by the computation. + *** @param session The current session, used to open the data source. + *** @returns 0 if successful, or + *** -1 other value on failure. + ***/ +static int ci_ComputeSourceData(pSourceData source_data, pObjSession session) + { + /** If the vectors are already computed, we're done. **/ + if (source_data->Vectors != NULL) return 0; + + /** Time to play shoots-and-ladders in an error-handling jungle of gotos. **/ + bool successful = false; + int ret; + + /** Record the date and time. **/ + if (!check(objCurrentDate(&source_data->DateComputed))) goto end; + + /** Open the source path specified by the .cluster file. **/ + pObject obj = objOpen(session, source_data->SourcePath, OBJ_O_RDONLY, 0600, "system/directory"); + if (obj == NULL) + { + mssErrorf(0, "Cluster", + "Failed to open object driver:\n" + " > Attribute: ['%s':'%s' : String]\n" + " > Source Path: %s\n", + source_data->KeyAttr, source_data->NameAttr, + source_data->SourcePath + ); + goto end; + } + + /** Generate a "query" for retrieving data. 
**/ + pObjQuery query = objOpenQuery(obj, NULL, NULL, NULL, NULL, 0); + if (query == NULL) + { + mssErrorf(0, "Cluster", + "Failed to open query:\n" + " > Attribute: ['%s':'%s' : String]\n" + " > Source Path: %s\n" + " > Driver Used: %s\n", + source_data->KeyAttr, source_data->NameAttr, + source_data->SourcePath, + obj->Driver->Name + ); + goto end_close; + } + + /** Initialize an xarray to store the retrieved data. **/ + XArray key_xarray, data_xarray, vector_xarray; + memset(&key_xarray, 0, sizeof(XArray)); + memset(&data_xarray, 0, sizeof(XArray)); + memset(&vector_xarray, 0, sizeof(XArray)); + if (!check(xaInit(&key_xarray, 64))) goto end_close_query; + if (!check(xaInit(&data_xarray, 64))) goto end_free_data; + if (!check(xaInit(&vector_xarray, 64))) goto end_free_data; + + /** Fetch data and build vectors. **/ + while (true) + { + pObject entry = objQueryFetch(query, O_RDONLY); + if (entry == NULL) break; /* Done. */ + + /** Data value: Type checking. **/ + const int data_datatype = objGetAttrType(entry, source_data->NameAttr); + if (data_datatype == -1) + { + mssErrorf(0, "Cluster", + "Failed to get type for %uth entry:\n" + " > Attribute: ['%s':'%s' : String]\n" + " > Source Path: %s\n" + " > Driver Used: %s\n", + vector_xarray.nItems, + source_data->KeyAttr, source_data->NameAttr, + source_data->SourcePath, + obj->Driver->Name + ); + goto end_free_data; + } + if (data_datatype != DATA_T_STRING) + { + mssErrorf(1, "Cluster", + "Type for %uth entry was not a string:\n" + " > Attribute: ['%s':'%s' : %s]\n" + " > Source Path: %s\n" + " > Driver Used: %s\n", + vector_xarray.nItems, + source_data->KeyAttr, source_data->NameAttr, ci_TypeToStr(data_datatype), + source_data->SourcePath, + obj->Driver->Name + ); + goto end_free_data; + } + + /** Data value: Get value from database. 
**/ + char* data; + ret = objGetAttrValue(entry, source_data->NameAttr, DATA_T_STRING, POD(&data)); + if (ret != 0) + { + mssErrorf(0, "Cluster", + "Failed to value for %uth entry:\n" + " > Attribute: ['%s':'%s' : String]\n" + " > Source Path: %s\n" + " > Driver Used: %s\n" + " > Error code: %d\n", + vector_xarray.nItems, + source_data->KeyAttr, source_data->NameAttr, + source_data->SourcePath, + obj->Driver->Name, + ret + ); + goto end_free_data; + } + + /** Skip empty strings. **/ + if (strlen(data) == 0) + { + check(fflush(stdout)); /* Failure ignored. */ + continue; + } + + /** Convert the string to a vector. **/ + pVector vector = ca_build_vector(data); + if (vector == NULL) + { + mssErrorf(1, "Cluster", "Failed to build vectors for string \"%s\".", data); + successful = false; + goto end_free_data; + } + if (ca_is_empty(vector)) + { + mssErrorf(1, "Cluster", "Vector building for string \"%s\" produced no character pairs.", data); + successful = false; + goto end_free_data; + } + if (ca_has_no_pairs(vector)) + { + /** Skip pVector with no pairs. **/ + check(fflush(stdout)); /* Failure ignored. */ + ca_free_vector(vector); + continue; + } + + + /** Key value: Type checking. 
**/ + const int key_datatype = objGetAttrType(entry, source_data->KeyAttr); + if (key_datatype == -1) + { + mssErrorf(0, "Cluster", + "Failed to get type for key on %uth entry:\n" + " > Attribute: ['%s':'%s' : String]\n" + " > Source Path: %s\n" + " > Driver Used: %s\n", + vector_xarray.nItems, + source_data->KeyAttr, source_data->NameAttr, + source_data->SourcePath, + obj->Driver->Name + ); + goto end_free_data; + } + if (key_datatype != DATA_T_STRING) + { + mssErrorf(1, "Cluster", + "Type for key on %uth entry was not a string:\n" + " > Attribute: ['%s':'%s' : %s]\n" + " > Source Path: %s\n" + " > Driver Used: %s\n", + vector_xarray.nItems, + source_data->KeyAttr, source_data->NameAttr, ci_TypeToStr(key_datatype), + source_data->SourcePath, + obj->Driver->Name + ); + goto end_free_data; + } + + /** key value: Get value from database. **/ + char* key; + ret = objGetAttrValue(entry, source_data->KeyAttr, DATA_T_STRING, POD(&key)); + if (ret != 0) + { + mssErrorf(0, "Cluster", + "Failed to value for key on %uth entry:\n" + " > Attribute: ['%s':'%s' : String]\n" + " > Source Path: %s\n" + " > Driver Used: %s\n" + " > Error code: %d\n", + vector_xarray.nItems, + source_data->KeyAttr, source_data->NameAttr, + source_data->SourcePath, + obj->Driver->Name, + ret + ); + goto end_free_data; + } + + /** Store values. **/ + char* key_dup = check_ptr(nmSysStrdup(key)); + if (key_dup == NULL) goto end_free_data; + char* data_dup = check_ptr(nmSysStrdup(data)); + if (data_dup == NULL) goto end_free_data; + if (!check_neg(xaAddItem(&key_xarray, (void*)key_dup))) goto end_free_data; + if (!check_neg(xaAddItem(&data_xarray, (void*)data_dup))) goto end_free_data; + if (!check_neg(xaAddItem(&vector_xarray, (void*)vector))) goto end_free_data; + + /** Clean up. **/ + ret = objClose(entry); + if (ret != 0) + { + mssErrorf(0, "Cluster", "Failed to close object entry (error code %d).", ret); + // ret = ret; // Fall-through: Failure ignored. 
+ } + } + + source_data->nVectors = vector_xarray.nItems; + if (source_data->nVectors == 0) + { + mssErrorf(0, "Cluster", + "Data source path did not contain any valid data:\n" + " > Attribute: ['%s':'%s' : String]\n" + " > Source Path: %s\n" + " > Driver Used: %s\n", + vector_xarray.nItems, + source_data->KeyAttr, source_data->NameAttr, + source_data->SourcePath, + obj->Driver->Name + ); + } + + /** Trim and store: keys, data, and vectors. **/ + source_data->Keys = (char**)check_ptr(ci_xaToTrimmedArray(&key_xarray, 1)); + source_data->Strings = (char**)check_ptr(ci_xaToTrimmedArray(&data_xarray, 1)); + source_data->Vectors = (int**)check_ptr(ci_xaToTrimmedArray(&vector_xarray, 1)); + if (source_data->Keys == NULL) goto end_free_data; + if (source_data->Strings == NULL) goto end_free_data; + if (source_data->Vectors == NULL) goto end_free_data; + + /** Success. **/ + successful = true; + + end_free_data: + if (key_xarray.nAlloc != 0) + { + for (unsigned int i = 0u; i < vector_xarray.nItems; i++) + nmSysFree(key_xarray.Items[i]); + check(xaDeInit(&key_xarray)); /* Failure ignored. */ + } + if (data_xarray.nAlloc != 0) + { + for (unsigned int i = 0u; i < data_xarray.nItems; i++) + nmSysFree(data_xarray.Items[i]); + check(xaDeInit(&data_xarray)); /* Failure ignored. */ + } + if (vector_xarray.nAlloc != 0) + { + for (unsigned int i = 0u; i < vector_xarray.nItems; i++) + ca_free_vector(vector_xarray.Items[i]); + check(xaDeInit(&vector_xarray)); /* Failure ignored. */ + } + + end_close_query: + ret = objQueryClose(query); + if (ret != 0) + { + mssErrorf(0, "Cluster", "Failed to close query (error code %d).", ret); + // ret = ret; // Fall-through: Failure ignored. + } + + end_close: + ret = objClose(obj); + if (ret != 0) + { + mssErrorf(0, "Cluster", "Failed to close object driver (error code %d).", ret); + // ret = ret; // Fall-through: Failure ignored. + } + + end: + if (!successful) mssErrorf(0, "Cluster", "SourceData computation failed."); + return (successful) ? 
0 : -1; + } + + +// LINK #functions +/*** Ensures that cluster_data->Clusters has been computed, running the + *** specified clustering algorithm if necessary. + *** + *** @attention - Promises that mssError() will be invoked on failure, so the + *** caller is not required to specify their own error message. + *** + *** @param cluster_data The pClusterData affected by the computation. + *** @param node_data The current pNodeData, used to get vectors to cluster. + *** @returns 0 if successful, or + *** -1 on failure. + ***/ +static int ci_ComputeClusterData(pClusterData cluster_data, pNodeData node_data) + { + /** If the clusters are already computed, we're done. **/ + if (cluster_data->Clusters != NULL) return 0; + + /** Make source data available. **/ + pSourceData source_data = node_data->SourceData; + + /** We need the SourceData vectors to compute clusters. **/ + if (ci_ComputeSourceData(source_data, node_data->ParamList->Session) != 0) + { + mssErrorf(0, "Cluster", "ClusterData computation failed due to missing SourceData."); + goto err; + } + + /** Record the date and time. **/ + if (!check(objCurrentDate(&cluster_data->DateComputed))) goto err; + + /** Allocate and zero the cluster array and the per-vector similarity array. **/ + const size_t clusters_size = cluster_data->nClusters * sizeof(Cluster); + cluster_data->Clusters = check_ptr(nmSysMalloc(clusters_size)); + if (cluster_data->Clusters == NULL) goto err; + memset(cluster_data->Clusters, 0, clusters_size); + const size_t sims_size = source_data->nVectors * sizeof(double); + cluster_data->Sims = check_ptr(nmSysMalloc(sims_size)); + if (cluster_data->Sims == NULL) goto err_free_clusters; + memset(cluster_data->Sims, 0, sims_size); + + /** Execute clustering. **/ + switch (cluster_data->ClusterAlgorithm) + { + case ALGORITHM_NONE: + { + /** Put all the data into one cluster. 
**/ + pCluster first_cluster = &cluster_data->Clusters[0]; + first_cluster->Size = source_data->nVectors; + first_cluster->Strings = check_ptr(nmSysMalloc(source_data->nVectors * sizeof(char*))); + if (first_cluster->Strings == NULL) goto err_free_sims; + first_cluster->Vectors = check_ptr(nmSysMalloc(source_data->nVectors * sizeof(pVector))); + if (first_cluster->Vectors == NULL) goto err_free_sims; + memcpy(first_cluster->Strings, source_data->Strings, source_data->nVectors * sizeof(char*)); + memcpy(first_cluster->Vectors, source_data->Vectors, source_data->nVectors * sizeof(pVector)); + break; + } + + case ALGORITHM_SLIDING_WINDOW: + /** Computed in each search for efficiency. **/ + memset(cluster_data->Clusters, 0, clusters_size); + break; + + case ALGORITHM_KMEANS: + { + /** Check for unimplemented similarity measures. **/ + if (cluster_data->SimilarityMeasure != SIMILARITY_COSINE) + { + mssErrorf(1, "Cluster", + "The similarity measure \"%s\" is not implemented.", + ci_SimilarityMeasureToString(cluster_data->SimilarityMeasure) + ); + goto err_free_sims; + } + + /** Allocate lables. Note: kmeans does not require us to initialize them. **/ + const size_t lables_size = source_data->nVectors * sizeof(unsigned int); + unsigned int* labels = check_ptr(nmSysMalloc(lables_size)); + if (labels == NULL) goto err_free_sims; + + /** Run kmeans. **/ + const bool successful = check(ca_kmeans( + source_data->Vectors, + source_data->nVectors, + cluster_data->nClusters, + cluster_data->MaxIterations, + cluster_data->MinImprovement, + labels, + cluster_data->Sims + )); + if (!successful) goto err_free_sims; + + /** Convert the labels into clusters. **/ + + /** Allocate space for clusters. **/ + XArray indexes_in_cluster[cluster_data->nClusters]; + for (unsigned int i = 0u; i < cluster_data->nClusters; i++) + if (!check(xaInit(&indexes_in_cluster[i], 8))) goto err_free_sims; + + /** Iterate through each label and add the index of the specified cluster to the xArray. 
**/ + for (unsigned long long i = 0llu; i < source_data->nVectors; i++) + if (!check_neg(xaAddItem(&indexes_in_cluster[labels[i]], (void*)i))) { nmSysFree(labels); goto err_free_sims; } /* Release labels before bailing out. */ + nmSysFree(labels); /* Free unused data. */ + + /** Iterate through each cluster, store it, and free the xArray. NOTE(review): the index arrays are not released on the failure paths in this loop. **/ + for (unsigned int i = 0u; i < cluster_data->nClusters; i++) + { + pXArray indexes_in_this_cluster = &indexes_in_cluster[i]; + pCluster cluster = &cluster_data->Clusters[i]; + cluster->Size = indexes_in_this_cluster->nItems; + cluster->Strings = check_ptr(nmSysMalloc(cluster->Size * sizeof(char*))); + if (cluster->Strings == NULL) goto err_free_sims; + cluster->Vectors = check_ptr(nmSysMalloc(cluster->Size * sizeof(pVector))); + if (cluster->Vectors == NULL) goto err_free_sims; + for (unsigned int j = 0u; j < cluster->Size; j++) + { + const unsigned long long index = (unsigned long long)indexes_in_this_cluster->Items[j]; + cluster->Strings[j] = source_data->Strings[index]; + cluster->Vectors[j] = source_data->Vectors[index]; + } + check(xaDeInit(indexes_in_this_cluster)); /* Failure ignored. */ + } + + /** k-means is done. **/ + break; + } + + default: + mssErrorf(1, "Cluster", + "Clustering algorithm \"%s\" is not implemented.", + ci_ClusteringAlgorithmToString(cluster_data->ClusterAlgorithm) + ); + goto err_free_sims; /* Clusters and Sims are already allocated; free them and reset Clusters to NULL so a later call does not mistake this failed run for a computed result. */ + } + + /** Success. 
**/ + return 0; + + err_free_sims: + nmFree(cluster_data->Sims, sims_size); + cluster_data->Sims = NULL; + + err_free_clusters: + for (unsigned int i = 0u; i < cluster_data->nClusters; i++) + { + pCluster cluster = &cluster_data->Clusters[i]; + if (cluster->Strings != NULL) nmFree(cluster->Strings, cluster->Size * sizeof(char*)); + else break; + if (cluster->Vectors != NULL) nmFree(cluster->Vectors, cluster->Size * sizeof(pVector)); + else break; + } + nmFree(cluster_data->Clusters, clusters_size); + cluster_data->Clusters = NULL; + + err: + mssErrorf(0, "Cluster", "ClusterData computation failed for \"%s\".", cluster_data->Name); + return -1; + } + + +// LINK #functions +/*** Ensures that the search_data->Dups has been computed, running the a + *** search with the specified similarity measure if necessary. + *** + *** @attention - Promises that mssError() will be invoked on failure, so the + *** caller is not required to specify their own error message. + *** + *** @param cluster_data The pClusterData affected by the computation. + *** @param node_data The current pNodeData, used to get vectors to cluster. + *** @returns 0 if successful, or + *** -1 other value on failure. + ***/ +static int ci_ComputeSearchData(pSearchData search_data, pNodeData node_data) + { + int ret; + + /** If the clusters are already computed, we're done. **/ + if (search_data->Dups != NULL) return 0; + + /** We need the cluster data to be computed before we search it. **/ + pClusterData cluster_data = search_data->SourceCluster; + ret = ci_ComputeClusterData(cluster_data, node_data); + if (ret != 0) + { + mssErrorf(0, "Cluster", "SearchData computation failed due to missing clusters."); + goto err; + } + + /** Record the date and time. **/ + if (!check(objCurrentDate(&search_data->DateComputed))) goto err; + + /** Execute the search using the specified source and comparison function. 
**/ + pXArray dups = NULL, dups_temp = NULL; + switch (search_data->SimilarityMeasure) + { + case SIMILARITY_COSINE: + { + if (cluster_data->ClusterAlgorithm == ALGORITHM_SLIDING_WINDOW) + { + dups_temp = check_ptr(ca_sliding_search( + (void**)cluster_data->SourceData->Vectors, + cluster_data->SourceData->nVectors, + cluster_data->MaxIterations, /* Window size. */ + ca_cos_compare, + search_data->Threshold, + (void**)cluster_data->SourceData->Keys, + dups + )); + if (dups_temp == NULL) + { + mssErrorf(1, "Cluster", "Failed to compute sliding search with cosine similarity measure."); + goto err_free; + } + } + else + { + for (unsigned int i = 0u; i < cluster_data->nClusters; i++) + { + dups_temp = check_ptr(ca_complete_search( + (void**)cluster_data->Clusters[i].Vectors, + cluster_data->Clusters[i].Size, + ca_cos_compare, + search_data->Threshold, + (void**)cluster_data->SourceData->Keys, + dups + )); + if (dups_temp == NULL) + { + mssErrorf(1, "Cluster", "Failed to compute complete search with cosine similarity measure."); + goto err_free; + } + else dups = dups_temp; + } + } + break; + } + + case SIMILARITY_LEVENSHTEIN: + { + if (cluster_data->ClusterAlgorithm == ALGORITHM_SLIDING_WINDOW) + { + dups_temp = check_ptr(ca_sliding_search( + (void**)cluster_data->SourceData->Strings, /* Levenshtein compares the strings themselves, as the complete search for this measure does. */ + cluster_data->SourceData->nVectors, + cluster_data->MaxIterations, /* Window size. 
*/ + ca_lev_compare, + search_data->Threshold, + (void**)cluster_data->SourceData->Keys, + dups + )); + if (dups_temp == NULL) + { + mssErrorf(1, "Cluster", "Failed to compute sliding search with Levenstein similarity measure."); + goto err_free; + } + } + else + { + for (unsigned int i = 0u; i < cluster_data->nClusters; i++) + { + dups_temp = check_ptr(ca_complete_search( + (void**)cluster_data->Clusters[i].Strings, + cluster_data->Clusters[i].Size, + ca_lev_compare, + search_data->Threshold, + (void**)cluster_data->SourceData->Keys, + dups + )); + if (dups_temp == NULL) + { + mssErrorf(1, "Cluster", "Failed to compute complete search with Levenstein similarity measure."); + goto err_free; + } + else dups = dups_temp; + } + } + break; + } + + default: + mssErrorf(1, "Cluster", + "Unknown similarity meansure \"%s\".", + ci_SimilarityMeasureToString(search_data->SimilarityMeasure) + ); + goto err_free; + } + if (dups_temp == NULL) goto err_free; + else dups = dups_temp; + // fprintf(stderr, "Done searching, found %d dups.\n", dups->nItems); + + /** Store dups. **/ + search_data->nDups = dups->nItems; + search_data->Dups = (dups->nItems == 0) + ? check_ptr(nmSysMalloc(0)) + : ci_xaToTrimmedArray(dups, 2); + + /** Success. **/ + return 0; + + err_free: + if (dups != NULL) + { + for (unsigned int i = 0u; i < dups->nItems; i++) + { + if (dups->Items[i] != NULL) nmFree(dups->Items[i], sizeof(Dup)); + else break; + } + check(xaFree(dups)); /* Failure ignored. */ + } + + err: + mssErrorf(0, "Cluster", "SearchData computation failed for \"%s\".", search_data->Name); + return -1; + } + + +/** ================ Parameter Functions ================ **/ +/** ANCHOR[id=params] **/ +// LINK #functions + +/*** Get the type of a parameter. Intended for expSetParamFunctions(). + *** + *** @param inf_v Node data containing the list of paramenters. + *** @param attr_name The name of the requested paramenter. + *** @returns The datatype, see datatypes.h for a list of valid datatypes. 
+ *** + *** LINK ../../centrallix-lib/include/datatypes.h:72 + ***/ +static int ci_GetParamType(void* inf_v, const char* attr_name) + { + pNodeData node_data = (pNodeData)inf_v; + + /** Find the parameter. **/ + for (unsigned int i = 0; i < node_data->nParams; i++) + { + pParam param = node_data->Params[i]; + if (strcmp(param->Name, attr_name) != 0) continue; + + /** Parameter found. **/ + return (param->Value == NULL) ? DATA_T_UNAVAILABLE : param->Value->DataType; + } + + /** Parameter not found. **/ + return DATA_T_UNAVAILABLE; + } + + +// LINK #functions +/*** Get the value of a parameter. Intended for `expSetParamFunctions()`. + *** + *** @attention - Warning: If the retrieved value is `NULL`, the pObjectData + *** val is not updated, and the function returns 1, indicating `NULL`. + *** This is intended behavior, for consistency with other Centrallix + *** functions, so keep it in mind so you're not surprised. + *** + *** @param inf_v Node data containing the list of paramenters. + *** @param attr_name The name of the requested paramenter. + *** @param datatype The expected datatype of the parameter value. + *** See datatypes.h for a list of valid datatypes. + *** @param val A pointer to a location where a pointer to the requested + *** data should be stored. Typically, the caller creates a local variable + *** to store this pointer, then passes a pointer to that local variable + *** so that they will have a pointer to the data. + *** This buffer will not be modified unless the data is successfully + *** found. If a value other than 0 is returned, the buffer is not updated. + *** @returns 0 if successful, + *** 1 if the variable is null, + *** -1 if an error occurs. + *** + *** LINK ../../centrallix-lib/include/datatypes.h:72 + ***/ +static int ci_GetParamValue(void* inf_v, char* attr_name, int datatype, pObjData val) + { + pNodeData node_data = (pNodeData)inf_v; + + /** Find the parameter. 
**/ + for (unsigned int i = 0; i < node_data->nParams; i++) + { + pParam param = (pParam)node_data->Params[i]; + if (strcmp(param->Name, attr_name) != 0) continue; + + /** Parameter found. **/ + if (param->Value == NULL) return 1; + if (param->Value->Flags & DATA_TF_NULL) return 1; + if (param->Value->DataType != datatype) + { + mssErrorf(1, "Cluster", "Type mismatch accessing parameter '%s'.", param->Name); + return -1; + } + + /** Return param value. **/ + if (!check(objCopyData(&(param->Value->Data), val, datatype))) goto err; + return 0; + } + + err: + mssErrorf(1, "Cluster", + "Failed to get parameter ['%s' : %s]", + attr_name, ci_TypeToStr(datatype) + ); + return -1; + } + +// LINK #functions +/** Not implemented. **/ +static int ci_SetParamValue(void* inf_v, char* attr_name, int datatype, pObjData val) + { + mssErrorf(1, "Cluster", "SetParamValue() is not implemented because clusters are imutable."); + return -1; + } + + +/** ================ Driver functions ================ **/ +/** ANCHOR[id=driver] **/ +// LINK #functions + +/*** Opens a new cluster driver instance by parsing a `.cluster` file found + *** at the path provided in parent. + *** + *** @param parent The parent of the object to be openned, including useful + *** information such as the pathname, session, etc. + *** @param mask Driver permission mask (unused). + *** @param sys_type ? (unused) + *** @param usr_type The object system file type being openned. Should always + *** be "system/cluster" because this driver is only registered for that + *** type of file. + *** @param oxt The object system tree, similar to a kind of "scope" (unused). + *** + *** @returns A pDriverData struct representing a driver instance, or + *** NULL if an error occurs. + ***/ +void* clusterOpen(pObject parent, int mask, pContentType sys_type, char* usr_type, pObjTrxTree* oxt) + { + /** Update statistics. 
**/ + ClusterStatistics.OpenCalls++; + + /** If CREAT and EXCL are specified, exclusively create it, failing if the file already exists. **/ + pSnNode node_struct = NULL; + bool can_create = (parent->Mode & O_CREAT) && (parent->SubPtr == parent->Pathname->nElements); + if (can_create && (parent->Mode & O_EXCL)) + { + node_struct = snNewNode(parent->Prev, usr_type); + if (node_struct == NULL) + { + mssErrorf(0, "Cluster", "Failed to exclusively create new node struct."); + goto err; + } + } + + /** Read the node if it exists. **/ + if (node_struct == NULL) + node_struct = snReadNode(parent->Prev); + + /** If we can't read it, create it (if allowed). **/ + if (node_struct == NULL && can_create) + node_struct = snNewNode(parent->Prev, usr_type); + + /** If there still isn't a node, fail early. **/ + if (node_struct == NULL) + { + mssErrorf(0, "Cluster", "Failed to create node struct."); + goto err; + } + + /** Magic. **/ + ASSERTMAGIC(node_struct, MGK_STNODE); + ASSERTMAGIC(node_struct->Data, MGK_STRUCTINF); + + /** Parse node data from the node_struct. **/ + pNodeData node_data = ci_ParseNodeData(node_struct->Data, parent); + if (node_data == NULL) + { + mssErrorf(0, "Cluster", "Failed to parse structure file \"%s\".", ci_file_name(parent)); + goto err; + } + + /** Allocate driver instance data. **/ + pDriverData driver_data = check_ptr(nmMalloc(sizeof(DriverData))); + if (driver_data == NULL) goto err_free_node; + memset(driver_data, 0, sizeof(DriverData)); + driver_data->NodeData = node_data; + + /** Detect target from path. **/ + char* target_name = obj_internal_PathPart(parent->Pathname, parent->SubPtr + parent->SubCnt++, 1); + if (target_name == NULL) + { + /** Target found: Root **/ + driver_data->TargetType = TARGET_ROOT; + driver_data->TargetData = (void*)driver_data->NodeData->SourceData; + return (void*)driver_data; /* Success. */ + } + + /** Search clusters. 
**/ + for (unsigned int i = 0u; i < node_data->nClusterDatas; i++) + { + pClusterData cluster = node_data->ClusterDatas[i]; + if (strcmp(cluster->Name, target_name) != 0) continue; + + /** Target found: Cluster **/ + driver_data->TargetType = TARGET_CLUSTER; + + /** Check for sub-clusters in the path. **/ + while (true) + { + /** Decend one path part deeper into the path. **/ + const char* path_part = obj_internal_PathPart(parent->Pathname, parent->SubPtr + parent->SubCnt++, 1); + + /** If the path does not go any deeper, we're done. **/ + if (path_part == NULL) + { + driver_data->TargetData = (void*)cluster; + break; + } + + /** Need to go deeper: Search for the requested sub-cluster. **/ + for (unsigned int i = 0u; i < cluster->nSubClusters; i++) + { + pClusterData sub_cluster = cluster->SubClusters[i]; + if (strcmp(sub_cluster->Name, path_part) != 0) continue; + + /** Target found: Sub-cluster **/ + cluster = sub_cluster; + goto continue_descent; + } + + /** Path names sub-cluster that does not exist. **/ + mssErrorf(1, "Cluster", "Sub-cluster \"%s\" does not exist.", path_part); + goto err_free_node; + + continue_descent:; + } + return (void*)driver_data; /* Success. */ + } + + /** Search searches. **/ + for (unsigned int i = 0u; i < node_data->nSearchDatas; i++) + { + pSearchData search = node_data->SearchDatas[i]; + if (strcmp(search->Name, target_name) != 0) continue; + + /** Target found: Search **/ + driver_data->TargetType = TARGET_SEARCH; + driver_data->TargetData = (void*)search; + + /** Check for extra, invalid path parts. **/ + char* extra_data = obj_internal_PathPart(parent->Pathname, parent->SubPtr + parent->SubCnt++, 1); + if (extra_data != NULL) + { + mssErrorf(1, "Cluster", "Unknown path part %s.", extra_data); + goto err_free_node; + } + return (void*)driver_data; /* Success. */ + } + + /** We were unable to find the requested cluster or search. 
**/ + mssErrorf(1, "Cluster", "\"%s\" is not the name of a declared cluster or search.", target_name); + + /** Attempt to give a hint. **/ + { + const unsigned int n_targets = node_data->nClusterDatas + node_data->nSearchDatas; + char* target_names[n_targets]; + for (unsigned int i = 0u; i < node_data->nClusterDatas; i++) + target_names[i] = node_data->ClusterDatas[i]->Name; + for (unsigned int i = 0u; i < node_data->nSearchDatas; i++) + target_names[i + node_data->nClusterDatas] = node_data->SearchDatas[i]->Name; + ci_TryHint(target_name, target_names, n_targets); + } + + /** Error cleanup. **/ + err_free_node: + if (node_data != NULL) ci_FreeNodeData(node_data); + if (driver_data != NULL) nmFree(driver_data, sizeof(DriverData)); + + err: + mssErrorf(0, "Cluster", + "Failed to open cluster file \"%s\" at: %s", + ci_file_name(parent), ci_file_path(parent) + ); + return NULL; + } + + +// LINK #functions +/*** Close a cluster driver instance object, releasing any necessary memory + *** and closing any necessary underlying resources. However, most of that + *** data will be cached and won't be freed unless the cache is dropped. + *** + *** @param inf_v The affected driver instance. + *** @param oxt The object system tree, similar to a kind of "scope" (unused). + *** @returns 0, success. + ***/ +int clusterClose(void* inf_v, pObjTrxTree* oxt) + { + pDriverData driver_data = (pDriverData)inf_v; + ClusterStatistics.CloseCalls++; + + /** Entries are shallow copies so we shouldn't do a deep free. **/ + if (driver_data->TargetType == TARGET_CLUSTER_ENTRY + || driver_data->TargetType == TARGET_SEARCH_ENTRY) + { + nmFree(driver_data, sizeof(DriverData)); + return 0; + } + + /** Free the node data (which is held in cache). **/ + ci_FreeNodeData(driver_data->NodeData); + + /** Free driver data. 
**/ + nmFree(driver_data, sizeof(DriverData)); + + return 0; + } + + +// LINK #functions +/*** Opens a new query pointing to the first row of the data targetted by + *** the driver instance struct. The query has an internal index counter + *** that starts at the first row and increments as data is fetched. + *** + *** @param inf_v The driver instance to be queried. + *** @param query The query to use on this struct. This is assumed to be + *** handled elsewhere, so we don't read it here (unused). + *** @param oxt The object system tree, similar to a kind of "scope" (unused). + *** @returns The cluster query. + ***/ +void* clusterOpenQuery(void* inf_v, pObjQuery query, pObjTrxTree* oxt) + { + ClusterStatistics.OpenQueryCalls++; + pClusterQuery cluster_query = check_ptr(nmMalloc(sizeof(ClusterQuery))); + if (cluster_query == NULL) return NULL; + cluster_query->DriverData = (pDriverData)inf_v; + cluster_query->RowIndex = 0u; + return cluster_query; + } + + +// LINK #functions +/*** Get the next entry as an open driver instance object. + *** + *** @param qy_v A query instance, storing an internal index which is + *** incremented once that data has been fetched. + *** @param obj Unused. + *** @param mode Unused. + *** @param oxt Unused. + *** @returns pDriverData that is either a cluster entry or search entry, + *** pointing to a specific target index into the relevant data. + *** OR NULL, indicating that all data has been fetched. + ***/ +void* clusterQueryFetch(void* qy_v, pObject obj, int mode, pObjTrxTree* oxt) + { + int ret; + pClusterQuery cluster_query = (pClusterQuery)qy_v; + + /** Update statistics. **/ + ClusterStatistics.FetchCalls++; + + /** Ensure that the data being fetched exists and is computed. 
**/ + TargetType target_type = cluster_query->DriverData->TargetType, new_target_type; + unsigned int data_amount = 0u; + switch (target_type) + { + case TARGET_ROOT: + mssErrorf(1, "Cluster", "Querying the root node of a cluster file is not allowed."); + fprintf(stderr, " > Hint: Try / or /\n"); + goto err; + + case TARGET_CLUSTER: + { + new_target_type = TARGET_CLUSTER_ENTRY; + pClusterData target = (pClusterData)cluster_query->DriverData->TargetData; + ret = ci_ComputeClusterData(target, cluster_query->DriverData->NodeData); + if (ret != 0) + { + mssErrorf(0, "Cluster", "Failed to compute ClusterData for query."); + goto err; + } + data_amount = target->nClusters; + break; + } + + case TARGET_SEARCH: + { + new_target_type = TARGET_SEARCH_ENTRY; + pSearchData target = (pSearchData)cluster_query->DriverData->TargetData; + ret = ci_ComputeSearchData(target, cluster_query->DriverData->NodeData); + if (ret != 0) + { + mssErrorf(0, "Cluster", "Failed to compute SearchData for query."); + goto err; + } + data_amount = target->nDups; + break; + } + + case TARGET_CLUSTER_ENTRY: + case TARGET_SEARCH_ENTRY: + mssErrorf(1, "Cluster", "Querying a query result is not allowed."); + goto err; + + default: + mssErrorf(1, "Cluster", "Unknown target type %u.", target_type); + goto err; + } + + /** Check that the requested data exists, returning null if we've reached the end of the data. **/ + if (cluster_query->RowIndex >= data_amount) return NULL; + + /** Create the result struct. **/ + pDriverData driver_data = check_ptr(nmMalloc(sizeof(DriverData))); + if (driver_data == NULL) goto err; + memcpy(driver_data, cluster_query->DriverData, sizeof(DriverData)); + driver_data->TargetType = new_target_type; + driver_data->TargetIndex = cluster_query->RowIndex++; + + /** Success. 
**/ + return driver_data; + + err: + mssErrorf(0, "Cluster", "Failed to fetch query result."); + return NULL; + } + + +// LINK #functions +/*** Close a cluster query instance, releasing any necessary memory and + *** closing any necessary underlying resources. This does not close the + *** underlying driver instance, which must be closed with clusterClose(). + *** + *** @param qy_v The affected query instance. + *** @param oxt The object system tree, similar to a kind of "scope" (unused). + *** @returns 0, success. + ***/ +int clusterQueryClose(void* qy_v, pObjTrxTree* oxt) + { + nmFree(qy_v, sizeof(ClusterQuery)); + return 0; + } + + +// LINK #functions +/*** Get the type of a cluster driver instance attribute. + *** + *** @param inf_v The driver instance. + *** @param attr_name The name of the requested attribute. + *** @param oxt The object system tree, similar to a kind of "scope" (unused). + *** @returns The datatype, see datatypes.h for a list of valid datatypes. + *** + *** LINK ../../centrallix-lib/include/datatypes.h:72 + ***/ +int clusterGetAttrType(void* inf_v, char* attr_name, pObjTrxTree* oxt) + { + pDriverData driver_data = (pDriverData)inf_v; + + /** Update statistics. **/ + ClusterStatistics.GetTypeCalls++; + + /** Guard possible segfault. **/ + if (attr_name == NULL) + { + fprintf(stderr, "Warning: Call to clusterGetAttrType() with NULL attribute name.\n"); + return DATA_T_UNAVAILABLE; + } + + /** Performance shortcut for frequently requested attributes: key1, key2, and sim. **/ + if (attr_name[0] == 'k' || attr_name[0] == 's') goto handle_targets; + + /** Types for general attributes. 
**/ + if (strcmp(attr_name, "name") == 0 + || strcmp(attr_name, "annotation") == 0 + || strcmp(attr_name,"content_type") == 0 + || strcmp(attr_name, "inner_type") == 0 + || strcmp(attr_name,"outer_type") == 0) + return DATA_T_STRING; + if (strcmp(attr_name, "last_modification") == 0) + return DATA_T_DATETIME; + if ((strcmp(attr_name, "date_created") == 0 + || strcmp(attr_name, "date_computed") == 0) + && + (driver_data->TargetType == TARGET_CLUSTER + || driver_data->TargetType == TARGET_SEARCH)) + return DATA_T_DATETIME; + + /** Types for specific data targets. **/ + handle_targets: + switch (driver_data->TargetType) + { + case TARGET_ROOT: + if (strcmp(attr_name, "source") == 0 + || strcmp(attr_name, "data_attr") == 0 || strcmp(attr_name, "name_attr") == 0 /* name_attr is the name clusterGetAttrValue() serves; without it that value always fails the type check. */ + || strcmp(attr_name, "key_attr") == 0) + return DATA_T_STRING; + break; + + case TARGET_CLUSTER: + if (strcmp(attr_name, "algorithm") == 0 + || strcmp(attr_name, "similarity_measure") == 0) + return DATA_T_STRING; + if (strcmp(attr_name, "num_clusters") == 0 + || strcmp(attr_name, "max_iterations") == 0) + return DATA_T_INTEGER; + if (strcmp(attr_name, "min_improvement") == 0) + return DATA_T_DOUBLE; + break; + + case TARGET_SEARCH: + if (strcmp(attr_name, "source") == 0 + || strcmp(attr_name, "similarity_measure") == 0) + return DATA_T_STRING; + if (strcmp(attr_name, "threshold") == 0) + return DATA_T_DOUBLE; + break; + + case TARGET_CLUSTER_ENTRY: + if (strcmp(attr_name, "items") == 0) + return DATA_T_STRINGVEC; + break; + + case TARGET_SEARCH_ENTRY: + if (strcmp(attr_name, "key1") == 0 + || strcmp(attr_name, "key2") == 0) + return DATA_T_STRING; + if (strcmp(attr_name, "sim") == 0) + return DATA_T_DOUBLE; + break; + + default: + mssErrorf(1, "Cluster", "Unknown target type %u.", driver_data->TargetType); + return DATA_T_UNAVAILABLE; + } + + return DATA_T_UNAVAILABLE; + } + + +// LINK #functions +/*** Get the value of a cluster driver instance attribute. + *** + *** @param inf_v The driver instance to be read. 
+ *** @param attr_name The name of the requested attribute. + *** @param datatype The expected datatype of the attribute value. + *** See datatypes.h for a list of valid datatypes. + *** @param oxt The object system tree, similar to a kind of "scope" (unused). + *** @param val A pointer to a location where a pointer to the requested + *** data should be stored. Typically, the caller creates a local variable + *** to store this pointer, then passes a pointer to that local variable + *** so that they will have a pointer to the data. + *** This buffer will not be modified unless the data is successfully + *** found. If a value other than 0 is returned, the buffer is not updated. + *** @returns 0 if successful, + *** -1 if an error occurs. + *** + *** LINK ../../centrallix-lib/include/datatypes.h:72 + ***/ +int clusterGetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val, pObjTrxTree* oxt) + { + pDriverData driver_data = (pDriverData)inf_v; + ClusterStatistics.GetValCalls++; + + /** Guard possible segfault. **/ + if (attr_name == NULL) + { + fprintf(stderr, "Warning: Call to clusterGetAttrType() with NULL attribute name.\n"); + return DATA_T_UNAVAILABLE; + } + + /** Performance shortcut for frequently requested attributes: key1, key2, and sim. **/ + if ((attr_name[0] == 'k' && datatype == DATA_T_STRING) /* key1, key2 : string */ + || (attr_name[0] == 's' && datatype == DATA_T_DOUBLE) /* sim : double */ + ) goto handle_targets; + + /** Type check. **/ + const int expected_datatype = clusterGetAttrType(inf_v, attr_name, NULL); + if (datatype != expected_datatype) + { + mssErrorf(1, "Cluster", + "Type mismatch: Accessing attribute ['%s' : %s] as type %s.", + attr_name, ci_TypeToStr(expected_datatype), ci_TypeToStr(datatype) + ); + return -1; + } + + /** Handle name and annotation. 
**/ + if (strcmp(attr_name, "name") == 0) + { + ClusterStatistics.GetValCalls_name++; + switch (driver_data->TargetType) + { + case TARGET_ROOT: + val->String = ((pSourceData)driver_data->TargetData)->Name; + break; + + case TARGET_CLUSTER: + case TARGET_CLUSTER_ENTRY: + val->String = ((pClusterData)driver_data->TargetData)->Name; + break; + + case TARGET_SEARCH: + case TARGET_SEARCH_ENTRY: + val->String = ((pSearchData)driver_data->TargetData)->Name; + break; + + default: + mssErrorf(1, "Cluster", "Unknown target type %u.", driver_data->TargetType); + return -1; + } + + return 0; + } + if (strcmp(attr_name, "annotation") == 0) + { + switch (driver_data->TargetType) + { + case TARGET_ROOT: val->String = "Clustering driver."; break; + case TARGET_CLUSTER: val->String = "Clustering driver: Cluster."; break; + case TARGET_CLUSTER_ENTRY: val->String = "Clustering driver: Cluster Entry."; break; + case TARGET_SEARCH: val->String = "Clustering driver: Search."; break; + case TARGET_SEARCH_ENTRY: val->String = "Clustering driver: Search Entry."; break; + + default: + mssErrorf(1, "Cluster", "Unknown target type %u.", driver_data->TargetType); + return -1; + } + return 0; + } + + /** Return the appropriate types. **/ + if (strcmp(attr_name, "outer_type") == 0) + { + val->String = "system/row"; + return 0; + } + if (strcmp(attr_name, "content_type") == 0 + || strcmp(attr_name, "inner_type") == 0) + { + val->String = "system/void"; + return 0; + } + + /** Last modification is not implemented yet. **/ + if (strcmp(attr_name, "last_modification") == 0) return 1; /* null */ + + /** Handle creation and computation dates. **/ + if (strcmp(attr_name, "date_created") == 0) + { + switch (driver_data->TargetType) + { + case TARGET_ROOT: + case TARGET_CLUSTER_ENTRY: + case TARGET_SEARCH_ENTRY: + /** Attribute is not defined for this target type. 
**/ + return -1; + + case TARGET_CLUSTER: + val->DateTime = &((pClusterData)driver_data->TargetData)->DateCreated; + return 0; + + case TARGET_SEARCH: + val->DateTime = &((pSearchData)driver_data->TargetData)->DateCreated; + return 0; + } + return -1; + } + if (strcmp(attr_name, "date_computed") == 0) + { + switch (driver_data->TargetType) + { + case TARGET_ROOT: + case TARGET_CLUSTER_ENTRY: + case TARGET_SEARCH_ENTRY: + /** Attribute is not defined for this target type. **/ + return -1; + + case TARGET_CLUSTER: + { + pClusterData target = (pClusterData)driver_data->TargetData; + pDateTime date_time = &target->DateComputed; + if (date_time->Value == 0) return 1; /* null */ + else val->DateTime = date_time; + return 0; + } + + case TARGET_SEARCH: + { + pSearchData target = (pSearchData)driver_data->TargetData; + pDateTime date_time = &target->DateComputed; + if (date_time->Value == 0) return 1; /* null */ + else val->DateTime = date_time; + return 0; + } + } + + /** Default: Unknown type. **/ + mssErrorf(1, "Cluster", "Unknown target type %u.", driver_data->TargetType); + return -1; + } + + /** Handle attributes for specific data targets. 
**/ + handle_targets: + switch (driver_data->TargetType) + { + case TARGET_ROOT: + if (strcmp(attr_name, "source") == 0) + { + val->String = ((pSourceData)driver_data->TargetData)->SourcePath; + return 0; + } + if (strcmp(attr_name, "key_attr") == 0) + { + val->String = ((pSourceData)driver_data->TargetData)->KeyAttr; + return 0; + } + if (strcmp(attr_name, "name_attr") == 0) + { + val->String = ((pSourceData)driver_data->TargetData)->NameAttr; + return 0; + } + break; + + case TARGET_CLUSTER: + { + pClusterData target = (pClusterData)driver_data->TargetData; + + if (strcmp(attr_name, "algorithm") == 0) + { + val->String = ci_ClusteringAlgorithmToString(target->ClusterAlgorithm); + return 0; + } + if (strcmp(attr_name, "similarity_measure") == 0) + { + val->String = ci_SimilarityMeasureToString(target->SimilarityMeasure); + return 0; + } + if (strcmp(attr_name, "num_clusters") == 0) + { + if (target->nClusters > INT_MAX) + fprintf(stderr, "Warning: 'num_clusters' value of %u exceeds INT_MAX (%d).\n", target->nClusters, INT_MAX); + val->Integer = (int)target->nClusters; + return 0; + } + if (strcmp(attr_name, "max_iterations") == 0) + { + if (target->MaxIterations > INT_MAX) + fprintf(stderr, "Warning: 'max_iterations' value of %u exceeds INT_MAX (%d).\n", target->MaxIterations, INT_MAX); + val->Integer = (int)target->MaxIterations; + return 0; + } + if (strcmp(attr_name, "min_improvement") == 0) + { + val->Double = target->MinImprovement; + return 0; + } + break; + } + + case TARGET_SEARCH: + { + pSearchData target = (pSearchData)driver_data->TargetData; + + if (strcmp(attr_name, "source") == 0) + { + val->String = target->SourceCluster->Name; + return 0; + } + if (strcmp(attr_name, "similarity_measure") == 0) + { + val->String = ci_SimilarityMeasureToString(target->SimilarityMeasure); + return 0; + } + if (strcmp(attr_name, "threshold") == 0) + { + val->Double = target->Threshold; + return 0; + } + } + break; /* An unmatched search attribute must not fall through into the cluster-entry case, which reads TargetData as cluster data. */ + case TARGET_CLUSTER_ENTRY: + { + pClusterData target = 
(pClusterData)driver_data->TargetData; + pCluster target_cluster = &target->Clusters[driver_data->TargetIndex]; + + if (strcmp(attr_name, "items") == 0) + { + /** Static variable to prevent leaking StringVec from previous calls. **/ + static StringVec* vec = NULL; + if (vec != NULL) nmFree(vec, sizeof(StringVec)); + + /** Allocate and initialize the requested data. **/ + val->StringVec = vec = check_ptr(nmMalloc(sizeof(StringVec))); + if (val->StringVec == NULL) return -1; + val->StringVec->nStrings = target_cluster->Size; + val->StringVec->Strings = target_cluster->Strings; + + /** Success. **/ + return 0; + } + break; + } + + case TARGET_SEARCH_ENTRY: + { + pSearchData target = (pSearchData)driver_data->TargetData; + pDup target_dup = target->Dups[driver_data->TargetIndex]; + + if (strcmp(attr_name, "sim") == 0) + { + ClusterStatistics.GetValCalls_sim++; + val->Double = target_dup->similarity; + return 0; + } + if (strcmp(attr_name, "key1") == 0) + { + ClusterStatistics.GetValCalls_key1++; + val->String = target_dup->key1; + return 0; + } + if (strcmp(attr_name, "key2") == 0) + { + ClusterStatistics.GetValCalls_key2++; + val->String = target_dup->key2; + return 0; + } + break; + } + + default: + mssErrorf(1, "Cluster", "Unknown target type %u.", driver_data->TargetType); + return -1; + } + + /** Unknown attribute. **/ + char* name; + clusterGetAttrValue(inf_v, "name", DATA_T_STRING, POD(&name), NULL); + mssErrorf(1, "Cluster", + "Unknown attribute '%s' for cluster object %s (target type: %u, \"%s\").", + attr_name, driver_data->NodeData->SourceData->Name, driver_data->TargetType, name + ); + + return -1; + } + + +// LINK #functions +/*** Create a new presentation hints object, describing this attribute on the + *** provided cluster driver instance. + *** + *** Note: Failures from nmSysStrdup() and several others are ignored because + *** the worst case scenario is that the attributes are set to null, which + *** will cause them to be ignored. 
I consider that to be better than
+ *** throwing an error that could unnecessarily disrupt normal usage.
+ ***
+ *** @param inf_v The driver instance to be read.
+ *** @param attr_name The name of the requested attribute.
+ *** @param oxt The object system tree, similar to a kind of "scope" (unused).
+ *** @returns A presentation hints object, if successful,
+ ***     NULL if an error occurs.
+ ***/
+pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pObjTrxTree* oxt)
+    {
+    pDriverData driver_data = (pDriverData)inf_v;
+
+    /*** Initialized before the first `goto err` so that the cleanup and
+     *** error-reporting code never reads an indeterminate value.
+     ***/
+    pParamObjects tmp_list = NULL;
+    char* name = NULL;
+
+    /** Malloc presentation hints struct. **/
+    pObjPresentationHints hints = check_ptr(nmMalloc(sizeof(ObjPresentationHints)));
+    if (hints == NULL) goto err;
+    memset(hints, 0, sizeof(ObjPresentationHints));
+
+    /** Hints that are the same for all attributes. **/
+    hints->GroupID = -1;
+    hints->VisualLength2 = 1;
+    hints->Style |= OBJ_PH_STYLE_READONLY | OBJ_PH_STYLE_CREATEONLY | OBJ_PH_STYLE_NOTNULL;
+    hints->StyleMask |= OBJ_PH_STYLE_READONLY | OBJ_PH_STYLE_CREATEONLY | OBJ_PH_STYLE_NOTNULL;
+
+    /** Temporary param list for compiling expressions. **/
+    tmp_list = check_ptr(expCreateParamList());
+    if (tmp_list == NULL) goto err;
+
+    /** Search for the requested attribute through attributes common to all instances. **/
+    if (strcmp(attr_name, "name") == 0)
+        {
+        hints->Length = 32;
+        hints->VisualLength = 16;
+        goto success;
+        }
+    if (strcmp(attr_name, "annotation") == 0)
+        {
+        hints->Length = 36;
+        hints->VisualLength = 36;
+        goto success;
+        }
+    if (strcmp(attr_name, "inner_type") == 0
+        || strcmp(attr_name, "outer_type") == 0
+        || strcmp(attr_name, "content_type") == 0
+        || strcmp(attr_name, "last_modification") == 0)
+        {
+        hints->VisualLength = 30;
+        goto success;
+        }
+
+    /** Handle date created and date computed. **/
+    if (strcmp(attr_name, "date_created") == 0
+        || strcmp(attr_name, "date_computed") == 0)
+        {
+        if (driver_data->TargetType == TARGET_CLUSTER || driver_data->TargetType == TARGET_SEARCH)
+            {
+            hints->Length = 24;
+            hints->VisualLength = 20;
+            hints->Format = check_ptr(nmSysStrdup("datetime")); /* Failure ignored. */
+            goto success;
+            }
+        else goto unknown_attribute;
+        }
+
+    /** Search by target type. **/
+    switch (driver_data->TargetType)
+        {
+        case TARGET_ROOT:
+            if (strcmp(attr_name, "source") == 0)
+                {
+                /*** _PC_PATH_MAX is a selector for pathconf(), not a length,
+                 *** so use the real path length limit instead.
+                 ***/
+#ifdef PATH_MAX
+                hints->Length = PATH_MAX;
+#else
+                hints->Length = 4096;
+#endif
+                hints->VisualLength = 64;
+                hints->FriendlyName = check_ptr(nmSysStrdup("Source Path")); /* Failure ignored. */
+                goto success;
+                }
+            if (strcmp(attr_name, "key_attr") == 0)
+                {
+                hints->Length = 255;
+                hints->VisualLength = 32;
+                hints->FriendlyName = check_ptr(nmSysStrdup("Key Attribute Name")); /* Failure ignored. */
+                goto success;
+                }
+            /*** clusterGetAttrValue() exposes this attribute as "name_attr";
+             *** "data_attr" is still accepted for backward compatibility.
+             *** NOTE(review): confirm which of the two names is canonical.
+             ***/
+            if (strcmp(attr_name, "data_attr") == 0
+                || strcmp(attr_name, "name_attr") == 0)
+                {
+                hints->Length = 255;
+                hints->VisualLength = 32;
+                hints->FriendlyName = check_ptr(nmSysStrdup("Data Attribute Name")); /* Failure ignored. */
+                goto success;
+                }
+            break;
+
+        case TARGET_CLUSTER:
+            if (strcmp(attr_name, "num_clusters") == 0)
+                {
+                /** Min and max values. **/
+                hints->MinValue = expCompileExpression("2", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0);
+                hints->MaxValue = expCompileExpression("2147483647", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0);
+
+                /** Other hints. **/
+                hints->Length = 8;
+                hints->VisualLength = 4;
+                hints->FriendlyName = check_ptr(nmSysStrdup("Number of Clusters")); /* Failure ignored. */
+                goto success;
+                }
+            if (strcmp(attr_name, "min_improvement") == 0)
+                {
+                /** Default, min, and max values. **/
+                hints->DefaultExpr = expCompileExpression("0.0001", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0);
+                hints->MinValue = expCompileExpression("0.0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0);
+                hints->MaxValue = expCompileExpression("1.0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0);
+
+                /** Other hints. **/
+                hints->Length = 16;
+                hints->VisualLength = 8;
+                hints->FriendlyName = check_ptr(nmSysStrdup("Minimum Improvement Threshold")); /* Failure ignored. */
+                goto success;
+                }
+            if (strcmp(attr_name, "max_iterations") == 0)
+                {
+                /** Default, min, and max values. **/
+                hints->DefaultExpr = expCompileExpression("64", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0);
+                hints->MinValue = expCompileExpression("0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0);
+                hints->MaxValue = expCompileExpression("2147483647", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0);
+
+                /** Other hints. **/
+                hints->Length = 8;
+                hints->VisualLength = 4;
+                hints->FriendlyName = check_ptr(nmSysStrdup("Maximum Iterations")); /* Failure ignored. */
+                goto success;
+                }
+            if (strcmp(attr_name, "algorithm") == 0)
+                {
+                /** Enum values. **/
+                check(xaInit(&(hints->EnumList), nClusteringAlgorithms)); /* Failure ignored. */
+                for (unsigned int i = 0u; i < nClusteringAlgorithms; i++)
+                    check_neg(xaAddItem(&(hints->EnumList), &ALL_CLUSTERING_ALGORITHMS[i])); /* Failure ignored. */
+
+                /** Min and max values. **/
+                hints->MinValue = expCompileExpression("0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0);
+                char buf[8];
+                snprintf(buf, sizeof(buf), "%u", (unsigned int)nClusteringAlgorithms);
+                hints->MaxValue = expCompileExpression(buf, tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0);
+
+                /** Display flags. **/
+                hints->Style |= OBJ_PH_STYLE_BUTTONS;
+                hints->StyleMask |= OBJ_PH_STYLE_BUTTONS;
+
+                /** Other hints. **/
+                hints->Length = 24;
+                hints->VisualLength = 20;
+                hints->FriendlyName = check_ptr(nmSysStrdup("Clustering Algorithm")); /* Failure ignored. */
+                goto success;
+                }
+            /** Fall-through: Start of overlapping region. **/
+
+        case TARGET_SEARCH:
+            if (strcmp(attr_name, "similarity_measure") == 0)
+                {
+                /** Enum values. **/
+                check(xaInit(&(hints->EnumList), nSimilarityMeasures)); /* Failure ignored. */
+                for (unsigned int i = 0u; i < nSimilarityMeasures; i++)
+                    check_neg(xaAddItem(&(hints->EnumList), &ALL_SIMILARITY_MEASURES[i])); /* Failure ignored. */
+
+                /** Display flags. **/
+                hints->Style |= OBJ_PH_STYLE_BUTTONS;
+                hints->StyleMask |= OBJ_PH_STYLE_BUTTONS;
+
+                /** Min and max values. **/
+                hints->MinValue = expCompileExpression("0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0);
+                char buf[8];
+                snprintf(buf, sizeof(buf), "%u", (unsigned int)nSimilarityMeasures);
+                hints->MaxValue = expCompileExpression(buf, tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0);
+
+                /** Other hints. **/
+                hints->Length = 32;
+                hints->VisualLength = 20;
+                hints->FriendlyName = check_ptr(nmSysStrdup("Similarity Measure")); /* Failure ignored. */
+                goto success;
+                }
+
+            /** End of overlapping region. **/
+            if (driver_data->TargetType == TARGET_CLUSTER) break;
+
+            if (strcmp(attr_name, "source") == 0)
+                {
+                hints->Length = 64;
+                hints->VisualLength = 32;
+                hints->FriendlyName = check_ptr(nmSysStrdup("Source Cluster Name")); /* Failure ignored. */
+                goto success;
+                }
+            if (strcmp(attr_name, "threshold") == 0)
+                {
+                /** Min and max values. **/
+                hints->MinValue = expCompileExpression("0.0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0);
+                hints->MaxValue = expCompileExpression("1.0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0);
+
+                /** Other hints. **/
+                hints->Length = 16;
+                hints->VisualLength = 8;
+                hints->FriendlyName = check_ptr(nmSysStrdup("Similarity Threshold")); /* Failure ignored. */
+                goto success;
+                }
+            break;
+
+        case TARGET_CLUSTER_ENTRY:
+            {
+            pClusterData target = (pClusterData)check_ptr(driver_data->TargetData);
+            if (target == NULL) goto err;
+
+            if (strcmp(attr_name, "items") == 0)
+                {
+                /** Other hints. **/
+                hints->Length = 65536;
+                hints->VisualLength = 256;
+                hints->FriendlyName = check_ptr(nmSysStrdup("Cluster Data")); /* Failure ignored. */
+                goto success;
+                }
+            if (strcmp(attr_name, "sim") == 0)
+                {
+                /** Min and max values. **/
+                hints->MinValue = expCompileExpression("0.0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0);
+                hints->MaxValue = expCompileExpression("1.0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0);
+
+                /** Other hints. **/
+                hints->Length = 16;
+                hints->VisualLength = 8;
+                hints->FriendlyName = check_ptr(nmSysStrdup("Similarity")); /* Failure ignored. */
+                goto success;
+                }
+            break;
+            }
+
+        case TARGET_SEARCH_ENTRY:
+            {
+            pSearchData target = (pSearchData)check_ptr(driver_data->TargetData);
+            if (target == NULL) goto err;
+
+            if (strcmp(attr_name, "key1") == 0)
+                {
+                hints->Length = 255;
+                hints->VisualLength = 32;
+                hints->FriendlyName = check_ptr(nmSysStrdup("Key 1")); /* Failure ignored. */
+                goto success;
+                }
+            if (strcmp(attr_name, "key2") == 0)
+                {
+                hints->Length = 255;
+                hints->VisualLength = 32;
+                hints->FriendlyName = check_ptr(nmSysStrdup("Key 2")); /* Failure ignored. */
+                goto success;
+                }
+            if (strcmp(attr_name, "sim") == 0)
+                {
+                /** Min and max values. **/
+                hints->MinValue = expCompileExpression("0.0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0);
+                hints->MaxValue = expCompileExpression("1.0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0);
+
+                /** Other hints. **/
+                hints->Length = 16;
+                hints->VisualLength = 8;
+                hints->FriendlyName = check_ptr(nmSysStrdup("Similarity")); /* Failure ignored. */
+                goto success;
+                }
+            break;
+            }
+
+        default:
+            mssErrorf(1, "Cluster", "Unknown target type %u.", driver_data->TargetType);
+            goto err;
+        }
+
+    /** Unknown attribute. **/
+    unknown_attribute:
+    /** If the name lookup stores nothing, `name` stays NULL and a placeholder is printed. **/
+    check(clusterGetAttrValue(inf_v, "name", DATA_T_STRING, POD(&name), NULL)); /* Failure ignored. */
+    mssErrorf(1, "Cluster",
+        "Unknown attribute '%s' for cluster object %s (target type: %u, \"%s\").",
+        attr_name, driver_data->NodeData->SourceData->Name, driver_data->TargetType,
+        (name != NULL) ? name : "(unknown)"
+    );
+
+    /** Error cleanup. **/
+    err:
+    if (tmp_list != NULL) check(expFreeParamList(tmp_list)); /* Failure ignored. */
+    if (hints != NULL) nmFree(hints, sizeof(ObjPresentationHints));
+    mssErrorf(0, "Cluster", "Failed execute generate presentation hints.");
+    return NULL;
+
+    /** Success. **/
+    success:
+    check(expFreeParamList(tmp_list)); /* Failure ignored. */
+    return hints;
+    }
+
+
+// LINK #functions
+/*** Returns the name of the first attribute that one can get from
+ *** this driver instance (using GetAttrType() and GetAttrValue()).
+ *** Resets the internal variable (TargetAttrIndex) used to maintain
+ *** iteration state for clusterGetNextAttr().
+ ***
+ *** @param inf_v The driver instance to be read.
+ *** @param oxt Unused.
+ *** @returns The name of the first attribute.
+ ***/
+char* clusterGetFirstAttr(void* inf_v, pObjTrxTree* oxt)
+    {
+    pDriverData driver_data = (pDriverData)inf_v;
+    driver_data->TargetAttrIndex = 0u;
+    return clusterGetNextAttr(inf_v, oxt);
+    }
+
+
+// LINK #functions
+/*** Returns the name of the next attribute that one can get from
+ *** this driver instance (using GetAttrType() and GetAttrValue()).
+ *** Uses an internal variable (TargetAttrIndex) to maintain the state
+ *** of this iteration over repeated calls.
+ ***
+ *** The index only advances when a non-NULL name is returned, so calling
+ *** this again after the end was reached keeps returning the same entry
+ *** instead of reading further past it.
+ *** NOTE(review): this presumes each ATTR_* table ends with a NULL entry;
+ *** the tables are not visible here, confirm against their definitions.
+ ***
+ *** @param inf_v The driver instance to be read.
+ *** @param oxt Unused.
+ *** @returns The name of the next attribute, or
+ ***     NULL if the target type is unknown.
+ ***/
+char* clusterGetNextAttr(void* inf_v, pObjTrxTree* oxt)
+    {
+    pDriverData driver_data = (pDriverData)inf_v;
+    const unsigned int i = driver_data->TargetAttrIndex;
+    char* attr_name = NULL;
+
+    switch (driver_data->TargetType)
+        {
+        case TARGET_ROOT:          attr_name = ATTR_ROOT[i]; break;
+        case TARGET_CLUSTER:       attr_name = ATTR_CLUSTER[i]; break;
+        case TARGET_SEARCH:        attr_name = ATTR_SEARCH[i]; break;
+        case TARGET_CLUSTER_ENTRY: attr_name = ATTR_CLUSTER_ENTRY[i]; break;
+        case TARGET_SEARCH_ENTRY:  attr_name = ATTR_SEARCH_ENTRY[i]; break;
+        default:
+            mssErrorf(1, "Cluster", "Unknown target type %u.", driver_data->TargetType);
+            return NULL;
+        }
+
+    /** Advance only while there is something to return. **/
+    if (attr_name != NULL) driver_data->TargetAttrIndex++;
+    return attr_name;
+    }
+
+
+// LINK #functions
+/*** Get the capabilities of the driver instance object.
+ ***
+ *** @param inf_v The driver instance to be checked.
+ *** @param info The struct to be populated with driver flags.
+ *** @returns 0 if successful,
+ ***     -1 if the driver is an unimplemented type (should never happen).
+ ***/
+int clusterInfo(void* inf_v, pObjectInfo info)
+    {
+    pDriverData driver_data = (pDriverData)inf_v;
+    pNodeData node_data = (pNodeData)driver_data->NodeData;
+
+    /** Reset flags buffer. **/
+    info->Flags = 0;
+
+    /** Disallow unsupported functionality. **/
+    info->Flags |= OBJ_INFO_F_CANT_ADD_ATTR;
+    info->Flags |= OBJ_INFO_F_CANT_HAVE_CONTENT;
+    info->Flags |= OBJ_INFO_F_NO_CONTENT;
+
+    switch (driver_data->TargetType)
+        {
+        case TARGET_ROOT:
+            info->nSubobjects = node_data->nClusterDatas + node_data->nSearchDatas;
+            info->Flags |= OBJ_INFO_F_CAN_HAVE_SUBOBJ;
+            info->Flags |= OBJ_INFO_F_SUBOBJ_CNT_KNOWN;
+            info->Flags |= (info->nSubobjects > 0) ? OBJ_INFO_F_HAS_SUBOBJ : OBJ_INFO_F_NO_SUBOBJ;
+            break;
+
+        case TARGET_CLUSTER:
+            info->Flags |= OBJ_INFO_F_CAN_HAVE_SUBOBJ;
+            info->Flags |= OBJ_INFO_F_HAS_SUBOBJ; /* Data must not be empty. */
+
+            /*** Clusters always have one label per vector.
+             *** If we know how many vectors are in the dataset,
+             *** we know how many labels this cluster will have,
+             *** even if it hasn't been computed yet.
+             *** Otherwise the count is left unset and not flagged as known.
+             ***/
+            if (node_data->SourceData->Vectors != NULL)
+                {
+                info->Flags |= OBJ_INFO_F_SUBOBJ_CNT_KNOWN;
+                info->nSubobjects = node_data->SourceData->nVectors;
+                }
+            break;
+
+        case TARGET_SEARCH:
+            {
+            pSearchData search_data = (pSearchData)driver_data->TargetData;
+            info->Flags |= OBJ_INFO_F_CAN_HAVE_SUBOBJ;
+
+            /** The count is only known once the search results exist. **/
+            if (search_data->Dups != NULL)
+                {
+                info->nSubobjects = search_data->nDups;
+                info->Flags |= OBJ_INFO_F_SUBOBJ_CNT_KNOWN;
+                info->Flags |= (info->nSubobjects > 0) ? OBJ_INFO_F_HAS_SUBOBJ : OBJ_INFO_F_NO_SUBOBJ;
+                }
+            break;
+            }
+
+        case TARGET_CLUSTER_ENTRY:
+        case TARGET_SEARCH_ENTRY:
+            /** No Subobjects. **/
+            info->Flags |= OBJ_INFO_F_CANT_HAVE_SUBOBJ;
+            info->Flags |= OBJ_INFO_F_NO_SUBOBJ;
+            info->Flags |= OBJ_INFO_F_SUBOBJ_CNT_KNOWN;
+            info->nSubobjects = 0;
+            break;
+
+        default:
+            mssErrorf(1, "Cluster", "Unknown target type %u.", driver_data->TargetType);
+            goto err;
+        }
+
+    return 0;
+
+    err:
+    mssErrorf(0, "Cluster", "Failed execute get info.");
+    return -1;
+    }
+
+
+/** ================ Method Execution Functions ================ **/
+/** ANCHOR[id=method] **/
+// LINK #functions
+
+/*** Returns the name of the first method that one can execute from
+ *** this driver instance (using clusterExecuteMethod()). Resets the
+ *** internal variable (TargetMethodIndex) used to maintain iteration
+ *** state for clusterGetNextMethod().
+ ***
+ *** @param inf_v The driver instance to be read.
+ *** @param oxt Unused.
+ *** @returns The name of the first method.
+ ***/
+char* clusterGetFirstMethod(void* inf_v, pObjTrxTree* oxt)
+    {
+    pDriverData driver_data = (pDriverData)inf_v;
+    driver_data->TargetMethodIndex = 0u;
+    return clusterGetNextMethod(inf_v, oxt);
+    }
+
+
+// LINK #functions
+/*** Returns the name of the next method that one can execute from
+ *** this driver instance (using clusterExecuteMethod()).
+ *** Uses an internal variable (TargetMethodIndex) to maintain the state
+ *** of this iteration over repeated calls.
+ ***
+ *** The index only advances when a non-NULL name is returned, so calling
+ *** this again after the end was reached does not read further past it.
+ *** NOTE(review): this presumes METHOD_NAME ends with a NULL entry; the
+ *** table is not visible here, confirm against its definition.
+ ***
+ *** @param inf_v The driver instance to be read.
+ *** @param oxt Unused.
+ *** @returns The name of the next method.
+ ***/
+char* clusterGetNextMethod(void* inf_v, pObjTrxTree* oxt)
+    {
+    pDriverData driver_data = (pDriverData)inf_v;
+    char* method_name = METHOD_NAME[driver_data->TargetMethodIndex];
+    if (method_name != NULL) driver_data->TargetMethodIndex++;
+    return method_name;
+    }
+
+
+// LINK #functions
+/*** Prints one cache entry as a table row and adds its size to a running
+ *** total. Intended for use in xhForEach().
+ ***
+ *** @param entry The hash table entry (key and cached data) to print.
+ *** @param arg An array of four pointers:
+ ***     [0] unsigned int*: which cache this is (1 = source data,
+ ***         2 = cluster data, 3 = search data).
+ ***     [1] unsigned int*: running total of bytes, updated here.
+ ***     [2] unsigned long long*: if nonzero, uncomputed entries are not
+ ***         printed and this counter is incremented for each one instead.
+ ***     [3] char*: optional key prefix; entries whose key does not start
+ ***         with it are ignored entirely. May be NULL.
+ *** @returns 0 if successful, or
+ ***     -1 if the cache type id is unknown.
+ ***/
+static int ci_PrintEntry(pXHashEntry entry, void* arg)
+    {
+    /** Extract entry. **/
+    char* key = entry->Key;
+    void* data = entry->Data;
+
+    /** Extract args. **/
+    void** args = (void**)arg;
+    unsigned int* type_id_ptr = (unsigned int*)args[0];
+    unsigned int* total_bytes_ptr = (unsigned int*)args[1];
+    unsigned long long* less_ptr = (unsigned long long*)args[2];
+    char* path = (char*)args[3];
+
+    /** If a path is provided, check that it matches the start of the key. **/
+    if (path != NULL && strncmp(key, path, strlen(path)) != 0) return 0;
+
+    /** Handle type. **/
+    char* type;
+    char* name;
+    unsigned int bytes;
+    bool computed;
+    switch (*type_id_ptr)
+        {
+        case 1u:
+            {
+            pSourceData source_data = (pSourceData)data;
+            bytes = ci_SizeOfSourceData(source_data);
+            computed = (source_data->Vectors != NULL);
+            type = "Source";
+            name = source_data->Name;
+            break;
+            }
+        case 2u:
+            {
+            pClusterData cluster_data = (pClusterData)data;
+            bytes = ci_SizeOfClusterData(cluster_data, false);
+            computed = (cluster_data->Clusters != NULL);
+            type = "Cluster";
+            name = cluster_data->Name;
+            break;
+            }
+        case 3u:
+            {
+            pSearchData search_data = (pSearchData)data;
+            bytes = ci_SizeOfSearchData(search_data);
+            computed = (search_data->Dups != NULL);
+            type = "Search";
+            name = search_data->Name;
+            break;
+            }
+        default:
+            mssErrorf(0, "Cluster", "Unknown type_id %u.", *type_id_ptr);
+            return -1;
+        }
+
+    if (*less_ptr > 0llu && !computed)
+        {
+        /** "Less" was requested: count the uncomputed entry instead of printing it. **/
+        (*less_ptr)++;
+        }
+    else
+        {
+        /** Print the cache entry data. **/
+        char buf[12];
+        snprint_bytes(buf, sizeof(buf), bytes);
+        printf("%-8s %-16s %-12s \"%s\"\n", type, name, buf, key);
+        }
+
+    /** Skipped entries still count toward the total size. **/
+    *total_bytes_ptr += bytes;
+
+    return 0;
+    }
+
+
+// LINK #functions
+/*** Frees a cached source data struct and its key.
+ *** Intended for use in xhClearKeySafe().
+ ***
+ *** NOTE(review): entries whose key does not start with `path` are left
+ *** untouched here; confirm the caller does not drop them from the table.
+ ***
+ *** @param entry The hash table entry to free.
+ *** @param path Optional key prefix (char*) to restrict freeing. May be NULL.
+ ***/
+static void ci_CacheFreeSourceData(pXHashEntry entry, void* path)
+    {
+    /** Extract hash entry. **/
+    char* key = entry->Key;
+    pSourceData source_data = (pSourceData)entry->Data;
+
+    /** If a path is provided, check that it matches the start of the key. **/
+    if (path != NULL && strncmp(key, (char*)path, strlen((char*)path)) != 0) return;
+
+    /** Free data. **/
+    ci_FreeSourceData(source_data);
+    nmSysFree(key);
+    }
+
+
+// LINK #functions
+/*** Frees a cached cluster data struct and its key.
+ *** Intended for use in xhClearKeySafe().
+ ***
+ *** @param entry The hash table entry to free.
+ *** @param path Optional key prefix (char*) to restrict freeing. May be NULL.
+ ***/
+static void ci_CacheFreeCluster(pXHashEntry entry, void* path)
+    {
+    /** Extract hash entry. **/
+    char* key = entry->Key;
+    pClusterData cluster_data = (pClusterData)entry->Data;
+
+    /** If a path is provided, check that it matches the start of the key. **/
+    if (path != NULL && strncmp(key, (char*)path, strlen((char*)path)) != 0) return;
+
+    /** Free data. **/
+    ci_FreeClusterData(cluster_data, false);
+    nmSysFree(key);
+    }
+
+
+// LINK #functions
+/*** Frees a cached search data struct and its key.
+ *** Intended for use in xhClearKeySafe().
+ ***
+ *** @param entry The hash table entry to free.
+ *** @param path Optional key prefix (char*) to restrict freeing. May be NULL.
+ ***/
+static void ci_CacheFreeSearch(pXHashEntry entry, void* path)
+    {
+    /** Extract hash entry. **/
+    char* key = entry->Key;
+    pSearchData search_data = (pSearchData)entry->Data;
+
+    /** If a path is provided, check that it matches the start of the key. **/
+    if (path != NULL && strncmp(key, (char*)path, strlen((char*)path)) != 0) return;
+
+    /** Free data. **/
+    ci_FreeSearchData(search_data);
+    nmSysFree(key);
+    }
+
+
+// LINK #functions
+/*** Executes a method with the given name.
+ ***
+ *** @param inf_v The affected driver instance.
+ *** @param method_name The name of the method.
+ *** @param param A possibly optional param passed to the method.
+ *** @param oxt The object system tree, similar to a kind of "scope" (unused).
+ *** @returns 0 if successful, or
+ ***     -1 if the method or its parameter is unknown, or printing failed.
+ ***/
+int clusterExecuteMethod(void* inf_v, char* method_name, pObjData param, pObjTrxTree* oxt)
+    {
+    pDriverData driver_data = (pDriverData)inf_v;
+
+    /** Cache management method. **/
+    if (strcmp(method_name, "cache") == 0)
+        {
+        char* path = NULL;
+
+        /** Second parameter is required. **/
+        if (param == NULL || param->String == NULL)
+            {
+            mssErrorf(1, "Cluster",
+                "[param : \"show\" | \"show_less\" | \"show_all\" | \"drop_all\"] is required for the cache method."
+            );
+            goto err;
+            }
+
+        /*** 'show', 'show_less', and 'show_all'.
+         *** skip_uncomputed doubles as a flag and a counter: it starts at 1
+         *** for show_less and ci_PrintEntry() adds 1 per skipped entry.
+         ***/
+        bool show = false;
+        unsigned long long skip_uncomputed = 0llu;
+        if (strcmp(param->String, "show_less") == 0)
+            /** Specify show_less to skip uncomputed caches. **/
+            skip_uncomputed = 1ull;
+        if (skip_uncomputed == 1ull || strcmp(param->String, "show") == 0)
+            {
+            show = true;
+            path = ci_file_path(driver_data->NodeData->Parent);
+            }
+        if (strcmp(param->String, "show_all") == 0) show = true;
+
+        if (show)
+            {
+            /** Print cache info table. **/
+            int ret = 0;
+            unsigned int i = 1u, source_bytes = 0u, cluster_bytes = 0u, search_bytes = 0u;
+            bool failed = false;
+            printf("\nShowing cache for ");
+            if (path != NULL) printf("\"%s\":\n", path);
+            else printf("all files:\n");
+            printf("%-8s %-16s %-12s %s\n", "Type", "Name", "Size", "Cache Entry Key");
+            failed |= !check(xhForEach(
+                &ClusterDriverCaches.SourceDataCache,
+                ci_PrintEntry,
+                (void*[]){&i, &source_bytes, (void*)&skip_uncomputed, path}
+            ));
+            i++;
+            failed |= !check(xhForEach(
+                &ClusterDriverCaches.ClusterDataCache,
+                ci_PrintEntry,
+                (void*[]){&i, &cluster_bytes, (void*)&skip_uncomputed, path}
+            ));
+            i++;
+            failed |= !check(xhForEach(
+                &ClusterDriverCaches.SearchDataCache,
+                ci_PrintEntry,
+                (void*[]){&i, &search_bytes, (void*)&skip_uncomputed, path}
+            ));
+            if (failed)
+                {
+                mssErrorf(0, "Cluster", "Unexpected error occurred while showing caches.");
+                ret = -1;
+                }
+
+            /** Precomputations. **/
+            unsigned int total_caches = 0u
+                + (unsigned int)ClusterDriverCaches.SourceDataCache.nItems
+                + (unsigned int)ClusterDriverCaches.ClusterDataCache.nItems
+                + (unsigned int)ClusterDriverCaches.SearchDataCache.nItems;
+
+            /** Remove the initial 1 (the show_less flag) to get the real skip count. **/
+            const unsigned long long n_skipped = (skip_uncomputed > 0llu) ? skip_uncomputed - 1llu : 0llu;
+            if (total_caches <= n_skipped) printf("All caches skipped, nothing to show...\n");
+
+            /** Print stats. **/
+            char buf[16];
+            printf("\nCache Stats:\n");
+            printf("%-8s %-4s %-12s\n", "", "#", "Total Size");
+            printf("%-8s %-4d %-12s\n", "Source", ClusterDriverCaches.SourceDataCache.nItems, snprint_bytes(buf, sizeof(buf), source_bytes));
+            printf("%-8s %-4d %-12s\n", "Cluster", ClusterDriverCaches.ClusterDataCache.nItems, snprint_bytes(buf, sizeof(buf), cluster_bytes));
+            printf("%-8s %-4d %-12s\n", "Search", ClusterDriverCaches.SearchDataCache.nItems, snprint_bytes(buf, sizeof(buf), search_bytes));
+            printf("%-8s %-4u %-12s\n\n", "Total", total_caches, snprint_bytes(buf, sizeof(buf), source_bytes + cluster_bytes + search_bytes));
+
+            /** Print skip stats (only when show_less was requested). **/
+            if (skip_uncomputed > 0llu) printf("Skipped %llu uncomputed caches.\n\n", n_skipped);
+
+            return ret;
+            }
+
+        /** 'drop_all'. **/
+        if (strcmp(param->String, "drop_all") == 0)
+            {
+            printf("\nDropping cache for all files:\n");
+            ci_ClearCaches();
+            return 0;
+            }
+
+        /** Unknown parameter. **/
+        mssErrorf(1, "Cluster",
+            "Expected [param : \"show\" | \"show_less\" | \"show_all\" | \"drop_all\"] for the cache method, but got: \"%s\"",
+            param->String
+        );
+        goto err;
+        }
+
+    /** Driver call statistics method. **/
+    if (strcmp(method_name, "stat") == 0)
+        {
+        char buf[12];
+        printf("Cluster Driver Statistics:\n");
+        printf(" Stat Name %12s\n", "Value");
+        printf(" OpenCalls %12s\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.OpenCalls));
+        printf(" OpenQueryCalls %12s\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.OpenQueryCalls));
+        printf(" FetchCalls %12s\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.FetchCalls));
+        printf(" CloseCalls %12s\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.CloseCalls));
+        printf(" GetTypeCalls %12s\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.GetTypeCalls));
+        printf(" GetValCalls %12s\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.GetValCalls));
+        printf(" GetValCalls_name %12s\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.GetValCalls_name));
+        printf(" GetValCalls_key1 %12s\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.GetValCalls_key1));
+        printf(" GetValCalls_key2 %12s\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.GetValCalls_key2));
+        printf(" GetValCalls_sim %12s\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.GetValCalls_sim));
+        return 0;
+        }
+
+    /** Unknown method. **/
+    mssErrorf(1, "Cluster", "Unknown command: \"%s\"", method_name);
+
+    err:
+    mssErrorf(0, "Cluster", "Failed execute command.");
+    return -1;
+    }
+
+
+/** ================ Unimplemented Functions ================ **/
+/** ANCHOR[id=unimplemented] **/
+// LINK #functions
+
+/** Not implemented. NOTE(review): returns -ENOSYS, unlike the other stubs which return -1. **/
+int clusterCreate(pObject obj, int mask, pContentType sys_type, char* usr_type, pObjTrxTree* oxt)
+    {
+    mssErrorf(1, "Cluster", "clusterCreate() is not implemented.");
+    return -ENOSYS;
+    }
+/** Not implemented.
**/
+int clusterDelete(pObject obj, pObjTrxTree* oxt)
+    {
+    mssErrorf(1, "Cluster", "clusterDelete() is not implemented.");
+    return -1;
+    }
+/** Not implemented. **/
+int clusterDeleteObj(void* inf_v, pObjTrxTree* oxt)
+    {
+    mssErrorf(1, "Cluster", "clusterDeleteObj() is not implemented.");
+    return -1;
+    }
+/** Not implemented: cluster objects are read through queries instead. **/
+int clusterRead(void* inf_v, char* buffer, int max_cnt, int offset, int flags, pObjTrxTree* oxt)
+    {
+    mssErrorf(1, "Cluster", "clusterRead() not implemented.");
+    fprintf(stderr, "HINT: Use queries instead, (e.g. clusterOpenQuery()).\n");
+    return -1;
+    }
+/** Not implemented. **/
+int clusterWrite(void* inf_v, char* buffer, int cnt, int offset, int flags, pObjTrxTree* oxt)
+    {
+    mssErrorf(1, "Cluster", "clusterWrite() not implemented because clusters are immutable.");
+    return -1;
+    }
+/** Not implemented. **/
+int clusterSetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val, pObjTrxTree* oxt)
+    {
+    mssErrorf(1, "Cluster", "clusterSetAttrValue() not implemented because clusters are immutable.");
+    return -1;
+    }
+/** Not implemented. **/
+int clusterAddAttr(void* inf_v, char* attr_name, int type, pObjData val, pObjTrxTree* oxt)
+    {
+    mssErrorf(1, "Cluster", "clusterAddAttr() not implemented because clusters are immutable.");
+    return -1;
+    }
+/** Not implemented. **/
+void* clusterOpenAttr(void* inf_v, char* attr_name, int mode, pObjTrxTree* oxt)
+    {
+    mssErrorf(1, "Cluster", "clusterOpenAttr() not implemented.");
+    return NULL;
+    }
+/*** Not implemented.
+ *** NOTE(review): this reports an error but still returns 0 (success),
+ *** unlike the other stubs; confirm that callers rely on that.
+ ***/
+int clusterCommit(void* inf_v, pObjTrxTree* oxt)
+    {
+    mssErrorf(1, "Cluster", "clusterCommit() not implemented because clusters are immutable.");
+    return 0;
+    }
+
+
+// LINK #functions
+/*** Initialize the driver. This includes:
+ *** - Registering the driver with the objectsystem.
+ *** - Registering structs with newmalloc for debugging.
+ *** - Initializing global data needed for the driver.
+ ***
+ *** @returns 0 if successful, or
+ ***     -1 if an error occurs.
+ ***/
+int clusterInitialize(void)
+    {
+    /** Allocate the driver. **/
+    pObjDriver drv = (pObjDriver)check_ptr(nmMalloc(sizeof(ObjDriver)));
+    if (drv == NULL) goto err;
+    memset(drv, 0, sizeof(ObjDriver));
+
+    /*** Initialize caches.
+     *** NOTE(review): tables initialized here are not torn down if a later
+     *** step fails; confirm whether a failed init aborts startup anyway.
+     ***/
+    memset(&ClusterDriverCaches, 0, sizeof(ClusterDriverCaches));
+    if (!check(xhInit(&ClusterDriverCaches.SourceDataCache, 251, 0))) goto err;
+    if (!check(xhInit(&ClusterDriverCaches.ClusterDataCache, 251, 0))) goto err;
+    if (!check(xhInit(&ClusterDriverCaches.SearchDataCache, 251, 0))) goto err;
+
+    /** Initialize statistics. **/
+    memset(&ClusterStatistics, 0, sizeof(ClusterStatistics));
+
+    /** Setup the structure. strcpy() returns its destination, so there is nothing to check. **/
+    strcpy(drv->Name, "cluster - Clustering Driver");
+    if (!check(xaInit(&drv->RootContentTypes, 1))) goto err;
+    if (!check_neg(xaAddItem(&drv->RootContentTypes, "system/cluster"))) goto err;
+    drv->Capabilities = 0; /* TODO: Greg - Should I indicate any capabilities? */
+
+    /** Setup the function references (NULL marks an entry point this driver does not provide). **/
+    drv->Open = clusterOpen;
+    drv->OpenChild = NULL;
+    drv->Close = clusterClose;
+    drv->Create = clusterCreate;
+    drv->Delete = clusterDelete;
+    drv->DeleteObj = clusterDeleteObj;
+    drv->OpenQuery = clusterOpenQuery;
+    drv->QueryDelete = NULL;
+    drv->QueryFetch = clusterQueryFetch;
+    drv->QueryClose = clusterQueryClose;
+    drv->Read = clusterRead;
+    drv->Write = clusterWrite;
+    drv->GetAttrType = clusterGetAttrType;
+    drv->GetAttrValue = clusterGetAttrValue;
+    drv->GetFirstAttr = clusterGetFirstAttr;
+    drv->GetNextAttr = clusterGetNextAttr;
+    drv->SetAttrValue = clusterSetAttrValue;
+    drv->AddAttr = clusterAddAttr;
+    drv->OpenAttr = clusterOpenAttr;
+    drv->GetFirstMethod = clusterGetFirstMethod;
+    drv->GetNextMethod = clusterGetNextMethod;
+    drv->ExecuteMethod = clusterExecuteMethod;
+    drv->PresentationHints = clusterPresentationHints;
+    drv->Info = clusterInfo;
+    drv->Commit = clusterCommit;
+    drv->GetQueryCoverageMask = NULL;
+    drv->GetQueryIdentityPath = NULL;
+
+    /** Register the driver. **/
+    if (!check(objRegisterDriver(drv))) goto err;
+
+    /** Register structs used in this project with the newmalloc memory management system. **/
+    nmRegister(sizeof(SourceData), "ClusterSourceData");
+    nmRegister(sizeof(Cluster), "Cluster");
+    nmRegister(sizeof(ClusterData), "ClusterData");
+    nmRegister(sizeof(SearchData), "ClusterSearch");
+    nmRegister(sizeof(NodeData), "ClusterNodeData");
+    nmRegister(sizeof(DriverData), "ClusterDriverData");
+    nmRegister(sizeof(ClusterQuery), "ClusterQuery");
+    nmRegister(sizeof(ClusterDriverCaches), "ClusterDriverCaches");
+
+    /** Success. **/
+    return 0;
+
+    /** Error cleanup. **/
+    err:
+    if (drv != NULL) nmFree(drv, sizeof(ObjDriver));
+    mssErrorf(1, "Cluster", "Failed to initialize cluster driver.");
+    return -1;
+    }
diff --git a/centrallix/test_obj.c b/centrallix/test_obj.c
index c4c64e25..5ef492de 100644
--- a/centrallix/test_obj.c
+++ b/centrallix/test_obj.c
@@ -1443,6 +1443,7 @@ testobj_do_cmd(pObjSession s, char* cmd, int batch_mode, pLxSession inp_lx)
     else
         {
         printf("Unknown command '%s'\n",cmdname);
+        mlxCloseSession(ls);
         return -1;
         }
 
diff --git a/centrallix/tests/test_cos_compare_00.cmp b/centrallix/tests/test_cos_compare_00.cmp
index d586365f..2061443a 100644
--- a/centrallix/tests/test_cos_compare_00.cmp
+++ b/centrallix/tests/test_cos_compare_00.cmp
@@ -1,7 +1,11 @@
-Attribute [case1]: integer 1
-Attribute [case2]: integer 1
-Attribute [case3]: integer 1
-Attribute [case4]: integer 1
-Attribute [case5]: integer 1
-Attribute [case6]: integer 1
-Attribute [case7]: integer 1
+Attribute [case1]: string "pass"
+Attribute [case2]: string "pass"
+Attribute [case3]: string "pass"
+Attribute [case4]: string "pass"
+Attribute [cynthia]: string "pass"
+Attribute [timothy]: string "pass"
+Attribute [lance]: string "pass"
+Attribute [gregory]: string "pass"
+Attribute [nathan]: string "pass"
+Attribute [identical]: string "pass"
+Attribute [name]: string "pass"
diff --git a/centrallix/tests/test_cos_compare_00.to b/centrallix/tests/test_cos_compare_00.to
index 5bf95051..c5b0b1a5 100644
--- a/centrallix/tests/test_cos_compare_00.to
+++ b/centrallix/tests/test_cos_compare_00.to
@@ -1,17 +1,24 @@
 ##NAME Text Mining String Similarity with Cosine Compare
 
-# All email addresses and phone numbers are imaginary and were fabricated for the purposes of this test
+# Basic tests of cosine similarity.
+query select case1 = condition((cos_compare('hello', 'hello') >= 0.999) and (cos_compare('hello', 'hello') <= 1.0), "pass", "fail") +query select case2 = condition((cos_compare('hello', 'zephora') <= 0.001) and (cos_compare('hello', 'zephora') >= 0.0), "pass", "fail") +query select case3 = condition((cos_compare('hello', 'hello world') <= 0.7) and (cos_compare('hello', 'hello world') >= 0.6), "pass", "fail") +query select case4 = condition((cos_compare('hello there', 'hellow there') >= 0.9) and (cos_compare('hello', 'hellow') <= 1.0), "pass", "fail") + -query select case1 = (cos_compare("Cynthia Adams; cynthiaadams@gmail.com; 720-769-1293", "Timothy Adams; thetbear@gmail.com; 720-891-1470") >= 0.49) and (cos_compare("Cynthia Adams; cynthiaadams@gmail.com; 720-769-1293", "Timothy Adams; thetbear@gmail.com; 720-891-1470") <= 0.54) +# Tests on fabricated contact information. +# All email addresses and phone numbers are imaginary and were fabricated for the purposes of this test +query select cynthia = condition((cos_compare("Cynthia Adams; cynthiaadams@gmail.com; 720-769-1293", "Timothy Adams; thetbear@gmail.com; 720-891-1470") >= 0.49) and (cos_compare("Cynthia Adams; cynthiaadams@gmail.com; 720-769-1293", "Timothy Adams; thetbear@gmail.com; 720-891-1470") <= 0.54), "pass", "fail") -query select case2 = (cos_compare("Timothy Adams; thetbear@gmail.com; 720-891-1470", "Lance Freson; lancetheturtle@gmail.com; 720-111-8189") >= 0.425) and (cos_compare("Timothy Adams; thetbear@gmail.com; 720-891-1470", "Lance Freson; lancetheturtle@gmail.com; 720-111-8189") <= 0.475) +query select timothy = condition((cos_compare("Timothy Adams; thetbear@gmail.com; 720-891-1470", "Lance Freson; lancetheturtle@gmail.com; 720-111-8189") >= 0.45) and (cos_compare("Timothy Adams; thetbear@gmail.com; 720-891-1470", "Lance Freson; lancetheturtle@gmail.com; 720-111-8189") <= 0.50), "pass", "fail") -query select case3 = (cos_compare("Lance Freson; lancetheturtle@gmail.com; 720-111-8189", 
"Gregory Freson; greatgregory@gmail.com; 720-198-5791") >= 0.35) and (cos_compare("Lance Freson; lancetheturtle@gmail.com; 720-111-8189", "Gregory Freson; greatgregory@gmail.com; 720-198-5791") <= 0.40) +query select lance = condition((cos_compare("Lance Freson; lancetheturtle@gmail.com; 720-111-8189", "Gregory Freson; greatgregory@gmail.com; 720-198-5791") >= 0.425) and (cos_compare("Lance Freson; lancetheturtle@gmail.com; 720-111-8189", "Gregory Freson; greatgregory@gmail.com; 720-198-5791") <= 0.475), "pass", "fail") -query select case4 = (cos_compare("Gregory Freson; greatgregory@gmail.com; 720-198-5791", "Gregory Freson; greatgregory@gmail.co; 720-198-5791") >= 0.94) and (cos_compare("Gregory Freson; greatgregory@gmail.com; 720-198-5791", "Gregory Freson; greatgregory@gmail.co; 720-198-5791") <= 0.99) +query select gregory = condition((cos_compare("Gregory Freson; greatgregory@gmail.com; 720-198-5791", "Gregory Freson; greatgregory@gmail.co; 720-198-5791") >= 0.94) and (cos_compare("Gregory Freson; greatgregory@gmail.com; 720-198-5791", "Gregory Freson; greatgregory@gmail.co; 720-198-5791") <= 0.99), "pass", "fail") -query select case5 = (cos_compare("Nathan Mayor; nmmayor@yahoo.com; +1-800-192-9128", "Mindy Mayor; nmmayor@yahoo.com; 720-981-9149") >=0.66) and (cos_compare("Nathan Mayor; nmmayor@yahoo.com; +1-800-192-9128", "Mindy Mayor; nmmayor@yahoo.com; 720-981-9149") <= 0.71) +query select nathan = condition((cos_compare("Nathan Mayor; nmmayor@yahoo.com; +1-800-192-9128", "Mindy Mayor; nmmayor@yahoo.com; 720-981-9149") >= 0.575) and (cos_compare("Nathan Mayor; nmmayor@yahoo.com; +1-800-192-9128", "Mindy Mayor; nmmayor@yahoo.com; 720-981-9149") <= 0.625), "pass", "fail") -query select case6 = (cos_compare("This is an identical case", "This is an identical case") >=0.975) and (cos_compare("This is an identical case", "This is an identical case") <=1.00) +query select identical = condition((cos_compare("This is an identical case", "This is an identical case") 
>= 0.975) and (cos_compare("This is an identical case", "This is an identical case") <= 1.00), "pass", "fail") -query select case7 = (cos_compare("Samuel", "Alex") >= 0.00) and (cos_compare("Samuel", "Alex") <= 0.025) +query select name = condition((cos_compare("Samuel", "Alex") >= 0.00) and (cos_compare("Samuel", "Alex") <= 0.025), "pass", "fail") diff --git a/centrallix/tests/test_expfn_metaphone_00.cmp b/centrallix/tests/test_expfn_metaphone_00.cmp new file mode 100644 index 00000000..d13cf05c --- /dev/null +++ b/centrallix/tests/test_expfn_metaphone_00.cmp @@ -0,0 +1,140 @@ +Attribute [result]: string "TST`TST" +Attribute [result]: string "PSK`PSK" +Attribute [result]: string "SNTRLKS`SNTRLKS" +Attribute [result]: string "LRNS`LRNS" +Attribute [result]: string "FLPS`FLPS" +Attribute [result]: string "AKSPTNNS`AKSPTNKNS" +Attribute [result]: string "SPRKLFRJLSTSKSPLTSS`SPRKLFRKLSTSKSPLTXS" +Attribute [result]: string "SKTLPKSSTSLKRFLKRPS`SKTLPKSSTSLKRFLKRPS" +Attribute [result]: string "SM0`XMT" +Attribute [result]: string "XMT`SMT" +Attribute [result]: string "SNTR`XNTR" +Attribute [result]: string "XNTR`SNTR" +Attribute [result]: string "ARN`ARNF" +Attribute [result]: string "ARNF`ARNF" +Attribute [result]: string "AKST`AKST" +Attribute [result]: string "AKSTNT`AKSTNT" +Attribute [result]: string "AKTL`AKTL" +Attribute [result]: string "ARX`ARK" +Attribute [result]: string "ART`ARTS" +Attribute [result]: string "PKS`PKS" +Attribute [result]: string "PX`PX" +Attribute [result]: string "PJTR`PHTR" +Attribute [result]: string "PLX`PLX" +Attribute [result]: string "PRTX`PRTX" +Attribute [result]: string "PJ`PK" +Attribute [result]: string "P`P" +Attribute [result]: string "PR`PR" +Attribute [result]: string "PRTN`PRTN" +Attribute [result]: string "KPRL`KPR" +Attribute [result]: string "SSR`SSR" +Attribute [result]: string "KKN`KKN" +Attribute [result]: string "KMPL`KMPL" +Attribute [result]: string "KRLL`KRLL" +Attribute [result]: string "KRLL`KRLL" +Attribute 
[result]: string "KMSTR`KMSTR" +Attribute [result]: string "KNT`KNT" +Attribute [result]: string "KRS`KRS" +Attribute [result]: string "KF`KF" +Attribute [result]: string "SRN`XRN" +Attribute [result]: string "TM`TM" +Attribute [result]: string "ATKR`ATKR" +Attribute [result]: string "AJ`AJ" +Attribute [result]: string "FLPTS`FLPFX" +Attribute [result]: string "FKX`FKX" +Attribute [result]: string "KLKS`KKS" +Attribute [result]: string "KRMNK`JRMNK" +Attribute [result]: string "JRTL`JRTL" +Attribute [result]: string "JLN`JLN" +Attribute [result]: string "KSPL`KSPL" +Attribute [result]: string "KF`KF" +Attribute [result]: string "KRK`KRK" +Attribute [result]: string "HKMR`HKMR" +Attribute [result]: string "H`H" +Attribute [result]: string "ALNT`ALNT" +Attribute [result]: string "AL`AL" +Attribute [result]: string "ATLN`ATLN" +Attribute [result]: string "JNKLTS`ANKLFX" +Attribute [result]: string "HS`HS" +Attribute [result]: string "LF`LF" +Attribute [result]: string "MKFR`MKFR" +Attribute [result]: string "MKRKR`MKRKR" +Attribute [result]: string "MNKR`MNJR" +Attribute [result]: string "MK`MK" +Attribute [result]: string "MKLFLN`MKLFLN" +Attribute [result]: string "MKL`MXL" +Attribute [result]: string "MTL`MTL" +Attribute [result]: string "ARKSTR`ARKSTR" +Attribute [result]: string "ARKT`ARKT" +Attribute [result]: string "PNN`PNN" +Attribute [result]: string "RSPR`RSPR" +Attribute [result]: string "RSN`RSNS" +Attribute [result]: string "RJ`RJR" +Attribute [result]: string "RF`RF" +Attribute [result]: string "SLFTR`SLFTR" +Attribute [result]: string "SNHSNT`SNHSNT" +Attribute [result]: string "XNKR`SKNKR" +Attribute [result]: string "XRMRRN`SKRMRRN" +Attribute [result]: string "XLSNKR`SLSNJR" +Attribute [result]: string "SKL`SKL" +Attribute [result]: string "SKNR`SKNR" +Attribute [result]: string "SKST`SKST" +Attribute [result]: string "XKR`SKR" +Attribute [result]: string "XKR`SKR" +Attribute [result]: string "TKLR`TLR" +Attribute [result]: string "TMS`TMS" 
+Attribute [result]: string "TMS`TMS" +Attribute [result]: string "0M`TM" +Attribute [result]: string "TXNR`TKNR" +Attribute [result]: string "TF`TF" +Attribute [result]: string "FK`FK" +Attribute [result]: string "AKTLR`FKTLR" +Attribute [result]: string "AKSLR`FKSLR" +Attribute [result]: string "ART`FRT" +Attribute [result]: string "SF`SFR" +Attribute [result]: string "ANKLFX`ANKLFK" +Attribute [result]: string "J`J" +Attribute [result]: string "MKLLN`MKLLN" +Attribute [result]: string "MRS`MRS" +Attribute [result]: string "APR`APR" +Attribute [result]: string "KMPRL`KMPR" +Attribute [result]: string "HT`HT" +Attribute [result]: string "K0RN`KTRN" +Attribute [result]: string "K0RN`KTRN" +Attribute [result]: string "RXRT`RKRT" +Attribute [result]: string "PP`PP" +Attribute [result]: string "ARK`ARK" +Attribute [result]: string "JF`KF" +Attribute [result]: string "TF`TF" +Attribute [result]: string "R`R" +Attribute [result]: string "STFN`STFN" +Attribute [result]: string "PRS`PRS" +Attribute [result]: string "RNT`RNT" +Attribute [result]: string "PRN`PRN" +Attribute [result]: string "PRN`PRN" +Attribute [result]: string "AT`AT" +Attribute [result]: string "AT`AT" +Attribute [result]: string "APT`APT" +Attribute [result]: string "PK`PK" +Attribute [result]: string "PKR`PKR" +Attribute [result]: string "XRLS`XRLS" +Attribute [result]: string "KN`KN" +Attribute [result]: string "NM`NM" +Attribute [result]: string "RJ`R" +Attribute [result]: string "KNTN`KNTN" +Attribute [result]: string "A`A" +Attribute [result]: string "XMKR`XMKR" +Attribute [result]: string "SN`XN" +Attribute [result]: string "SKLT`SKLT" +Attribute [result]: string "STXN`STXN" +Attribute [result]: string "MX`MX" +Attribute [result]: string "PS`PTS" +Attribute [result]: string "AKNS`ANS" +Attribute [result]: string "SNS`SNS" +Attribute [result]: string "FNKK`FNKK" +Attribute [result]: string "JSF`HSF" +Attribute [result]: string "APJKT`APJKT" +Attribute [result]: string "SLS`SLS" +Attribute [result]: 
string "XRF`XRF" +Attribute [result]: string "KS`KS" +Attribute [result]: string "FNKLR`FNKLR" diff --git a/centrallix/tests/test_expfn_metaphone_00.to b/centrallix/tests/test_expfn_metaphone_00.to new file mode 100644 index 00000000..de1897c3 --- /dev/null +++ b/centrallix/tests/test_expfn_metaphone_00.to @@ -0,0 +1,161 @@ +##NAME metaphone() function + +# Special thanks to the following websites for double checking the correct results: +# 1: https://words.github.io/double-metaphone +# 2: https://mainegenealogy.net/metaphone_converter.asp +# 3: https://en.toolpage.org/tool/metaphone + +# These tests were collected from the following sources: +# - Example comments in the source code of exp_double_metaphone.c +# - Maurice Aubrey's Tests* +# - Tests manually written by Israel Fuller +# - Tests written by prompting ChatGPT-5 (preview)** +# +# *Source: https://github.com/gitpan/Text-DoubleMetaphone/blob/master/t/words.txt +# **GPT-5 mini (Preview) was run in GitHub Copilot to suggest the words +# for some tests after analyzing a generated coverage report. I (Israel) +# used the suggestions to write some "AI generated" test cases. +# +# For more information, see the manual test suite implementation at the +# end of the exp_double_metaphone.c file. 
+ +query select result = metaphone("Test") +query select result = metaphone("Basic") +query select result = metaphone("Centrallix") +query select result = metaphone("Lawrence") +query select result = metaphone("Philips") +query select result = metaphone("Acceptingness") +query select result = metaphone("Supercalifragilisticexpialidocious") +query select result = metaphone("Suoicodilaipxecitsiligarfilacrepus") +query select result = metaphone("Smith") +query select result = metaphone("Schmidt") +query select result = metaphone("Snider") +query select result = metaphone("Schneider") +query select result = metaphone("Arnow") +query select result = metaphone("Arnoff") +query select result = metaphone("Accede") +query select result = metaphone("Accident") +query select result = metaphone("Actually") +query select result = metaphone("Arch") +query select result = metaphone("Artois") +query select result = metaphone("Bacchus") +query select result = metaphone("Bacci") +query select result = metaphone("Bajador") +query select result = metaphone("Bellocchio") +query select result = metaphone("Bertucci") +query select result = metaphone("Biaggi") +query select result = metaphone("Bough") +query select result = metaphone("Breaux") +query select result = metaphone("Broughton") +query select result = metaphone("Cabrillo") +query select result = metaphone("Caesar") +query select result = metaphone("Cagney") +query select result = metaphone("Campbell") +query select result = metaphone("Carlisle") +query select result = metaphone("Carlysle") +query select result = metaphone("Chemistry") +query select result = metaphone("Chianti") +query select result = metaphone("Chorus") +query select result = metaphone("Cough") +query select result = metaphone("Czerny") +query select result = metaphone("Dumb") +query select result = metaphone("Edgar") +query select result = metaphone("Edge") +query select result = metaphone("Filipowicz") +query select result = metaphone("Focaccia") +query select 
result = metaphone("Gallegos") +query select result = metaphone("Germanic") +query select result = metaphone("Ghiradelli") +query select result = metaphone("Ghislane") +query select result = metaphone("Gospel") +query select result = metaphone("Gough") +query select result = metaphone("Greek") +query select result = metaphone("Hochmeier") +query select result = metaphone("Hugh") +query select result = metaphone("Island") +query select result = metaphone("Isle") +query select result = metaphone("Italian") +query select result = metaphone("Jankelowicz") +query select result = metaphone("Jose") +query select result = metaphone("Laugh") +query select result = metaphone("Mac Caffrey") +query select result = metaphone("Mac Gregor") +query select result = metaphone("Manager") +query select result = metaphone("McHugh") +query select result = metaphone("McLaughlin") +query select result = metaphone("Michael") +query select result = metaphone("Middle") +query select result = metaphone("Orchestra") +query select result = metaphone("Orchid") +query select result = metaphone("Pinyin") +query select result = metaphone("Raspberry") +query select result = metaphone("Resnais") +query select result = metaphone("Rogier") +query select result = metaphone("Rough") +query select result = metaphone("Salvador") +query select result = metaphone("San jacinto") +query select result = metaphone("Schenker") +query select result = metaphone("Schermerhorn") +query select result = metaphone("Schlesinger") +query select result = metaphone("School") +query select result = metaphone("Schooner") +query select result = metaphone("Succeed") +query select result = metaphone("Sugar") +query select result = metaphone("Sugary") +query select result = metaphone("Tagliaro") +query select result = metaphone("Thames") +query select result = metaphone("Thomas") +query select result = metaphone("Thumb") +query select result = metaphone("Tichner") +query select result = metaphone("Tough") +query select result = 
metaphone("Vghee") +query select result = metaphone("Wachtler") +query select result = metaphone("Wechsler") +query select result = metaphone("Word") +query select result = metaphone("Xavier") +query select result = metaphone("Yankelovich") +query select result = metaphone("Zhao") +query select result = metaphone("McClellan") +query select result = metaphone("maurice") +query select result = metaphone("aubrey") +query select result = metaphone("cambrillo") +query select result = metaphone("heidi") +query select result = metaphone("katherine") +query select result = metaphone("catherine") +query select result = metaphone("richard") +query select result = metaphone("bob") +query select result = metaphone("eric") +query select result = metaphone("geoff") +query select result = metaphone("dave") +query select result = metaphone("ray") +query select result = metaphone("steven") +query select result = metaphone("bryce") +query select result = metaphone("randy") +query select result = metaphone("bryan") +query select result = metaphone("brian") +query select result = metaphone("otto") +query select result = metaphone("auto") +query select result = metaphone("Abbott") +query select result = metaphone("Back") +query select result = metaphone("Bacher") +query select result = metaphone("Charles") +query select result = metaphone("Ghana") +query select result = metaphone("Gnome") +query select result = metaphone("Raj") +query select result = metaphone("Quentin") +query select result = metaphone("Who") +query select result = metaphone("Shoemaker") +query select result = metaphone("Sian") +query select result = metaphone("Scold") +query select result = metaphone("Station") +query select result = metaphone("Match") +query select result = metaphone("Pizza") +query select result = metaphone("Agnes") +query select result = metaphone("Science") +query select result = metaphone("Van Gogh") +query select result = metaphone("Josef") +query select result = metaphone("Object") +query 
select result = metaphone("Sholz") +query select result = metaphone("Scharf") +query select result = metaphone("Kasia") +query select result = metaphone("Van Geller") diff --git a/centrallix/tests/test_fuzzycompare_00.cmp b/centrallix/tests/test_fuzzycompare_00.cmp deleted file mode 100644 index baa6db1e..00000000 --- a/centrallix/tests/test_fuzzycompare_00.cmp +++ /dev/null @@ -1,13 +0,0 @@ -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 diff --git a/centrallix/tests/test_fuzzycompare_00.to b/centrallix/tests/test_fuzzycompare_00.to deleted file mode 100644 index 78141a47..00000000 --- a/centrallix/tests/test_fuzzycompare_00.to +++ /dev/null @@ -1,15 +0,0 @@ -##NAME Levenshtein String Comparison - -query select sw1 = 1 where fuzzy_compare('hello', 'hello!', 20) >= 0 and fuzzy_compare("hello","hello!", 20) <= 1 -query select sw1 = 1 where fuzzy_compare('hello', 'asdfkh', 20) >= 0 and fuzzy_compare("hello","asdfkh", 20) <= 1 -query select sw1 = 1 where fuzzy_compare('hello', 'aaaaaaaaaaaaaaaaa', 20) >= 0 and fuzzy_compare("hello","aaaaaaaaaaaaaaaaa", 20) <= 1 -query select sw1 = 1 where fuzzy_compare('hello', 'nope', 20) >= 0 and fuzzy_compare("hello","nope", 20) <= 1 -query select sw1 = 1 where fuzzy_compare('below', 'hello!', 20) >= 0 and fuzzy_compare("below","hello!", 20) <= 1 -query select sw1 = 1 where fuzzy_compare('kitten', 'smitten', 20) >= 0 and fuzzy_compare("kitten","smitten", 20) <= 1 -query select sw1 = 1 where fuzzy_compare('hello', 'bobbobbobbob', 20) >= 0 and fuzzy_compare("hello","bobbobbobbob", 20) <= 1 -query select sw1 = 1 where fuzzy_compare('hello', '', 20) >= 0 and fuzzy_compare("hello","", 20) <= 1 -query select sw1 = 1 where 
fuzzy_compare('', '', 20) >= 0 and fuzzy_compare("","", 20) <= 1 -query select sw1 = 1 where fuzzy_compare('blooooop', 'blob', 20) >= 0 and fuzzy_compare("blooooop","blob", 20) <= 1 -query select sw1 = 1 where fuzzy_compare('', '!', 20) >= 0 and fuzzy_compare("","!", 20) <= 1 -query select sw1 = 1 where fuzzy_compare('h', 'h', 20) >= 0 and fuzzy_compare("h","h", 20) <= 1 -query select sw1 = 1 where fuzzy_compare('hi', 'hi', 20) >= 0 and fuzzy_compare("hi","hi", 20) <= 1 diff --git a/centrallix/tests/test_lev_compare_00.cmp b/centrallix/tests/test_lev_compare_00.cmp new file mode 100644 index 00000000..1c295a36 --- /dev/null +++ b/centrallix/tests/test_lev_compare_00.cmp @@ -0,0 +1,23 @@ +Attribute [case1]: string "pass" +Attribute [case2]: string "pass" +Attribute [case3]: string "pass" +Attribute [case4]: string "pass" +Attribute [case5]: string "pass" +Attribute [case6]: string "pass" +Attribute [case7]: string "pass" +Attribute [case8]: string "pass" +Attribute [case9]: string "pass" +Attribute [case10]: string "pass" +Attribute [case11]: string "pass" +Attribute [case12]: string "pass" +Attribute [case13]: string "pass" +Attribute [case14]: string "pass" +Attribute [case15]: string "pass" +Attribute [case16]: string "pass" +Attribute [case17]: string "pass" +Attribute [case18]: string "pass" +Attribute [case19]: string "pass" +Attribute [case20]: string "pass" +Attribute [case21]: string "pass" +Attribute [case22]: string "pass" +Attribute [case23]: string "pass" diff --git a/centrallix/tests/test_lev_compare_00.to b/centrallix/tests/test_lev_compare_00.to new file mode 100644 index 00000000..5d9cec0f --- /dev/null +++ b/centrallix/tests/test_lev_compare_00.to @@ -0,0 +1,28 @@ +##NAME Levenshtein String Comparison + +# Legacy tests. 
+query select case1 = condition(lev_compare('hello', 'hello!') >= 0 and lev_compare('hello','hello!') <= 1, 'pass', 'fail') +query select case2 = condition(lev_compare('hello', 'asdfkh') >= 0 and lev_compare('hello','asdfkh') <= 1, 'pass', 'fail') +query select case3 = condition(lev_compare('hello', 'aaaaaaaaaaaaaaaaa') >= 0 and lev_compare('hello','aaaaaaaaaaaaaaaaa') <= 1, 'pass', 'fail') +query select case4 = condition(lev_compare('hello', 'nope') >= 0 and lev_compare('hello', 'nope') <= 1, 'pass', 'fail') +query select case5 = condition(lev_compare('below', 'hello!') >= 0 and lev_compare('below', 'hello!') <= 1, 'pass', 'fail') +query select case6 = condition(lev_compare('kitten', 'smitten') >= 0 and lev_compare('kitten', 'smitten') <= 1, 'pass', 'fail') +query select case7 = condition(lev_compare('hello', 'bobbobbobbob') >= 0 and lev_compare('hello', 'bobbobbobbob') <= 1, 'pass', 'fail') +query select case8 = condition(lev_compare('hello', '') >= 0 and lev_compare('hello', '') <= 1, 'pass', 'fail') +query select case9 = condition(lev_compare('', '') >= 0 and lev_compare('', '') <= 1, 'pass', 'fail') +query select case10 = condition(lev_compare('blooooop', 'blob') >= 0 and lev_compare('blooooop', 'blob') <= 1, 'pass', 'fail') +query select case11 = condition(lev_compare('', '!') >= 0 and lev_compare('','!') <= 1, 'pass', 'fail') +query select case12 = condition(lev_compare('h', 'h') >= 0 and lev_compare('h','h') <= 1, 'pass', 'fail') +query select case13 = condition(lev_compare('hi', 'hi') >= 0 and lev_compare('hi','hi') <= 1, 'pass', 'fail') + +# Kitten tests. 
+query select case14 = condition(lev_compare('kitten', 'kitten') >= 0.99 and lev_compare('kitten', 'kitten') <= 1.0, 'pass', 'fail') -- 0 edits +query select case15 = condition(lev_compare('kitten', 'skitten') >= 0.8 and lev_compare('kitten', 'skitten') <= 0.9, 'pass', 'fail') -- 1 insert +query select case16 = condition(lev_compare('kitten', 'itten') >= 0.8 and lev_compare('kitten', 'itten') <= 0.9, 'pass', 'fail') -- 1 delete +query select case17 = condition(lev_compare('kitten', 'mitten') >= 0.8 and lev_compare('kitten', 'mitten') <= 0.9, 'pass', 'fail') -- 1 replace +query select case18 = condition(lev_compare('kitten', 'smitten') >= 0.7 and lev_compare('kitten', 'smitten') <= 0.8, 'pass', 'fail') -- 1 insert and one replace +query select case19 = condition(lev_compare('kitten', 'iktten') >= 0.8 and lev_compare('kitten', 'iktten') <= 0.9, 'pass', 'fail') -- 1 transpose +query select case20 = condition(lev_compare('kitten', 'kittens') >= 0.8 and lev_compare('kitten', 'kittens') <= 0.9, 'pass', 'fail') -- 1 insert (end) +query select case21 = condition(lev_compare('kitten', 'kitte') >= 0.8 and lev_compare('kitten', 'kitte') <= 0.9, 'pass', 'fail') -- 1 delete (end) +query select case22 = condition(lev_compare('kitten', 'kittem') >= 0.8 and lev_compare('kitten', 'kittem') <= 0.9, 'pass', 'fail') -- 1 replace (end) +query select case23 = condition(lev_compare('kitten', 'kittne') >= 0.8 and lev_compare('kitten', 'kittne') <= 0.9, 'pass', 'fail') -- 1 transpose (end) diff --git a/centrallix/tests/test_levenshtein_00.cmp b/centrallix/tests/test_levenshtein_00.cmp index 0bc319c9..2a084162 100644 --- a/centrallix/tests/test_levenshtein_00.cmp +++ b/centrallix/tests/test_levenshtein_00.cmp @@ -1,6 +1,18 @@ -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 2 -Attribute [sw1]: integer 2 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 +Attribute [case1]: string "pass" +Attribute [case2]: string "pass" +Attribute [case3]: string "pass" 
+Attribute [case4]: string "pass" +Attribute [case5]: string "pass" +Attribute [case6]: string "pass" +Attribute [case7]: string "pass" +Attribute [case8]: string "pass" +Attribute [case9]: string "pass" +Attribute [case10]: string "pass" +Attribute [case11]: string "pass" +Attribute [case12]: string "pass" +Attribute [case13]: string "pass" +Attribute [case14]: string "pass" +Attribute [case15]: string "pass" +Attribute [case16]: string "pass" +Attribute [case17]: string "pass" +Attribute [case18]: string "pass" diff --git a/centrallix/tests/test_levenshtein_00.to b/centrallix/tests/test_levenshtein_00.to index a666c3a4..33f78e5f 100644 --- a/centrallix/tests/test_levenshtein_00.to +++ b/centrallix/tests/test_levenshtein_00.to @@ -1,8 +1,25 @@ ##NAME Levenshtein String Comparison -query select sw1 = levenshtein('hello', 'hello!') -query select sw1 = levenshtein('kitten', 'mitten') -query select sw1 = levenshtein('kitten', 'smitten') -query select sw1 = levenshtein('lawn', 'flown') -query select sw1 = levenshtein('kitten', 'itten') -query select sw1 = levenshtein('kitten', 'skitten') +# Kitten tests. 
+query select case1 = condition(levenshtein('kitten', 'kitten') == 0, 'pass', 'fail') -- 0 edits +query select case2 = condition(levenshtein('kitten', 'skitten') == 1, 'pass', 'fail') -- 1 insert +query select case3 = condition(levenshtein('kitten', 'itten') == 1, 'pass', 'fail') -- 1 delete +query select case4 = condition(levenshtein('kitten', 'mitten') == 1, 'pass', 'fail') -- 1 replace +query select case5 = condition(levenshtein('kitten', 'smitten') == 2, 'pass', 'fail') -- 1 insert and one replace +query select case6 = condition(levenshtein('kitten', 'iktten') == 1, 'pass', 'fail') -- 1 transpose +query select case7 = condition(levenshtein('kitten', 'kittens') == 1, 'pass', 'fail') -- 1 insert (end) +query select case8 = condition(levenshtein('kitten', 'kitte') == 1, 'pass', 'fail') -- 1 delete (end) +query select case9 = condition(levenshtein('kitten', 'kittem') == 1, 'pass', 'fail') -- 1 replace (end) +query select case10 = condition(levenshtein('kitten', 'kittne') == 1, 'pass', 'fail') -- 1 transpose (end) + +# Alternate words. +query select case11 = condition(levenshtein('lawn', 'flown') == 2, 'pass', 'fail') -- 1 insert and one replace +query select case12 = condition(levenshtein('hello', 'hello!') == 1, 'pass', 'fail') -- 1 insert (end) +query select case13 = condition(levenshtein('zert', 'zerf') == 1, 'pass', 'fail') -- 1 replace (end) +query select case14 = condition(levenshtein('llearr', 'lear') == 2, 'pass', 'fail') -- 2 deletes (start & end) + +# Edge cases. +query select case15 = condition(levenshtein('', '') == 0, 'pass', 'fail') -- 0 edits +query select case16 = condition(levenshtein('This is a very long string!! I do not expect this function to need to process a string longer than this, because this string is a full 254 characters. That is pretty long. The object system limits strings to this size so we cannot make a longer string...', 'This is a very long string!! 
I do not expect this function to need to process a string longer than this, because this string is a full 254 characters. That is pretty long. The object system limits strings to this size so we cannot make a longer string...') == 0, 'pass', 'fail') -- 0 edits. +query select case17 = condition(levenshtein('This is a very long string!! I do not expect this function to need to process a string longer than this, because this string is a full 254 characters. That is pretty long. The object system limits strings to this size so we cannot make a longer string...', 'This is quite a lengthy string. I do not expect the function to compute any longer string since this one is a full 254 characters. That is plenty, even if someone adds many contact details to their record!! Thus, this test should cover most cases we see.') == 133, 'pass', 'fail') -- 133 edits. +query select case18 = condition(levenshtein('AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA', 'BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB') == 254, 'pass', 'fail') -- 254 replaces. 
diff --git a/centrallix/tests/test_similarity_00.cmp b/centrallix/tests/test_similarity_00.cmp deleted file mode 100644 index a0d29220..00000000 --- a/centrallix/tests/test_similarity_00.cmp +++ /dev/null @@ -1,5 +0,0 @@ -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 diff --git a/centrallix/tests/test_similarity_00.to b/centrallix/tests/test_similarity_00.to deleted file mode 100644 index a0942ab7..00000000 --- a/centrallix/tests/test_similarity_00.to +++ /dev/null @@ -1,7 +0,0 @@ -##NAME Text Mining String Similarity - -query select sw1 = (cos_compare('hello', 'hello') >= 0.999) and (cos_compare('hello', 'hello') <= 1) -query select sw1 = (cos_compare('hello', 'nancy') <= 0.001) and (cos_compare('hello', 'nancy') >= 0) -query select sw1 = (cos_compare('hello', 'hello world') <= 0.891) and (cos_compare('hello', 'hello world') >= 0.890) -query select sw1 = (cos_compare('hello', 'hellow') >= 0.935) and (cos_compare('hello', 'hellow') <= 0.936) -query select sw1 = (cos_compare('hello', 'hellow', 1) >= 0.935) and (cos_compare('hello', 'hellow', 1) <= 0.936)