diff --git a/.gitignore b/.gitignore index cbfe20f1..bddd6b09 100644 --- a/.gitignore +++ b/.gitignore @@ -62,3 +62,4 @@ perf.data.old .idea/ .vscode/ centrallix-os/tmp/* +centrallix-os/datasets/ diff --git a/centrallix-doc/Widgets/widgets.xml b/centrallix-doc/Widgets/widgets.xml index b6b50afd..f38f178d 100644 --- a/centrallix-doc/Widgets/widgets.xml +++ b/centrallix-doc/Widgets/widgets.xml @@ -3731,7 +3731,7 @@ myTabControl "widget/tab" The title of the column to be displayed in the header row. - The type of the column: "text", "check", or "image". "text" is a normal column, and displays the textual value of the data element. "check" displays a checkmark if the data is non-zero (integers) or for strings if the value is non-empty and not "N" or "No". "image" displays the image referred to by the pathname contained in the data value. + The type of the column: "text", "check", "image", or "progress". "text" is a normal column, and displays the textual value of the data element. "check" displays a checkmark if the data is non-zero (integers) or for strings if the value is non-empty and not "N" or "No". "image" displays the image referred to by the pathname contained in the data value. "progress" displays a progress bar, with additional fields such as bar_color, bar_textcollor, and bar_padding. width of the column. diff --git a/centrallix-lib/Makefile.in b/centrallix-lib/Makefile.in index a7197622..0daf7e56 100644 --- a/centrallix-lib/Makefile.in +++ b/centrallix-lib/Makefile.in @@ -63,10 +63,10 @@ CFLAGS=@CFLAGS@ @DEFS@ -Iinclude -DCXLIB_INTERNAL -DNM_USE_SYSMALLOC -Wall $(PRO MTCFLAGS=@CFLAGS@ @DEFS@ -Iinclude -DCXLIB_INTERNAL -DNM_USE_SYSMALLOC -Wall $(PROFILE) $(COVERAGE) -g -O0 TCFLAGS=$(patsubst -DNDEBUG,,$(CFLAGS)) -XSTATICFILES=mtask.o mtlexer.o memstr.o xarray.o xhash.o xstring.o mtsession.o newmalloc.o xhashqueue.o bdqs_transport.o xhandle.o xringqueue.o cxsec.o smmalloc.o qprintf.o strtcpy.o util.o +XSTATICFILES=mtask.o mtlexer.o memstr.o xarray.o xhash.o xstring.o mtsession.o newmalloc.o xhashqueue.o bdqs_transport.o xhandle.o xringqueue.o cxsec.o smmalloc.o clusters.o qprintf.o strtcpy.o util.o STATICFILES=$(patsubst %,src/%,$(XSTATICFILES)) -XDYNAMICFILES=mtask.lo mtlexer.lo memstr.lo xarray.lo xhash.lo xstring.lo mtsession.lo newmalloc.lo xhashqueue.lo bdqs_transport.lo xhandle.lo xringqueue.lo cxsec.lo smmalloc.lo qprintf.lo strtcpy.lo util.lo +XDYNAMICFILES=mtask.lo mtlexer.lo memstr.lo xarray.lo xhash.lo xstring.lo mtsession.lo newmalloc.lo xhashqueue.lo bdqs_transport.lo xhandle.lo xringqueue.lo cxsec.lo smmalloc.lo clusters.lo qprintf.lo strtcpy.lo util.lo DYNAMICFILES=$(patsubst %,src/%,$(XDYNAMICFILES)) INCLUDEFILES:=$(wildcard include/*.h) diff --git a/centrallix-lib/include/clusters.h b/centrallix-lib/include/clusters.h new file mode 100644 index 00000000..5aa3123e --- /dev/null +++ b/centrallix-lib/include/clusters.h @@ -0,0 +1,146 @@ +#ifndef CLUSTERS_H +#define CLUSTERS_H + +/************************************************************************/ +/* Centrallix Application Server System */ +/* Centrallix Core */ +/* */ +/* Copyright (C) 1998-2012 LightSys Technology Services, Inc. */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA */ +/* 02111-1307 USA */ +/* */ +/* A copy of the GNU General Public License has been included in this */ +/* distribution in the file "COPYING". */ +/* */ +/* Module: lib_cluster.c, lib_cluster.h */ +/* Author: Israel Fuller */ +/* Creation: September 29, 2025 */ +/* Description Clustering library used to cluster and search data with */ +/* cosine similarity and Levenshtein similarity (aka. edit */ +/* distance). Used by the "clustering driver". */ +/* For more information on how to use this library, see */ +/* string-similarity.md in the centrallix-sysdoc folder. */ +/************************************************************************/ + +#include +#include + +#ifdef CXLIB_INTERNAL +#include "xarray.h" +#else +#include "cxlib/xarray.h" +#endif + +/*** This value defines the number of dimensions used for a sparse + *** vector. The higher the number, the fewer collisions will be + *** encountered when using these vectors for cosine comparisons. + *** This is also called the vector table size, if viewing the + *** vector as a hash table of character pairs. + *** + *** 2147483629 is the signed int max, and is also a prime number. + *** Using this value ensures that the longest run of 0s will not + *** cause an int underflow with the current encoding scheme. + *** + *** Unfortunately, we can't use a number this large yet because + *** kmeans algorithm creates densely allocated centroids with + *** `CA_NUM_DIMS` dimensions, so a large number causes it to fail. + *** This, we use 251 as the largest prime number less than 256, + *** giving us a decent balance between collision reduction and + *** kmeans centroid performance/memory overhead. + ***/ +#define CA_NUM_DIMS 251 + +/// LINK ../../centrallix-sysdoc/string_comparison.md#cosine_charsets +/** The character used to create a pair with the first and last characters of a string. **/ +#define CA_BOUNDARY_CHAR (unsigned char)('a' - 1) + +/** Types. **/ +typedef int* pVector; /* Sparse vector. */ +typedef double* pCentroid; /* Dense centroid. */ +#define pCentroidSize CA_NUM_DIMS * sizeof(double) + +/** Duplocate information. **/ +typedef struct + { + void* key1; + void* key2; + double similarity; + } + Dup, *pDup; + +/** Registering all defined types for debugging. **/ +#define ca_init() \ + nmRegister(sizeof(pVector), "pVector"); \ + nmRegister(sizeof(pCentroid), "pCentroid"); \ + nmRegister(pCentroidSize, "Centroid"); \ + nmRegister(sizeof(Dup), "Dup") + +/** Edit distance function. **/ +int ca_edit_dist(const char* str1, const char* str2, const size_t str1_length, const size_t str2_length); + +/** Vector functions. **/ +pVector ca_build_vector(const char* str); +unsigned int ca_sparse_len(const pVector vector); +void ca_print_vector(const pVector vector); +void ca_free_vector(pVector sparse_vector); + +/** Kmeans function. **/ +int ca_kmeans( + pVector* vectors, + const unsigned int num_vectors, + const unsigned int num_clusters, + const unsigned int max_iter, + const double min_improvement, + unsigned int* labels, + double* vector_sims); + +/** Vector helper macros. **/ +#define ca_is_empty(vector) (vector[0] == -CA_NUM_DIMS) +#define ca_has_no_pairs(vector) \ + ({ \ + __typeof__ (vector) _v = (vector); \ + _v[0] == -172 && _v[1] == 11 && _v[2] == -78; \ + }) + +/** Comparison functions (see ca_search()). **/ +double ca_cos_compare(void* v1, void* v2); +double ca_lev_compare(void* str1, void* str2); +bool ca_eql(pVector v1, pVector v2); + +/** Similarity search functions. **/ +void* ca_most_similar( + void* target, + void** data, + const unsigned int num_data, + const double (*similarity)(void*, void*), + const double threshold); +pXArray ca_sliding_search( + void** data, + const unsigned int num_data, + const unsigned int window_size, + const double (*similarity)(void*, void*), + const double dupe_threshold, + void** maybe_keys, + pXArray dups); +pXArray ca_complete_search( + void** data, + const unsigned int num_data, + const double (*similarity)(void*, void*), + const double dupe_threshold, + void** maybe_keys, + pXArray dups); + +#endif /* End of .h file. */ diff --git a/centrallix-lib/include/glyph.h b/centrallix-lib/include/glyph.h new file mode 100644 index 00000000..4636d6b4 --- /dev/null +++ b/centrallix-lib/include/glyph.h @@ -0,0 +1,79 @@ +#ifndef GLYPH_H +#define GLYPH_H + +/************************************************************************/ +/* Centrallix Application Server System */ +/* Centrallix Core */ +/* */ +/* Copyright (C) 1998-2012 LightSys Technology Services, Inc. */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA */ +/* 02111-1307 USA */ +/* */ +/* A copy of the GNU General Public License has been included in this */ +/* distribution in the file "COPYING". */ +/* */ +/* Module: glyph.h */ +/* Author: Israel Fuller */ +/* Creation: October 27, 2025 */ +/* Description: A simple debug visualizer to make pretty patterns in */ +/* developer's terminal which can be surprisingly useful */ +/* for debugging algorithms. */ +/************************************************************************/ + +#include + +/** Uncomment to activate glyphs. **/ +/** Should not be enabled in production code on the master branch. */ +// #define ENABLE_GLYPHS + +#ifdef ENABLE_GLYPHS +#define glyph_print(s) printf("%s", s); + +/*** Initialize a simple debug visualizer to make pretty patterns in the + *** developer's terminal. Great for when you need to run a long task and + *** want a super simple way to make sure it's still working. + *** + *** @attention - Relies on storing data in variables in scope, so calling + *** glyph() requires a call to glyph_init() previously in the same scope. + *** + *** @param name The symbol name of the visualizer. + *** @param str The string printed for the visualization. + *** @param interval The number of invocations of glyph() required to print. + *** @param flush Whether to flush on output. + ***/ +#define glyph_init(name, str, interval, flush) \ + const char* vis_##name##_str = str; \ + const unsigned int vis_##name##_interval = interval; \ + const bool vis_##name##_flush = flush; \ + unsigned int vis_##name##_i = 0u; + +/*** Invoke a visualizer. + *** + *** @param name The name of the visualizer to invoke. + ***/ +#define glyph(name) \ + if (++vis_##name##_i % vis_##name##_interval == 0) \ + { \ + glyph_print(vis_##name##_str); \ + if (vis_##name##_flush) fflush(stdout); \ + } +#else +#define glyph_print(str) +#define glyph_init(name, str, interval, flush) +#define glyph(name) +#endif + +#endif /* End of .h file. */ diff --git a/centrallix-lib/include/util.h b/centrallix-lib/include/util.h index df4ba0d5..efe914d8 100644 --- a/centrallix-lib/include/util.h +++ b/centrallix-lib/include/util.h @@ -2,7 +2,7 @@ #define UTILITY_H /************************************************************************/ -/* Centrallix Application Server System */ +/* Centrallix Application Server System */ /* Centrallix Base Library */ /* */ /* Copyright (C) 1998-2011 LightSys Technology Services, Inc. */ @@ -11,24 +11,165 @@ /* GNU Lesser General Public License, Version 2.1, contained in the */ /* included file "COPYING". */ /* */ -/* Module: (util.c,.h) */ -/* Author: Micah Shennum */ -/* Date: May 26, 2011 */ -/* Description: Collection of utilities */ +/* Module: util.c, util.h */ +/* Author: Micah Shennum and Israel Fuller */ +/* Date: May 26, 2011 */ +/* Description: Collection of utilities including: */ +/* - Utilities for parsing numbers. */ +/* - The timer utility for benchmarking code. */ +/* - snprint_bytes() for formatting a byte count. */ +/* - snprint_commas_llu() for formatting large numbers. */ +/* - fprint_mem() for printing memory stats. */ +/* - min() and max() for handling numbers. */ +/* - The check functions for reliably printing debug data. */ /************************************************************************/ #ifdef __cplusplus extern "C" { #endif - int strtoi(const char *nptr, char **endptr, int base); unsigned int strtoui(const char *nptr, char **endptr, int base); + char* snprint_bytes(char* buf, const size_t buf_size, unsigned int bytes); + char* snprint_commas_llu(char* buf, size_t buf_size, unsigned long long value); + void fprint_mem(FILE* out); + + typedef struct + { + double start, total; + } + Timer, *pTimer; + + pTimer timer_init(pTimer timer); + pTimer timer_new(void); + pTimer timer_start(pTimer timer); + pTimer timer_stop(pTimer timer); + double timer_get(pTimer timer); + pTimer timer_reset(pTimer timer); + void timer_de_init(pTimer timer); + void timer_free(pTimer timer); #ifdef __cplusplus } #endif -#endif /* UTILITY_H */ +#ifndef __cplusplus +#include + +/*** @brief Returns the smaller of two values. + *** + *** @param a The first value. + *** @param b The second value. + *** @return The smaller of the two values. + *** + *** @note This macro uses GCC extensions to ensure type safety. + ***/ +#define min(a, b) \ + ({ \ + __typeof__ (a) _a = (a); \ + __typeof__ (b) _b = (b); \ + (_a < _b) ? _a : _b; \ + }) + +/*** @brief Returns the larger of two values. + *** + *** @param a The first value. + *** @param b The second value. + *** @return The larger of the two values. + *** + *** @note This macro uses GCC extensions to ensure type safety. + ***/ +#define max(a, b) \ + ({ \ + __typeof__ (a) _a = (a); \ + __typeof__ (b) _b = (b); \ + (_a > _b) ? _a : _b; \ + }) + +/** File name macro, expanding functionality like __FILE__ and __LINE__. **/ +#define __FILENAME__ \ + ({ \ + const char* last_directory = strrchr(__FILE__, '/'); \ + ((last_directory != NULL) ? last_directory + 1 : __FILE__); \ + }) + +/** Error Handling. **/ +void print_err(int code, const char* function_name, const char* file_name, const int line_number); + +/*** Ensures that developer diagnostics are printed if the result of the + *** passed function call is not zero. Not intended for user errors. + *** + *** @param result The result of the function we're checking. + *** @returns Whether the passed function succeeded. + ***/ +#define check(result) \ + ({ \ + errno = 0; /* Reset errno to prevent confusion. */ \ + __typeof__ (result) _r = (result); \ + const bool success = (_r == 0); \ + if (!success) print_err(_r, #result, __FILE__, __LINE__); \ + success; \ + }) +/*** Ensures that developer diagnostics are printed if the result of the + *** passed function call is negative. Not intended for user errors. + *** + *** @param result The result of the function we're checking. + *** @returns Whether the passed function succeeded. + ***/ +#define check_neg(result) \ + ({ \ + errno = 0; /* Reset errno to prevent confusion. */ \ + __typeof__ (result) _r = (result); \ + const bool success = (_r >= 0); \ + if (!success) print_err(_r, #result, __FILE__, __LINE__); \ + success; \ + }) + +/*** Ensures that developer diagnostics are printed if the result of the + *** passed function call is -1. Not intended for user errors. + *** + *** @param result The result of the function we're checking. + *** @returns Whether the passed function succeeded. + ***/ +#define check_weak(result) \ + ({ \ + errno = 0; /* Reset errno to prevent confusion. */ \ + __typeof__ (result) _r = (result); \ + const bool success = (_r != -1); \ + if (!success) print_err(_r, #result, __FILE__, __LINE__); \ + success; \ + }) + +/*** Ensures that developer diagnostics are printed if the result of the + *** passed function call is a NAN double. Not intended for user errors. + *** + *** @param result The result of the function we're checking. + *** @returns result + ***/ +#define check_double(result) \ + ({ \ + errno = 0; /* Reset errno to prevent confusion. */ \ + __typeof__ (result) _r = (result); \ + if (isnan(_r)) print_err(0, #result, __FILE__, __LINE__); \ + _r; \ + }) + +/*** Ensures that developer diagnostics are printed if the result of the + *** passed function call is a NULL pointer. Not intended for user errors. + *** + *** @param result The result of the function we're checking. + *** @returns result + ***/ +#define check_ptr(result) \ + ({ \ + errno = 0; /* Reset errno to prevent confusion. */ \ + __typeof__ (result) _r = (result); \ + if (_r == NULL) print_err(0, #result, __FILE__, __LINE__); \ + _r; \ + }) + +#endif /* __cplusplus */ + +#endif /* UTILITY_H */ diff --git a/centrallix-lib/include/xhash.h b/centrallix-lib/include/xhash.h index 1b5d8459..65b90057 100644 --- a/centrallix-lib/include/xhash.h +++ b/centrallix-lib/include/xhash.h @@ -1,7 +1,6 @@ #ifndef _XHASH_H #define _XHASH_H - /************************************************************************/ /* Centrallix Application Server System */ /* Centrallix Base Library */ @@ -55,6 +54,7 @@ int xhAdd(pXHashTable this, char* key, char* data); int xhRemove(pXHashTable this, char* key); char* xhLookup(pXHashTable this, char* key); int xhClear(pXHashTable this, int (*free_fn)(), void* free_arg); +int xhForEach(pXHashTable this, int (*callback_fn)(pXHashEntry, void*), void* each_arg); +int xhClearKeySafe(pXHashTable this, void (*free_fn)(pXHashEntry, void*), void* free_arg); #endif /* _XHASH_H */ - diff --git a/centrallix-lib/src/clusters.c b/centrallix-lib/src/clusters.c new file mode 100644 index 00000000..505c3c27 --- /dev/null +++ b/centrallix-lib/src/clusters.c @@ -0,0 +1,1094 @@ +/************************************************************************/ +/* Centrallix Application Server System */ +/* Centrallix Core */ +/* */ +/* Copyright (C) 1998-2012 LightSys Technology Services, Inc. */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA */ +/* 02111-1307 USA */ +/* */ +/* A copy of the GNU General Public License has been included in this */ +/* distribution in the file "COPYING". */ +/* */ +/* Module: lib_cluster.c, lib_cluster.h */ +/* Author: Israel Fuller */ +/* Creation: September 29, 2025 */ +/* Description Clustering library used to cluster and search data with */ +/* cosine similarity and Levenshtein similarity (aka. edit */ +/* distance). Used by the "clustering driver". */ +/* For more information on how to use this library, see */ +/* string-similarity.md in the centrallix-sysdoc folder. */ +/************************************************************************/ + +/** This file has additional documentation in string_similarity.md. **/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "clusters.h" +#include "newmalloc.h" +#include "util.h" +#include "xarray.h" + +/*** Gets the hash, representing a pair of ASCII characters, represented by unsigned ints. + *** Thank you to professor John Delano for this hashing algorithm. + *** + *** @param c1 The first character in the pair. + *** @param c2 The second character in the pair. + *** @returns The resulting hash. + ***/ +static unsigned int +hash_char_pair(const unsigned char c1, const unsigned char c2) + { + const double sum = (c1 * c1 * c1) + (c2 * c2 * c2); + const double scale = ((double)c1 + 1.0) / ((double)c2 + 1.0); + const unsigned int hash = (unsigned int)round(sum * scale) - 1u; + return hash % CA_NUM_DIMS; + } + +/*** An internal struct for temporarily storing character pairs while building + *** sparse vectors. + *** + *** @param c1 The first character in the character pair. + *** @param c2 The second character in the character pair. + *** @param hash The hash for the two characters, calculated by calling the + *** hash_char_pair() function (above). + **/ +typedef struct + { + unsigned char c1, c2; + unsigned int hash; + } + CharPair, *pCharPair; + +/*** Internal function to compare two character pairs to allow us to sort them + *** by hash (ascending). + *** + *** @param p1 The first pCharPair. + *** @param p2 The second pCharPair. + *** @returns An int > 0 if p1's hash is larger. + *** An int < 0 if p2's hash is larger. + *** 0 if p1 and p2 have identical hashes. + ***/ +static int +charpair_cmp(const void *p1, const void *p2) + { + const CharPair *a = p1, *b = p2; + return a->hash - b->hash; + } + +/*** Builds a vector using a string. + *** + *** Vectors are based on the frequencies of character pairs in the string. + *** Space characters and punctuation characters (see code for list) are ignored, + *** and all characters are converted to lowercase. Character 96, which is just + *** before 'a' in the ASCII table (and maps to '`') is used to make pairs on the + *** start and end of strings. The only supported characters for the passed char* + *** are spaces, punctuation, uppercase and lowercase letters, and numbers. + *** + *** This results in the following modified ASCII table: + *** ```csv + *** #, char, #, char, #, char + *** 97, a, 109, m, 121, y + *** 98, b, 110, n, 122, z + *** 99, c, 111, o, 123, 0 + *** 100, d, 112, p, 124, 1 + *** 101, e, 113, q, 125, 2 + *** 102, f, 114, r, 126, 3 + *** 103, g, 115, s, 127, 4 + *** 104, h, 116, t, 128, 5 + *** 105, i, 117, u, 129, 6 + *** 106, j, 118, v, 130, 7 + *** 107, k, 119, w, 131, 8 + *** 108, l, 120, x, 132, 9 + *** ``` + *** Thus, any number from 96 (the start/end character) to 132 ('9') is a valid + *** input to get_char_pair_hash(). + *** + *** After hashing each character pair, we add some number from 1 to 13 to the + *** coresponding dimension. However, for most names, this results in a lot of + *** zeros and a FEW positive numbers. Thus, after creating the dense vector, + *** we convert it to a sparse vector in which a negative number replaces a run + *** of that many zeros. Consider the following example: + *** + *** Dense pVector: `[1,0,0,0,3,0]` + *** + *** Sparse pVector: `[1,-3,3,-1]` + *** + *** Using these sparse vectors greatly reduces the required memory and gives + *** approximately an x5 boost to performance when traversing vectors, at the + *** cost of more algorithmically complex code. + *** + *** @param str The string to be divided into pairs and hashed to make the vector. + *** @returns The sparse vector built using the hashed character pairs. + ***/ +pVector +ca_build_vector(const char* str) + { + unsigned char* chars = NULL; + CharPair* char_pairs = NULL; + pVector sparse_vector = NULL; + pVector trimmed_sparse_vector = NULL; + + unsigned int num_chars = 0u; + chars = check_ptr(nmSysMalloc((strlen(str) + 2u) * sizeof(unsigned char))); + if (chars == NULL) goto err_free; + + /** Begin adding char pairs (in order). **/ + chars[num_chars++] = CA_BOUNDARY_CHAR; /* Starting boundary character. */ + for (const char* char_ptr = str; *char_ptr != '\0'; char_ptr++) + { + char maybe_char = *char_ptr; + if (maybe_char < 0) fprintf(stderr, "Warning: Unexpected negative char '%c' in string: \"%s\"\n", maybe_char, str); + unsigned char c = (unsigned char)maybe_char; + + /** Always consider boundary character in string. **/ + if (c != CA_BOUNDARY_CHAR) goto skip_checks; + + /** Ignore insignificant characters: spaces and punctuation. **/ + if (isspace(c)) continue; /* space, \n, \v, \f, \r */ + if (ispunct(c)) continue; /* !"#$%&'()*+,-./:;<=>?@[\]^_{|}~ */ + + skip_checks: + /** Shift numbers to the end of the lowercase letters. **/ + if ('0' <= c && c <= '9') c += 75u; + + /** Store the character. **/ + chars[num_chars++] = tolower(c); + } + chars[num_chars++] = CA_BOUNDARY_CHAR; /* Ending boundary character. */ + + /** Compute hash values for char pairs. **/ + char_pairs = check_ptr(nmSysMalloc(num_chars * sizeof(CharPair))); + if (char_pairs == NULL) goto err_free; + const unsigned int num_pairs = num_chars - 1u; + for (unsigned int i = 0u; i < num_pairs; i++) + { + /** Store characters. **/ + char_pairs[i].c1 = chars[i]; + char_pairs[i].c2 = chars[i + 1]; + + /** Hash the character pair into an index (dimension). **/ + /** Note that the passed value should always be between 97 ('a') and 132 ('9'). **/ + char_pairs[i].hash = hash_char_pair(chars[i], chars[i + 1]); + } + + /** Free unused memory. **/ + nmSysFree(chars); + chars = NULL; + + /** Sort char_pairs by hash value. **/ + qsort(char_pairs, num_pairs, sizeof(CharPair), charpair_cmp); + + /** Allocate space for the sparse vector. **/ + sparse_vector = check_ptr(nmSysMalloc((num_pairs * 2u + 1u) * sizeof(int))); + if (sparse_vector == NULL) goto err_free; + + /** Build the sparse vector. **/ + unsigned int cur = 0u, dim = 0u; + for (unsigned int i = 0u; i < num_pairs;) + { + unsigned int hash = char_pairs[i].hash; + + /** Proceed through the pairs until we find a unique hash. **/ + /** Dividing value by 2 each time reduces the impact of repeated pairs. **/ + int value = 0; + for (; i < num_pairs && char_pairs[i].hash == hash; i++) + { + value /= 2; /* Reduce impact of repeated pairs. */ + value += ((unsigned int)char_pairs[i].c1 + (unsigned int)char_pairs[i].c2) % 13u + 1u; + } + + /** Skip zeros to reach the dimension index specified by the hash. **/ + unsigned int num_zeros = hash - dim; + if (num_zeros > 0u) + { + sparse_vector[cur++] = (int)-num_zeros; + dim = hash; + } + + /** Add the value to the sparse vector. **/ + sparse_vector[cur++] = value; + dim++; + } + if (dim != CA_NUM_DIMS) sparse_vector[cur++] = -(CA_NUM_DIMS - dim); + + /** Free unused memory. **/ + nmSysFree(char_pairs); + char_pairs = NULL; + + /** Trim extra space wasted by identical hashes. **/ + trimmed_sparse_vector = check_ptr(nmSysRealloc(sparse_vector, cur * sizeof(int))); + if (trimmed_sparse_vector == NULL) goto err_free; + sparse_vector = NULL; /* Mark memory freed by nmSysRealloc() no longer valid. */ + + /** Return the result. **/ + return trimmed_sparse_vector; + + err_free: + if (trimmed_sparse_vector != NULL) nmSysFree(trimmed_sparse_vector); + if (sparse_vector != NULL) nmSysFree(sparse_vector); + if (char_pairs != NULL) nmSysFree(char_pairs); + if (chars != NULL) nmSysFree(chars); + return NULL; + } + +/*** Free memory allocated to store a sparse vector. + *** + *** @param sparse_vector The sparse vector being freed. + ***/ +void +ca_free_vector(pVector sparse_vector) + { + nmSysFree(sparse_vector); + + return; + } + +/*** Compute the length of a sparsely allocated vector. + *** + *** @param vector The vector. + *** @returns The computed length. + ***/ +unsigned int +ca_sparse_len(const pVector vector) + { + unsigned int i = 0u; + + for (unsigned int dim = 0u; dim < CA_NUM_DIMS;) + { + const int val = vector[i++]; + + /** Negative val represents -val 0s in the array, so skip that many values. **/ + if (val < 0) dim += (unsigned)(-val); + + /** We have a param_value, but we don't need to do anything with it. **/ + else dim++; + } + + return i; + } + +/*** Print the underlying implementation values sparsely allocated + *** vector (for debugging). + *** + *** @param vector The vector. + ***/ +void +ca_print_vector(const pVector vector) + { + const unsigned int len = ca_sparse_len(vector); + printf("Vector: [%d", vector[0]); + for (unsigned int i = 1u; i < len; i++) + printf(", %d", vector[i]); + printf("]"); + + return; + } + +/*** Compute the magnitude of a sparsely allocated vector. + *** + *** @param vector The vector. + *** @returns The computed magnitude. + ***/ +static double +magnitude_sparse(const pVector vector) + { + unsigned int magnitude = 0u; + + for (unsigned int i = 0u, dim = 0u; dim < CA_NUM_DIMS;) + { + const int val = vector[i++]; + + /** Negative val represents -val 0s in the array, so skip that many values. **/ + if (val < 0) dim += (unsigned)(-val); + + /** We have a param_value, so square it and add it to the magnitude. **/ + else { magnitude += (unsigned)(val * val); dim++; } + } + + return sqrt((double)magnitude); + } + +/*** Compute the magnitude of a densely allocated centroid. + *** + *** @param centroid The centroid. + *** @returns The computed magnitude. + ***/ +static double +magnitude_dense(const pCentroid centroid) + { + double magnitude = 0.0; + + for (int i = 0; i < CA_NUM_DIMS; i++) + magnitude += centroid[i] * centroid[i]; + + return sqrt(magnitude); + } + +/*** Parse a token from a sparsely allocated vector and write the param_value and + *** number of remaining values to the passed locations. + *** + *** @param token The sparse vector token being parsed. + *** @param remaining The location to save the remaining number of characters. + *** @param param_value The location to save the param_value of the token. + ***/ +static void +parse_vector_token(const int token, unsigned int* remaining, unsigned int* param_value) + { + if (token < 0) + { + /** This run contains -token zeros. **/ + *remaining = (unsigned)(-token); + *param_value = 0u; + } + else + { + /** This run contains one param_value. **/ + *remaining = 1u; + *param_value = (unsigned)(token); + } + + return; + } + +/*** Calculate the similarity on sparsely allocated vectors. Comparing + *** any string to an empty string should always return 0.5 (untested). + *** + *** @param v1 Sparse vector #1. + *** @param v2 Sparse vector #2. + *** @returns Similarity between 0 and 1 where + *** 1 indicates identical and + *** 0 indicates completely different. + ***/ +static double +sparse_similarity(const pVector v1, const pVector v2) + { + /** Calculate dot product. **/ + unsigned int vec1_remaining = 0u, vec2_remaining = 0u; + unsigned int dim = 0u, i1 = 0u, i2 = 0u, dot_product = 0u; + while (dim < CA_NUM_DIMS) + { + unsigned int val1 = 0u, val2 = 0u; + if (vec1_remaining == 0u) parse_vector_token(v1[i1++], &vec1_remaining, &val1); + if (vec2_remaining == 0u) parse_vector_token(v2[i2++], &vec2_remaining, &val2); + + /*** Accumulate the dot_product. If either vector is 0 here, + *** the total is 0 and this statement does nothing. + ***/ + dot_product += val1 * val2; + + /** Consume overlap from both runs. **/ + unsigned int overlap = min(vec1_remaining, vec2_remaining); + vec1_remaining -= overlap; + vec2_remaining -= overlap; + dim += overlap; + } + + /** Optional optimization to speed up nonsimilar vectors. **/ + if (dot_product == 0u) return 0.0; + + /** Return the difference score. **/ + return (double)dot_product / (magnitude_sparse(v1) * magnitude_sparse(v2)); + } + +/*** Calculate the difference on sparsely allocated vectors. Comparing + *** any string to an empty string should always return 0.5 (untested). + *** + *** @param v1 Sparse vector #1. + *** @param v2 Sparse vector #2. + *** @returns Similarity between 0 and 1 where + *** 1 indicates completely different and + *** 0 indicates identical. + ***/ +#define sparse_dif(v1, v2) (1.0 - sparse_similarity(v1, v2)) + +/*** Calculate the similarity between a sparsely allocated vector and a densely + *** allocated centroid using a dot product. Comparing any string to an empty + *** string should always return 0.5 (untested). + *** + *** @param v1 Sparse vector #1. + *** @param c1 Dense centroid #2. + *** @returns Similarity between 0 and 1 where + *** 1 indicates identical and + *** 0 indicates completely different. + ***/ +static double +sparse_similarity_to_centroid(const pVector v1, const pCentroid c2) + { + double dot_product = 0.0; + + for (unsigned int i = 0u, dim = 0u; dim < CA_NUM_DIMS;) + { + const int val = v1[i++]; + + /** Negative val represents -val 0s in the array, so skip that many values. **/ + if (val < 0) dim += (unsigned)(-val); + + /** We have a param_value, so square it and add it to the magnitude. **/ + else dot_product += (double)val * c2[dim++]; + } + + /** Return the difference score. **/ + return dot_product / (magnitude_sparse(v1) * magnitude_dense(c2)); + } + +/*** Calculate the difference between a sparsely allocated vector and a densely + *** allocated centroid. Comparing any string to an empty string should always + *** return 0.5 (untested). + *** + *** @param v1 Sparse vector #1. + *** @param c1 Dense centroid #2. + *** @returns Difference between 0 and 1 where + *** 1 indicates completely different and + *** 0 indicates identical. + ***/ +#define sparse_dif_to_centroid(v1, c2) (1.0 - sparse_similarity_to_centroid(v1, c2)) + +/*** Computes Levenshtein distance between two strings. + *** + *** @param str1 The first string. + *** @param str2 The second string. + *** @param length1 The length of the first string. + *** @param length1 The length of the first string. + *** @returns The edit distance between the two strings, or a negative value on error. + *** + *** @attention - `Tip`: Pass 0 for the length of either string to infer it + *** using the null terminating character. Conversely, character arrays + *** with no null terminator are allowed if an explicit length is specified. + *** + *** @attention - `Complexity`: O(nm), where n and m are the lengths of str1 + *** and str2 (respectively). + ***/ +int +ca_edit_dist(const char* str1, const char* str2, const size_t str1_length, const size_t str2_length) + { + int result = -1; + unsigned int** lev_matrix = NULL; + + /*** lev_matrix: + *** For all i and j, d[i][j] will hold the Levenshtein distance between + *** the first i characters of s and the first j characters of t. + *** + *** As they say, no dynamic programming algorithm is complete without a + *** matrix that you fill out and it has the answer in the final location. + ***/ + const size_t str1_len = (str1_length == 0u) ? strlen(str1) : str1_length; + const size_t str2_len = (str2_length == 0u) ? strlen(str2) : str2_length; + lev_matrix = check_ptr(nmSysMalloc((str1_len + 1) * sizeof(unsigned int*))); + if (lev_matrix == NULL) goto end; + for (unsigned int i = 0u; i < str1_len + 1u; i++) + { + lev_matrix[i] = check_ptr(nmSysMalloc((str2_len + 1) * sizeof(unsigned int))); + if (lev_matrix[i] == NULL) goto end; + } + + /*** Base case #0: + *** Transforming an empty string into an empty string has 0 cost. + ***/ + lev_matrix[0][0] = 0u; + + /*** Base case #1: + *** Any source prefixe can be transformed into an empty string by + *** dropping each character. + ***/ + for (unsigned int i = 1u; i <= str1_len; i++) + lev_matrix[i][0] = i; + + /*** Base case #2: + *** Any target prefixes can be transformed into an empty string by + *** inserting each character. + ***/ + for (unsigned int j = 1u; j <= str2_len; j++) + lev_matrix[0][j] = j; + + /** General Case. **/ + for (unsigned int i = 1u; i <= str1_len; i++) + { + for (unsigned int j = 1u; j <= str2_len; j++) + { + /** If the characters are equal, no change is needed. **/ + if (str1[i - 1] == str2[j - 1]) + lev_matrix[i][j] = lev_matrix[i - 1][j - 1]; + + /*** We need to make a change, so use the oppereration with the + *** lowest cost out of delete, insert, replace, or swap. + ***/ + else + { + unsigned int cost_delete = lev_matrix[i - 1][j] + 1u; + unsigned int cost_insert = lev_matrix[i][j - 1] + 1u; + unsigned int cost_replace = lev_matrix[i-1][j-1] + 1u; + + /** If a swap is possible, calculate the cost. **/ + bool can_swap = ( + i > 1 && j > 1 && + str1[i - 1] == str2[j - 2] && + str1[i - 2] == str2[j - 1] + ); + unsigned int cost_swap = (can_swap) ? lev_matrix[i - 2][j - 2] + 1 : UINT_MAX; + + /** Assign the best operation. **/ + lev_matrix[i][j] = min(min(min(cost_delete, cost_insert), cost_replace), cost_swap); + } + } + } + + /** Store result. **/ + unsigned int unsigned_result = lev_matrix[str1_len][str2_len]; + if (unsigned_result > INT_MAX) + { + fprintf(stderr, + "Warning: Integer overflow detected in ca_edit_dist(\"%s\", \"%s\", %lu, %lu) = %u > %d\n", + str1, str2, str1_length, str2_length, unsigned_result, INT_MAX + ); + } + result = (int)unsigned_result; + + /** Cleanup. **/ + end: + if (lev_matrix != NULL) + { + for (unsigned int i = 0u; i < str1_len + 1u; i++) + { + if (lev_matrix[i] == NULL) break; + else nmSysFree(lev_matrix[i]); + } + nmSysFree(lev_matrix); + } + + /** Done. **/ + return result; + } + +/*** Compares two strings using their cosie similarity, returning a value + *** between `0.0` (completely different) and `1.0` (identical). If either + *** OR BOTH strings are NULL, this function returns `0.0`. + *** + *** @attention - This function takes `void*` instead of `pVector` so that it + *** can be used as the similarity function in the ca_search() function + *** family without needing a messy typecast to avoid the compiler warning. + *** + *** @param v1 A `pVector` to the first string to compare. + *** @param v2 A `pVector` to the second string to compare. + *** @returns The cosine similarity between the two strings. + ***/ +double +ca_cos_compare(void* v1, void* v2) + { + if (v1 == v2) return 1.0; + + /** Input validation checks. **/ + const pVector vec1 = v1, vec2 = v2; + const bool v1_empty = (vec1 == NULL || ca_is_empty(vec1) || ca_has_no_pairs(vec1)); + const bool v2_empty = (vec2 == NULL || ca_is_empty(vec2) || ca_has_no_pairs(vec2)); + if (v1_empty && v2_empty) return 1.0; + if (v1_empty && !v2_empty) return 0.0; + if (!v1_empty && v2_empty) return 0.0; + + /** Apply rounding to avoid annoying floating point issues before returning. **/ + return round(sparse_similarity(vec1, vec2) * 1000000) / 1000000; + } + +/*** Compares two strings using their Levenshtein edit distance to compute a + *** similarity between `0.0` (completely different) and `1.0` (identical). + *** If both strings are empty, this function returns `1.0` (identical). If + *** either OR BOTH strings are NULL, this function returns `0.0`. + *** + *** @attention - This function takes `void*` instead of `char*` so that it + *** can be used as the similarity function in the ca_search() function + *** family without needing a messy typecast to avoid the compiler warning. + *** + *** @param str1 A `char*` to the first string to compare. + *** @param str2 A `char*` to the second string to compare. + *** @returns The levenshtein similarity between the two strings, or NAN on failure. + ***/ +double +ca_lev_compare(void* str1, void* str2) + { + /** Input validation checks. **/ + if (str1 == NULL || str2 == NULL) return 0.0; + if (str1 == str2) return 1.0; + + /** Handle string length. **/ + const size_t len1 = strlen(str1); + const size_t len2 = strlen(str2); + if (len1 == 0lu && len2 == 0lu) return 1.0; + if (len1 != 0lu && len2 == 0lu) return 0.0; + if (len1 == 0lu && len2 != 0lu) return 0.0; + + /** Compute levenshtein edit distance. **/ + const int edit_dist = ca_edit_dist((const char*)str1, (const char*)str2, len1, len2); + if (!check_neg(edit_dist)) return NAN; + + /** Normalize edit distance into a similarity measure. **/ + const double normalized_similarity = 1.0 - (double)edit_dist / (double)max(len1, len2); + + /** Apply rounding to avoid annoying floating point issues before returning. **/ + return round(normalized_similarity * 1000000) / 1000000; + } + +/*** Check if two sparse vectors are identical. + *** + *** @param v1 The first vector. + *** @param v2 The second vector. + *** @returns true if they are equal, + *** false if any element is different. + ***/ +bool +ca_eql(pVector v1, pVector v2) + { + const unsigned int len = ca_sparse_len(v1); + + for (unsigned int i = 0u; i < len; i++) + if (v1[i] != v2[i]) return false; + + return true; + } + +/*** Calculate the average size of all clusters in a set of vectors. + *** + *** @param vectors The vectors of the dataset (allocated sparsely). + *** @param num_vectors The number of vectors in the dataset. + *** @param labels The clusters to which vectors are assigned. + *** @param centroids The locations of the centroids (allocated densely). + *** @param num_clusters The number of centroids (k). + *** @returns The average cluster size. + ***/ +static double +get_cluster_size( + pVector* vectors, + const unsigned int num_vectors, + unsigned int* labels, + pCentroid* centroids, + const unsigned int num_clusters) + { + double result = NAN; + double* cluster_sums = NULL; + unsigned int* cluster_counts = NULL; + + /** Allocate space to store clusters as averages are computed. **/ + /*** We use nmMalloc() here because this function is usually called + *** repeatedly with the same number of clusters in the k-means loop. + *** Also, it is likely that k-means may be invoked multiple times with + *** the same k value, leading to additional caching benefits. + ***/ + cluster_sums = check_ptr(nmMalloc(num_clusters * sizeof(double))); + cluster_counts = check_ptr(nmMalloc(num_clusters * sizeof(unsigned int))); + if (cluster_sums == NULL) goto end; + if (cluster_counts == NULL) goto end; + for (unsigned int i = 0u; i < num_clusters; i++) + { + cluster_sums[i] = 0.0; + cluster_counts[i] = 0u; + } + + /** Sum the difference from each vector to its cluster centroid. **/ + for (unsigned int i = 0u; i < num_vectors; i++) + { + const unsigned int label = labels[i]; + cluster_sums[label] += sparse_dif_to_centroid(vectors[i], centroids[label]); + cluster_counts[label]++; + } + + /** Add up the average cluster size. **/ + double cluster_total = 0.0; + unsigned int num_valid_clusters = 0u; + for (unsigned int label = 0u; label < num_clusters; label++) + { + const unsigned int cluster_count = cluster_counts[label]; + if (cluster_count == 0u) continue; + + cluster_total += cluster_sums[label] / cluster_count; + num_valid_clusters++; + } + + /** Calculate average sizes. **/ + result = cluster_total / num_valid_clusters; + + end: + /** Clean up. **/ + if (cluster_sums != NULL) nmFree(cluster_sums, num_clusters * sizeof(double)); + if (cluster_counts != NULL) nmFree(cluster_counts, num_clusters * sizeof(unsigned int)); + + return result; + } + +/*** Executes the k-means clustering algorithm. Selects NUM_CLUSTERS random + *** vectors as initial centroids. Then points are assigned to the nearest + *** centroid, after which centroids are moved to the center of their points. + *** + *** @param vectors The vectors to cluster. + *** @param num_vectors The number of vectors to cluster. + *** @param num_clusters The number of clusters to generate. + *** @param max_iter The max number of iterations. + *** @param min_improvement The minimum amount of improvement that must be met + *** each clustering iteration. If there is less improvement, the algorithm + *** will stop. Pass any value less than -1 to fully disable this feature. + *** @param labels Stores the final cluster identities of the vectors after + *** clustering is completed. Each value will be `0 <= n < num_clusters`. + *** @param vector_sims An array of num_vectors elements, allocated by the + *** caller, where index i stores the similarity of vector i to its assigned + *** cluster. Passing NULL skips evaluation of these values. + *** + *** @attention - Assumes: num_vectors is the length of vectors. + *** @attention - Assumes: num_clusters is the length of labels. + *** + *** @attention - Issue: At larger numbers of clustering iterations, some + *** clusters have a size of negative infinity. In this implementation, + *** the bug is mitigated by setting a small number of max iterations, + *** such as 16 instead of 100. + *** @attention - Issue: Clusters do not appear to improve much after the first + *** iteration, which puts the efficacy of the algorithm into question. This + *** may be due to the uneven density of a typical dataset. However, the + *** clusters still offer useful information. + *** + *** Complexity: + *** + *** - `O(kd + k + i*(k + n*(k+d) + kd))` + *** + *** - `O(kd + k + ik + ink + ind + ikd)` + *** + *** - `O(nk + nd)` + ***/ +int +ca_kmeans( + pVector* vectors, + const unsigned int num_vectors, + const unsigned int num_clusters, + const unsigned int max_iter, + const double min_improvement, + unsigned int* labels, + double* vector_sims) + { + pCentroid* centroids = NULL; + pCentroid* new_centroids = NULL; + + /** Setup variables. **/ + bool successful = false; + unsigned int cluster_counts[num_clusters]; + memset(labels, 0u, num_vectors * sizeof(unsigned int)); + + /** Allocate space to store centroids and new_centroids. **/ + /** Dynamic allocation is required because these densely allocated arrays might be up to 500KB! **/ + const size_t centroids_size = num_clusters * sizeof(pCentroid); + centroids = check_ptr(nmMalloc(centroids_size)); + new_centroids = check_ptr(nmMalloc(centroids_size)); + if (centroids == NULL) goto end; + if (new_centroids == NULL) goto end; + memset(centroids, 0, centroids_size); + memset(new_centroids, 0, centroids_size); + for (unsigned int i = 0u; i < num_clusters; i++) + { + centroids[i] = check_ptr(nmMalloc(pCentroidSize)); + new_centroids[i] = check_ptr(nmMalloc(pCentroidSize)); + if (centroids[i] == NULL) goto end; + if (new_centroids[i] == NULL) goto end; + memset(centroids[i], 0, pCentroidSize); + memset(new_centroids[i], 0, pCentroidSize); + } + + /** Select random vectors to use as the initial centroids. **/ + srand(time(NULL)); + for (unsigned int i = 0u; i < num_clusters; i++) + { + /** Pick a random vector. **/ + const pVector vector = vectors[rand() % num_vectors]; + + /** Sparse copy the vector to expand it into a densely allocated centroid. **/ + pCentroid centroid = centroids[i]; + for (unsigned int i = 0u, dim = 0u; dim < CA_NUM_DIMS;) + { + const int token = vector[i++]; + if (token > 0) centroid[dim++] = (double)token; + else for (unsigned int j = 0u; j < (unsigned)-token; j++) centroid[dim++] = 0.0; + } + } + + /** Main kmeans loop. **/ + double old_average_cluster_size = 1.0; + for (unsigned int iter = 0u; iter < max_iter; iter++) + { + bool changed = false; + + /** Reset new centroids. **/ + for (unsigned int i = 0u; i < num_clusters; i++) + { + cluster_counts[i] = 0u; + for (unsigned int dim = 0; dim < CA_NUM_DIMS; dim++) + new_centroids[i][dim] = 0.0; + } + + /** Assign each point to the nearest centroid. **/ + for (unsigned int i = 0u; i < num_vectors; i++) + { + const pVector vector = vectors[i]; + double min_dist = DBL_MAX; + unsigned int best_centroid_label = 0u; + + /** Find nearest centroid. **/ + for (unsigned int j = 0u; j < num_clusters; j++) + { + const double dist = sparse_dif_to_centroid(vector, centroids[j]); + if (dist < min_dist) + { + min_dist = dist; + best_centroid_label = j; + } + } + + /** Update label to new centroid, if necessary. **/ + if (labels[i] != best_centroid_label) + { + labels[i] = best_centroid_label; + changed = true; + } + + /** Accumulate values for new centroid calculation. **/ + pCentroid best_centroid = new_centroids[best_centroid_label]; + for (unsigned int i = 0u, dim = 0u; dim < CA_NUM_DIMS;) + { + const int val = vector[i++]; + if (val < 0) dim += (unsigned)(-val); + else best_centroid[dim++] += (double)val; + } + cluster_counts[best_centroid_label]++; + } + + /** Stop if centroids didn't change. **/ + if (!changed) break; + + /** Update centroids. **/ + for (unsigned int i = 0u; i < num_clusters; i++) + { + if (cluster_counts[i] == 0u) continue; + pCentroid centroid = centroids[i]; + const pCentroid new_centroid = new_centroids[i]; + const unsigned int cluster_count = cluster_counts[i]; + for (unsigned int dim = 0u; dim < CA_NUM_DIMS; dim++) + centroid[dim] = new_centroid[dim] / cluster_count; + } + + /** Is there enough improvement? **/ + if (min_improvement < -1) continue; /** Skip check if it will never end the loop. **/ + const double average_cluster_size = check_double(get_cluster_size(vectors, num_vectors, labels, centroids, num_clusters)); + if (isnan(average_cluster_size)) goto end; + const double improvement = old_average_cluster_size - average_cluster_size; + if (improvement < min_improvement) break; + old_average_cluster_size = average_cluster_size; + } + + /** Compute vector similarities, if requested. **/ + if (vector_sims != NULL) + { + for (unsigned int i = 0u; i < num_vectors; i++) + vector_sims[i] = sparse_similarity_to_centroid(vectors[i], centroids[labels[i]]); + } + + /** Success. **/ + successful = true; + + end: + /** Clean up. **/ + if (centroids != NULL) + { + for (unsigned int i = 0u; i < num_clusters; i++) + { + if (centroids[i] != NULL) nmFree(centroids[i], pCentroidSize); + else break; + } + nmFree(centroids, num_clusters * sizeof(pCentroid)); + } + if (new_centroids != NULL) + { + for (unsigned int i = 0u; i < num_clusters; i++) + { + if (new_centroids[i] != NULL) nmFree(new_centroids[i], pCentroidSize); + else break; + } + nmFree(new_centroids, num_clusters * sizeof(pCentroid)); + } + + /** Return the function result code. **/ + return (successful) ? 0 : -1; + } + +/*** Finds the data that is the most similar to the target and returns + *** it if the similarity meets the threshold. + *** + *** @param target The target data to compare to the rest of the data. + *** @param data The rest of the data, compared against the target to + *** find the data that is the most similar. + *** @param num_data The number of elements in data. Specify 0 to detect + *** length on a null terminated array of data. + *** @param similarity A function which takes two data items of the type + *** of the data param and returns their similarity. + *** @param threshold The minimum similarity threshold. If the most similar + *** data does not meet this threshold, the function returns NULL. + *** @returns A pointer to the most similar piece of data found in the data + *** array, or NULL if the most similar data did not meet the threshold. + ***/ +void* +ca_most_similar( + void* target, + void** data, + const unsigned int num_data, + const double (*similarity)(void*, void*), + const double threshold) + { + void* most_similar = NULL; + double best_sim = -INFINITY; + + /** Iterate over all data options to find the one with the highest similarity. **/ + for (unsigned int i = 0u; (num_data == 0u) ? (data[i] != NULL) : (i < num_data); i++) + { + const double sim = check_double(similarity(target, data[i])); + if (isnan(sim)) continue; /* Skip failed comparison. */ + if (sim > best_sim && sim > threshold) + { + most_similar = data[i]; + best_sim = sim; + } + } + + return most_similar; + } + + +/*** Runs a sliding search over the provided data, comparing each element to + *** the following `window_size` elements, invoking the passed comparison + *** function just under `window_size * num_data` times. If any comparison + *** yields a similarity greater than the threshold, it is stored in the + *** xArray returned by this function. + *** + *** @param data The data to be searched. + *** @param num_data The number of data items in data. + *** @param window_size The size of the sliding window used for the search. + *** @param similarity A function which takes two data items of the type of + *** the data param and returns their similarity. + *** @param threshold The minimum threshold required for a duplocate to be + *** included in the returned xArray. + *** @param maybe_keys A pointer to an array of keys, with one key per data. + *** These will be used to fill in the key1 and key2 attributes for each + *** struct. If this variable is null, these values are also left null. + *** @param maybe_dups A pointer to an xArray in which dups should be found. + *** Pass NULL to allocate a new one. + *** @returns An xArray holding all of the duplocates found, or NULL if an + *** error occurs. + ***/ +pXArray +ca_sliding_search( + void** data, + const unsigned int num_data, + const unsigned int window_size, + const double (*similarity)(void*, void*), + const double threshold, + void** maybe_keys, + pXArray maybe_dups) + { + pXArray dups = maybe_dups; + + /** Allocate space for dups (if necessary). **/ + if (dups == NULL) + { + /** Guess that we will need space for num_data * 2 dups. **/ + const int guess_size = num_data * 2; + dups = check_ptr(xaNew(guess_size)); + if (dups == NULL) goto err; + } + const int num_starting_dups = dups->nItems; + + /** Search for dups. **/ + for (unsigned int i = 0u; i < num_data; i++) + { + const unsigned int window_start = i + 1u; + const unsigned int window_end = min(i + window_size, num_data); + for (unsigned int j = window_start; j < window_end; j++) + { + const double sim = check_double(similarity(data[i], data[j])); + if (isnan(sim) || sim < 0.0 || 1.0 < sim) + { + fprintf(stderr, "Invalid similarity %g %lf.\n", sim, sim); + goto err_free_dups; + } + if (sim > threshold) /* Dup found! */ + { + Dup* dup = (Dup*)check_ptr(nmMalloc(sizeof(Dup))); + if (dup == NULL) goto err_free_dups; + if (maybe_keys != NULL) + { + dup->key1 = maybe_keys[i]; + dup->key2 = maybe_keys[j]; + } + dup->similarity = sim; + if (!check_neg(xaAddItem(dups, (void*)dup))) goto err_free_dups; + } + } + } + + /** Success. **/ + return dups; + + err_free_dups: + /** Error cleanup: Free the dups that we added to the XArray. **/ + while (dups->nItems > num_starting_dups) + nmFree(dups->Items[--dups->nItems], sizeof(Dup)); + if (maybe_dups == NULL) check(xaDeInit(dups)); /* Failure ignored. */ + + err: + return NULL; + } + +/*** Runs a complete search over the provided data, comparing each element to + *** each other element, invoking the passed comparison function `num_data^2` + *** times. If any comparison yields a similarity greater than the threshold, + *** it is stored in the xArray returned by this function. + *** + *** @param data The data to be searched. + *** @param num_data The number of data items in data. + *** @param similarity A function which takes two data items of the type of + *** the data param and returns their similarity. + *** @param threshold The minimum threshold required for a duplocate to be + *** included in the returned xArray. + *** @param maybe_keys A pointer to an array of keys, with one key per data. + *** These will be used to fill in the key1 and key2 attributes for each + *** struct. If this variable is null, these values are also left null. + *** @param maybe_dups A pointer to an xArray in which dups should be found. + *** Pass NULL to allocate a new one. + *** @returns An xArray holding all of the duplocates found. If maybe_dups is + *** not NULL, this will be that xArray, to allow for chaining. + ***/ +pXArray +ca_complete_search( + void** data, + const unsigned int num_data, + const double (*similarity)(void*, void*), + const double threshold, + void** maybe_keys, + pXArray maybe_dups) + { + return ca_sliding_search(data, num_data, num_data, similarity, threshold, maybe_keys, maybe_dups); + } + +/** Scope cleanup. **/ +#undef sparse_dif +#undef sparse_dif_to_centroid diff --git a/centrallix-lib/src/mtask.c b/centrallix-lib/src/mtask.c index 9a167d72..401fbf15 100644 --- a/centrallix-lib/src/mtask.c +++ b/centrallix-lib/src/mtask.c @@ -3407,7 +3407,7 @@ netGetRemotePort(pFile net_filedesc) } -/*** NETCONNECTTCP creats a client socket and connects it to a +/*** NETCONNECTTCP creates a client socket and connects it to a *** server on a given TCP service/port and host name. The flag *** NET_U_NOBLOCK causes the request to return immediately even *** if the connection is still trying to establish. Further @@ -4265,4 +4265,3 @@ syGetSem(pSemaphore sem, int cnt, int flags) return code; } - diff --git a/centrallix-lib/src/mtlexer.c b/centrallix-lib/src/mtlexer.c index e92ea49f..39a69cc1 100644 --- a/centrallix-lib/src/mtlexer.c +++ b/centrallix-lib/src/mtlexer.c @@ -7,6 +7,7 @@ #include #include #include + #include "newmalloc.h" #include "mtask.h" #include "mtlexer.h" @@ -907,7 +908,9 @@ mlxNextToken(pLxSession this) } else { - mssError(1,"MLX","Unexpected character encountered"); + char buf[4]; + snprintf(buf, sizeof(buf), "%c", ch); // mssError() does not support %c. + mssError(1, "MLX", "Unexpected character encountered: '%s'", buf); this->TokType = MLX_TOK_ERROR; break; } @@ -1305,4 +1308,3 @@ mlxSetOffset(pLxSession this, unsigned long new_offset) return 0; } - diff --git a/centrallix-lib/src/util.c b/centrallix-lib/src/util.c index 629b59c7..9dbc804d 100644 --- a/centrallix-lib/src/util.c +++ b/centrallix-lib/src/util.c @@ -1,5 +1,5 @@ /************************************************************************/ -/* Centrallix Application Server System */ +/* Centrallix Application Server System */ /* Centrallix Base Library */ /* */ /* Copyright (C) 1998-2011 LightSys Technology Services, Inc. */ @@ -8,17 +8,29 @@ /* GNU Lesser General Public License, Version 2.1, contained in the */ /* included file "COPYING". */ /* */ -/* Module: (util.c,.h) */ -/* Author: Micah Shennum */ -/* Date: May 26, 2011 */ -/* Description: Collection of utilities */ +/* Module: util.c, util.h */ +/* Author: Micah Shennum and Israel Fuller */ +/* Date: May 26, 2011 */ +/* Description: Collection of utilities including: */ +/* - Utilities for parsing numbers. */ +/* - The timer utility for benchmarking code. */ +/* - snprint_bytes() for formatting a byte count. */ +/* - snprint_commas_llu() for formatting large numbers. */ +/* - fprint_mem() for printing memory stats. */ +/* - min() and max() for handling numbers. */ +/* - The check functions for reliably printing debug data. */ /************************************************************************/ +#include +#include +#include +#include #include - #include -#include -#include +#include +#include + +#include "newmalloc.h" #include "util.h" /** @@ -77,3 +89,204 @@ unsigned int strtoui(const char *nptr, char **endptr, int base){ //return as tmp; return (unsigned int)tmp; } + +/*** snprint_bytes() allows one to pick between CS units, where the kibibyte + *** (KiB) is 1024 bytes, and metric units where the kilobyte (KB) is 1000 bytes. + *** Fun Fact: Windows uses kibibytes, but displays them as KB. + ***/ +#define USE_METRIC false +#define N_UNITS 6u +static char* units_cs[N_UNITS] = {"bytes", "KiB", "MiB", "GiB"}; +static char* units_metric[N_UNITS] = {"bytes", "KB", "MB", "GB"}; + +/*** Displays a size in bytes using the largest unit where the result would be + *** at least 1.0. Note that units larger than GB and GiB are not supported + *** because the largest possible unsigned int is 4,294,967,295, which is + *** exactly 4 GiB (or approximately 4.29 GB). + *** + *** @param buf The buffer to which new text will be written, using snprintf(). + *** @param buf_size The amount of space in the buffer, passed to snprintf(). + *** It is recommended to have at least 12 characters available. + *** @param bytes The number of bytes, which will be formatted and written + *** to the buffer.. + *** @returns buf, for chaining. + ***/ +char* +snprint_bytes(char* buf, const size_t buf_size, unsigned int bytes) + { + char** units = (USE_METRIC) ? units_metric : units_cs; + const double unit_size = (USE_METRIC) ? 1000.0 : 1024.0; + + /** Search for the largest unit where the value would be at least 1. **/ + const double size = (double)bytes; + for (unsigned char i = N_UNITS; i >= 1u; i--) + { + const double denominator = pow(unit_size, i); + if (size >= denominator) + { + const double converted_size = size / denominator; + if (converted_size >= 100.0) + snprintf(buf, buf_size, "%.5g %s", converted_size, units[i]); + else if (converted_size >= 10.0) + snprintf(buf, buf_size, "%.4g %s", converted_size, units[i]); + else /* if (converted_size >= 1.0) - Always true. */ + snprintf(buf, buf_size, "%.3g %s", converted_size, units[i]); + return buf; + } + } + + /** None of the larger units work, so we just use bytes. **/ + snprintf(buf, buf_size, "%u %s", bytes, units[0]); + + return buf; + } +#undef nUints + +/*** Print a large number formatted with comas to a buffer. + *** + *** @param buf The buffer to print the number into. + *** @param buf_size The maximum number of characters to add to the buffer. + *** @param value The value to write into the buffer. + *** @returns `buf`, or `NULL` if `buf_size` is 0. + */ +char* +snprint_commas_llu(char* buf, size_t buf_size, unsigned long long value) + { + if (buf_size == 0) return NULL; + if (value == 0) + { + if (buf_size > 1) { buf[0] = '0'; buf[1] = '\0'; } + else buf[0] = '\0'; + return buf; + } + + char tmp[32]; + unsigned int ti = 0; + while (value > 0 && ti < sizeof(tmp) - 1) + { + if (ti % 4 == 3) tmp[ti++] = ','; + tmp[ti++] = '0' + (value % 10); + value /= 10; + } + tmp[ti] = '\0'; + + unsigned int outlen = min(ti, buf_size - 1u); + for (unsigned int i = 0u; i < outlen; i++) buf[i] = tmp[ti - i - 1]; + buf[outlen] = '\0'; + + return buf; + } + +void +fprint_mem(FILE* out) + { + FILE* fp = fopen("/proc/self/statm", "r"); + if (fp == NULL) { perror("fopen()"); return; } + + long size, resident, share, text, lib, data, dt; + if (fscanf(fp, "%ld %ld %ld %ld %ld %ld %ld", + &size, &resident, &share, &text, &lib, &data, &dt) != 7) + { + fprintf(stderr, "Failed to read memory info\n"); + check(fclose(fp)); /* Failure ignored. */ + return; + } + check(fclose(fp)); /* Failure ignored. */ + + long page_size = sysconf(_SC_PAGESIZE); // in bytes + long resident_bytes = resident * page_size; + + const size_t buf_siz = 16u; + char buf[buf_siz]; + snprint_bytes(buf, buf_siz, (unsigned int)resident_bytes); + + fprintf(out, "Memory used: %ld bytes (%s)\n", resident_bytes, buf); + fprintf(out, "Share %ldb, Text %ldb, Lib %ldb, Data %ldb\n", share, text, lib, data); + + return; + } + +static double +get_time(void) + { + struct timespec ts; + + clock_gettime(CLOCK_MONOTONIC, &ts); + + return (double)ts.tv_sec + (double)ts.tv_nsec / 1.0e9f; + } + +pTimer +timer_init(pTimer timer) + { + if (timer == NULL) return NULL; + timer->start = NAN; + timer->total = 0.0; + + return timer; + } + +pTimer +timer_new(void) + { + return timer_init(nmMalloc(sizeof(Timer))); + } + +pTimer +timer_start(pTimer timer) + { + if (!timer) return timer; + timer->start = get_time(); + + return timer; + } + +pTimer +timer_stop(pTimer timer) + { + if (!timer) return timer; + timer->total += get_time() - timer->start; + + return timer; + } + +double +timer_get(pTimer timer) + { + return (timer) ? timer->total : NAN; + } + +pTimer +timer_reset(pTimer timer) + { + return timer_init(timer); + } + +void +timer_de_init(pTimer timer) {} + +void +timer_free(pTimer timer) + { + timer_de_init(timer); + nmFree(timer, sizeof(Timer)); + + return; + } + +/*** Function for failing on error, assuming the error came from a library or + *** system function call, so that the error buffer is set to a valid value. + ***/ +void print_err(int code, const char* function_name, const char* file_name, const int line_number) + { + /** Create a descriptive error message. **/ + char error_buf[BUFSIZ]; + snprintf(error_buf, sizeof(error_buf), "%s:%d: %s failed", file_name, line_number, function_name); + + /** Print it with as much info as we can reasonably find. **/ + if (errno != 0) perror(error_buf); + else if (code != 0) fprintf(stderr, "%s (error code %d).\n", error_buf, code); + else fprintf(stderr, "%s.\n", error_buf); + + return; + } diff --git a/centrallix-lib/src/xhash.c b/centrallix-lib/src/xhash.c index afeb432b..7bf4242c 100644 --- a/centrallix-lib/src/xhash.c +++ b/centrallix-lib/src/xhash.c @@ -290,4 +290,103 @@ xhClear(pXHashTable this, int (*free_fn)(), void* free_arg) return 0; } +/*** Executes an operation on each entry of the hash table entry. + *** + *** @param this The affected hash table (passing NULL causes undefined + *** behavior). + *** @param callback_fn A callback function to be called on each hash table + *** entry. It takes 2 parameters: the current hash table entry and a void* + *** argument specified using each_arg. If any invocation of the callback + *** function returns a value other than 0, xhForEach() will immediately + *** fail, returning that value as the error code. + *** @param each_arg An additional argument which will be passed to each + *** invocation of the callback function. + *** @returns 0 if the function executes successfully. + *** 1 if the callback function is NULL. + *** n (where n != 0) if the callback function returns n. + ***/ +int +xhForEach(pXHashTable this, int (*callback_fn)(pXHashEntry, void*), void* each_arg) + { + if (callback_fn == NULL) return 1; + + for (int row = 0; row < this->nRows; row++) + { + pXHashEntry entry = (pXHashEntry)(this->Rows.Items[row]); + while (entry != NULL) + { + pXHashEntry next = entry->Next; + const int ret = callback_fn(entry, each_arg); + if (ret != 0) return ret; + entry = next; + } + } + + return 0; + } + +/*** A helper function for `xhClearKeySafe()`. Deallocates a hash table entry + *** after calling the appropriate free function with the provided free arg. + *** + *** @param entry A pointer to the hash table entry to be freed (passing NULL + *** causes undefined behavior). + *** @param arg A pointer to a void* array with 2 elements: The first element + *** is a function pointer to the free function, which we invoke using the + *** provided entry and the free_arg, specified as the second element of + *** this array. + *** @returns 0, success. + ****/ +static int +xh_i_FreeEntry(pXHashEntry entry, void* arg) + { + /*** The passed void* actually points to a void* array with 2 elements. + *** + *** The first element is a function pointer to the free function, which + *** we invoke using the provided entry and the free_arg, specified as the + *** second element of the array. + *** + *** Interestingly, you can write this code in one line like this: + *** ((void (*)(pXHashEntry, void*))((void**)arg)[0])(entry, ((void**)arg)[1]); + *** But I value code readability, so fortunately, I can't be THAT cleaver... + ***/ + void** args = (void**)arg; + void (*free_fn)(pXHashEntry, void*) = args[0]; + free_fn(entry, args[1]); + + /** Free the entry. **/ + nmFree(entry, sizeof(XHashEntry)); + + return 0; + } +/*** Clears all contents from a hash table. The free function is passed each + *** hash entry struct, allowing it to free both the value and key, if needed. + *** + *** @param this The affected hash table (passing NULL causes undefined + *** behavior). + *** @param free_fn A pointer to a free function which will be called with a + *** pointer to each `XHashEntry` before they are deallocated. It is also + *** passed a `void*`, which will be `free_arg` (the third argument). + *** @param free_arg The void pointer value passed to the free function. + *** @returns 0 if successful, or + *** -1 if `free_fn()` is `NULL`. + ***/ +int +xhClearKeySafe(pXHashTable this, void (*free_fn)(pXHashEntry, void*), void* free_arg) + { + if (free_fn == NULL) return -1; + + /** Free each row. **/ + void* args[2] = {free_fn, free_arg}; + const int ret = xhForEach(this, xh_i_FreeEntry, args); + + /** Mark all rows as empty. **/ + for (int i = 0; i < this->nRows; i++) + this->Rows.Items[i] = NULL; + this->nItems = 0; + + /*** We are successful only if the free function didn't fail (and it should + *** not be able to fail). + ***/ + return ret; + } diff --git a/centrallix-os/cluster-schema.cluster b/centrallix-os/cluster-schema.cluster new file mode 100644 index 00000000..5e11cd7c --- /dev/null +++ b/centrallix-os/cluster-schema.cluster @@ -0,0 +1,61 @@ +// Input schema +$Version=2$ +file_name "system/cluster" + { + name "cluster/parameter" + { + type : DATA_T // See datatypes.h + ?default : type // Default value for the variable. + ?name : String // Overrides the name above. + ?style : StyleObj // idk where to find docs for this. + } + // Access with :parameters:name. Accessing dynamic data (e.g. parameters) + // should be done within a runserver() call. + ... + + source : DataSourcePath + key_attr : string ⊂ DataSourcePath/columns + data_attr : string ⊂ DataSourcePath/columns + + cluster_name "cluster/cluster" + { + algorithm : "none" | "sliding-window" | "k-means" // Implemented + | "k-means++" | "k-medoids" | "db-scan" // Not implemented + similarity_measure : "cosine" | "levenshtein" // levenshtein not implemented. + num_clusters : uint > 1 // (probably a parameter) + ?min_improvement : double && 0.0 < x < 1.0 | "none" // default: 0.0001 + ?max_iterations : uint // default: 64 + ?window_size : uint > 0 // required for algorithm = sliding_window. + ?overlap_size : double && 0.0 <= x <= 1.0 // default: 0.0, only allowed for algorithm = k-means | k-means++ | k-medoids, not implemented + + // Not implemented + sub_cluster_name "cluster/cluster" + { + // Same as above. + } + } + ... + + search_name "system/search" + { + source : string ⊂ [cluster_name, ...] + similarity_measure : "cosine" | "levenshtein" + threshold : double && 0.0 < x < 1.0 // optimization. + } + ... + } + +// Output schema + +- /cluster_name + ? /sub_cluster_name + ? ... + - /{query} + - /items : StringVec // The data points in the cluster. + ... +/search_name +- /{query} + - /key1 : string // The key of the first data point. + - /key2 : string // The key of the second data point. + - /sim : double && 0.0 < x <= threshold // The similarity of the two data points. +... diff --git a/centrallix-sysdoc/ClusterDriverRequirements-old.md b/centrallix-sysdoc/ClusterDriverRequirements-old.md new file mode 100644 index 00000000..601f4170 --- /dev/null +++ b/centrallix-sysdoc/ClusterDriverRequirements-old.md @@ -0,0 +1,186 @@ + +## Cluster Driver Specifications +### Cluster Open +```c +void* clusterOpen(pObject obj, int mask, pContentType sys_type, char* usr_type, pObjTrxTree* oxt); +``` +`clusterOpen()` shall... +- Create or read a node, as indicated by passed flags. + - Read flags from `obj->Mode`. + - If `O_EXCL` is specified, `O_CREAT` is specified, and there are no other elements in the path, create a new node. + - Otherwise attempt to read the previous object (in `obj->Prev`). + - If this fails and `O_CREAT` is specified, create a new node. + - If there is still no node, fail. +- Parse the provided path. + - Use `obj_internal_PathPart()` with the pathname in `obj->Pathname`. + - Not parse previous parts of the path already parsed by other drivers. + - Start at the `obj->SubPtr`-th path element (skipping `obj->SubPtr - 1` elements). + - Consume elements in the path until `obj_internal_PathPart()` returns `NULL`. + - Store the number of elements consumed in `obj->SubCnt`. +- Determine what data is being targeted from the parsed path. + - If the relevant part of the path contains only the name of the file, the driver shall set the target to root. + - If it contains the name of a valid (sub)cluster or search, the driver shall set the target to that (sub)cluster or search. + - Otherwise, the driver shall produce a descriptive error. +- Parse the provided structure file. + - Follow the spec given in `cluster-schema.cluster`. + - Produce descriptive errors when issues are detected. +- Return a new struct containing necessary information, including: + - The name, source path, and attribute name. + - All parameters (and a param list for scope), clusters, and searches. + - Each parameter shall be represented by a `pParam` object (see `params.h`). + - Each cluster shall be represented by a struct with information including: + - Its name, clustering algorithm, and similarity measure. + - The number of clusters to generate. + - If a k-means algorithm is specified, the improvement threshold. + - The maximum number of iterations to run. + - A list of subclusters with at least this information for each. + - Each search shall be represented by a struct with information including: + - Its name, threshold, and similarity measure. + - Its source, which is a valid cluster name of a cluster in the clusters list. + - Information about targets, derived from the path. + +### Cluster Close +```c +int clusterClose(void* inf_v, pObjTrxTree* oxt); +``` +`clusterClose()` shall... +- Free all allocated data in the driver struct. +- Close any open files or the like in the driver struct. +- Return 0. + +### Cluster Open Query +```c +void* clusterOpenQuery(void* inf_v, pObjQuery query, pObjTrxTree* oxt); +``` +`clusterOpenQuery()` shall... +- Return a query struct that can be passed to `clusterQueryFetch()`. + - This struct shall contain an index to the last row accessed (starting at 0). + - This struct shall contain a pointer to the driver data. + +### Cluster Query Fetch +```c +void* clusterQueryFetch(void* qy_v, pObject obj, int mode, pObjTrxTree* oxt) +``` +`clusterQueryFetch()` shall... +- If the driver struct targets the root node, this function shall produce an error. +- If the driver struct targets an entry, this function shall produce a different error. +- If the driver targets a cluster or search, this function shall return a driver struct targetting the cluster or search *entry* (respectively) indicated by the query struct's row pointer, and increment the pointer. + - Exception: If no data remains, this function shall return `NULL` instead. + - This request shall cause clustering / searching to execute, if it has not executed already. + +### Cluster Query Close +```c +int clusterQueryClose(void* qy_v, pObjTrxTree* oxt); +``` +`clusterQueryClose()` shall... +- Free all allocated data in the query struct. +- Close any open files or the like in the query struct. +- Return 0. + +### Cluster Get Attribute Type +```c +int clusterGetAttrType(void* qy_v, pObjTrxTree* oxt); +``` +`clusterGetAttrType()` shall... +- Return the `DATA_T_...` type of the requested attribute, or `DATA_T_UNAVAILABLE` if the attribute does not exist. +- The name, content_type, inner_type, and outer_type attributes shall be of type `DATA_T_STRING`. +- The last_modification attribute shall be of type `DATA_T_DATETIME`. +- If the target is root... + - The source and attr_name attributes shall be of type `DATA_T_STRING`. +- If the target is a cluster... + - The algorithm and similarity_measure attributes shall be of type `DATA_T_STRING`. + - The num_clusters and max_iterations attributes shall be of type `DATA_T_INTEGER`. + - The improvement_threshold and average_similarity attributes shall be of type `DATA_T_DOUBLE`. +- If the target is a search... + - The source and similarity_measure attribute shall be of type `DATA_T_STRING`. + - The threshold attribute shall be of type `DATA_T_DOUBLE`. +- If the target is a cluster entry... + - The val attribute shall be of type `DATA_T_INTEGER`. + - The sim attribute shall be of type `DATA_T_DOUBLE`. +- If the target is a search entry... + - The val1 and val2 attribute shall be of type `DATA_T_INTEGER`. + - The sim attribute shall be of type `DATA_T_DOUBLE`. + +### Cluster Get Attribute Value +```c +int clusterGetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val, pObjTrxTree* _); +``` +`clusterGetAttrValue()` shall... +- If the given datatype does not match that returned from `clusterGetAttrType()`, the function shall produce an error. +- Requesting the name attribute shall produce the following values, depending on the target: + - If the target is root, the name in the driver struct (aka. the one specified in the .cluster file) shall be produced. + - If the target is a cluster or cluster entry, the name of the cluster shall be produced. + - If the target is a search or search entry, the name of the search shall be produced. +- Requesting the annotation shall produce some string describing the driver. +- Requesting the outer_type shall produce "system/row". +- Requesting the inner_type or content_type shall produce "system/void". (All path elements are consumed.) +- If the target is root... + - Requesting source shall produce the source path. + - Requesting attr_name shall produce the attribute name. +- If the target is a cluster... + - Requesting algorithm shall produce the name of the clustering algorithm. + - Requesting similarity_measure shall produce the name of the similarity measure. + - Requesting num_clusters shall produce the number of clusters. + - Requesting max_iterations shall produce the maximum number of iterations. + - Requesting improvement_threshold shall produce the minimum improvement threshold. + - Requesting average_similarity shall produce the average size of clusters, running clustering / searching algorithms, if necessary. +- If the target is a search... + - Requesting source shall produce the name of the source cluster for the search. + - Requesting similarity_measure shall produce the name of the similarity measure. + - Requesting threshold shall produce the filtering threshold. +- If the target is a cluster entry... + - Requesting val shall produce the value of the data point in this cluster. + - Requesting sim shall produce the similarity of the data point to the center of the cluster. +- If the target is a cluster entry... + - Requesting val1 or val2 shall produce the first and second value (respectively)detected in this search. + - Requesting sim shall produce the similarity of these two data points. + + +### Cluster Get First Attribute +```c +char* clusterGetFirstAttr(void* inf_v, pObjTrxTree oxt); +``` +`clusterGetFirstAttr()` shall... +- Reset the current attribute index on the driver struct to 0. +- Return the value of invoking `clusterGetNextAttr()`. + +### Cluster Get Next Attribute +```c +char* clusterGetNextAttr(void* inf_v, pObjTrxTree oxt); +``` +`clusterGetNextAttr()` shall... +- Return the attribute name at the attribute index given by the driver struct in the list of attributes based on the target type. +- Return `NULL` if the end of the list has been reached. +- Increase the attribute index on the driver struct by 1. + +- The attribute name list for a targetting root shall include "source" and "attr_name". +- The attribute name list for a targetting a cluster shall include "algorithm", "similarity_measure", "num_clusters", "improvement_threshold", and "max_iterations". +- The attribute name list for a targetting a search shall include "source", "threshold", and "similarity_measure". +- The attribute name list for a targetting a cluster entry shall include "val" and "sim". +- The attribute name list for a targetting a search entry shall include "val1", "val2", and "sim". + +### Cluster Get Next Attribute +```c +int clusterInfo(void* inf_v, pObjectInfo info); +``` +`clusterInfo()` shall... +- Provide the OBJ_INFO_F_CANT_ADD_ATTR flag. +- Provide the OBJ_INFO_F_CANT_HAVE_CONTENT flag. +- Provide the OBJ_INFO_F_NO_CONTENT flag. +- If the target is a root... + - Provide the OBJ_INFO_F_CAN_HAVE_SUBOBJ flag. + - Provide the OBJ_INFO_F_SUBOBJ_CNT_KNOWN flag. + - Provide the OBJ_INFO_F_HAS_SUBOBJ flag if there is at least one cluster or search. + - Provide the OBJ_INFO_F_NO_SUBOBJ flag otherwise. + - Provide the total number of clusters and searches as the number of subobjects. +- If the target is a cluster... + - Provide the OBJ_INFO_F_CAN_HAVE_SUBOBJ flag. + - Provide the OBJ_INFO_F_HAS_SUBOBJ flag. + - If the algorithm has been run, provide OBJ_INFO_F_SUBOBJ_CNT_KNOWN flag and the number of data points clustered as the number of subobjects. +- If the target is a search... + - Provide the OBJ_INFO_F_CAN_HAVE_SUBOBJ flag. + - If the algorithm has been run... + - Provide OBJ_INFO_F_SUBOBJ_CNT_KNOWN flag and the number of elements found by the search as the number of subobjects. + - Provide the OBJ_INFO_F_HAS_SUBOBJ flag if at least one element was found. +- If the target is a cluster entry or a search entry... + - Provide the OBJ_INFO_F_CANT_HAVE_SUBOBJ flag. \ No newline at end of file diff --git a/centrallix-sysdoc/GCC_Dependencies.md b/centrallix-sysdoc/GCC_Dependencies.md new file mode 100644 index 00000000..1327ea09 --- /dev/null +++ b/centrallix-sysdoc/GCC_Dependencies.md @@ -0,0 +1,20 @@ +# GCC Dependencies + +Author: Israel Fuller + +Date: Descember 4, 2025 + +## Table of Contents +- [GCC Dependencies](#gcc-dependencies) + - [Table of Contents](#table-of-contents) + - [Introduction](#intoduction) + - [List of Dependencies](#list-of-dependencies) + +## Intoduction +This document tracks dependencies on the GCC toolchain in the centrallix codebase. As code is added which relies on GCC specific behavior, such additions should be noted here to make possible use of a different toolchain (e.g. LLVM) in the future less painful. + +## List of Dependencies +- `util.h` Uses the `__typeof__` to avoid double-computation in macros. + +## Notes +`__FILE__` and `__LINE__` are not dependencies as they were added in C90. See [this page](https://gcc.gnu.org/onlinedocs/cpp/Standard-Predefined-Macros.html) for information about predefined macros. diff --git a/centrallix-sysdoc/Libraries/mtask.md b/centrallix-sysdoc/Libraries/mtask.md new file mode 100644 index 00000000..e3395ce0 --- /dev/null +++ b/centrallix-sysdoc/Libraries/mtask.md @@ -0,0 +1,95 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +# Handling Network Connection + +**Author**: Greg Beeley + +**Date**: January 13, 1999 + +**Updated**: December 11, 2025 + +**License**: Copyright (C) 2001-2025 LightSys Technology Services. See LICENSE.txt for more information. + + +## Table of Contents +- [Handling Network Connection](#the-mtsession-library) + - [Introduction](#introduction) + - [netConnectTCP()](#netconnecttcp) + - [netCloseTCP()](#netclosetcp) + - [fdWrite()](#fdwrite) + - [fdRead()](#fdread) + + +## Introduction +The `MTASK` module provides simple and easy TCP/IP connectivity. It includes many functions, only a few of which are documented below: + +- ⚠️ **Warning**: This documentation is incomplete, as many relevant functions are not explained here. You can help by expanding it. + + +## netConnectTCP() +```c +pFile netConnectTCP(char* host_name, char* service_name, int flags); +``` +This function creates a client socket and connects it to a server on a given TCP service/port and host name. It takes the following three parameters: +- `host_name`: The host name or ascii string for the host's ip address. +- `service_name`: The name of the service (from `/etc/services`) or its numeric representation as a string. +- `flags`: Normally left 0. + +- 📖 **Note**: The `NET_U_NOBLOCK` flag causes the function to return immediately even if the connection is still being established. Further reads and writes will block until the connection either establishes or fails. + +This function returns the connection file descriptor if successful, or `NULL` if an error occurs. + + +## netCloseTCP() +```c +int netCloseTCP(pFile net_filedesc, int linger_msec, int flags); +``` +This function closes a network connection (either a TCP listening, server, or client socket). It will also optionally waits up to `linger_msec` milliseconds (1/1000 seconds) for any data written to the connection to make it to the other end before performing the close. If `linger_msec` is set to 0, the connection is aborted (reset). The linger time can be set to 1000 msec or so if no writes were performed on the connection prior to the close. If a large amount of writes were performed immediately prior to the close, offering to linger for a few more seconds (perhaps 5 or 10 by specifying 5000 or 10000 msec) can be a good idea. + + +## fdWrite() +```c +int fdWrite(pFile filedesc, char* buffer, int length, int offset, int flags); +``` +This function writes data to an open file descriptor, from a given `buffer` and `length` of data to write. It also takes an optional seek `offset` and and `flags`, which can be zero or more of: +- `FD_U_NOBLOCK` - If the write can't be performed immediately, don't perform it at all. +- `FD_U_SEEK` - The `offset` value is valid. Seek to it before writing. Not allowed for network connections. +- `FD_U_PACKET` - *ALL* of the data specified by `length` in `buffer` must be written. Normal `write()` semantics in UNIX state that not all data has to be written, and the number of bytes actually written is returned. Setting this flag makes sure all data is really written before returning. + + +## fdRead() +```c +int fdRead(pFile filedesc, char* buffer, int maxlen, int offset, int flags); +``` +This function works the same as [`fdWrite()`](#fdwrite) except that it reads data instead of writing it. It takes the same flags as above, except that `FD_U_PACKET` now requires that all of `maxlen` bytes must be read before returning. This is good for reading a packet of a known length that might be broken up into fragments by the network (TCP/IP has a maximum frame transmission size of about 1450 bytes). diff --git a/centrallix-sysdoc/Libraries/mtsession.md b/centrallix-sysdoc/Libraries/mtsession.md new file mode 100644 index 00000000..a6c946e4 --- /dev/null +++ b/centrallix-sysdoc/Libraries/mtsession.md @@ -0,0 +1,141 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +# The MTSession Library + +**Author**: Greg Beeley + +**Date**: January 13, 1999 + +**Updated**: December 11, 2025 + +**License**: Copyright (C) 2001-2025 LightSys Technology Services. See LICENSE.txt for more information. + + +## Table of Contents +- [The MTSession Library](#the-mtsession-library) + - [Introduction](#introduction) + - [mssUserName()](#mssusername) + - [mssPassword()](#msspassword) + - [mssSetParam()](#msssetparam) + - [mssGetParam()](#mssgetparam) + - [mssError()](#msserror) + - [mssErrorErrno()](#msserrorerrno) + + +## Introduction +The mtsession (MSS) module is used for session authentication, error reporting, and for storing session-wide variables such as the current date format, username, and password (used when issuing a login request to a remote server). Care should be taken in the use of Centrallix that its coredump files are NOT in a world-readable location, as the password will be visible in the coredump file (or just ulimit the core file size to 0). + + +- ⚠️ **Warning**: This documentation is incomplete, as several relevant functions are not explained here. You can help by expanding it. + + +## mssInitialize() +```c +int mssInitialize(char* authmethod, char* authfile, char* logmethod, int logall, char* log_progname); +``` +This function initializes the session manager and sets global variables used in this module. It returns 0 if successful and -1 if an error occurs. + + +## mssUserName() +```c +char* mssUserName(); +``` +This function returns the current user name, or `NULL` an error occurs. + + +## mssPassword() +```c +char* mssPassword(); +``` +This function returns the current user's password that they used to log into Centrallix, or `NULL` an error occurs. + + +## mssSetParam() +```c +int mssSetParam(char* paramname, char* param); +``` +This function sets the session parameter of the provided name (`paramname`) to the provided value (`param`). The parameter MUST be a string value. This function returns 0 if successful, or -1 an error occurs. + + +## mssGetParam() +```c +char* mssGetParam(char* paramname); +``` +Returns the value of a session parameter of the provided name (`paramname`), or `NULL` if an error occurs. Common session parameters include: +- `dfmt`: The current date format. +- `mfmt`: The current money format. +- `textsize`: The current max text size from a read of an object's content via `objGetAttrValue(obj, "objcontent", POD(&str))` + + +## mssError() +```c +int mssError(int clr, char* module, char* message, ...); +``` +Formats and caches an error message for return to the user. This function returns 0 if successful, or -1 if an error occurred. + +| Parameter | Type | Description +| --------- | ------------- | ------------ +| crl | int | If set to 1, all previous error messages are cleared. Set this when the error is initially discovered and no other module is likely to have made a relevant `mssError()` call for the current error. +| module | char* | A two-to-five letter abbreviation of the module reporting the error. This is typically the module or driver's abbreviation prefix in full uppercase letters (although that is not required). This is intended to help the developer find the source of the error faster. +| message | char* | A string error message, accepting format specifiers like `%d` and `%s` which are supplied by the argument list, similar to `printf()`. +| ... | ... | Parameters for the formatting. + +Errors that occur inside a session context are normally stored up and not printed until other MSS module routines are called to fetch the errors. Errors occurring outside a session context (such as in Centrallix's network listener) are printed to Centrallix's standard output immediately. + +The `mssError()` function is not required to be called at every function nesting level when an error occurs. For example, if the expression compiler returns -1 indicating that a compilation error occurred, it has probably already added one or more error messages to the error list. The calling function should only call `mssError()` if doing so would provide additional context or other useful information (e.g. _What_ expression failed compilation? _Why_ as an expression being compiled? etc.). However, it is far easier to give too little information that too much, so it can often be best to air on the side of calling `mssError()` with information that might be irrelevant, rather than skipping it and leaving the developer confused. + +- 📖 **Note**: The `mssError()` routines do not cause the calling function to return or exit. The function must still clean up after itself and return an appropriate value (such as `-1` or `NULL`) to indicate failure. + +- ⚠️ **Warning**: Even if `-1` is returned, the error message may still be sent to the user in some scenarios. This is not guaranteed, though. + +- ⚠️ **Warning**: `%d` and `%s` are the ONLY supported format specifier for this function. **DO NOT** use any other format specifiers like `%lf`, `%u`, `%lu`, `%c` etc. **DO NOT** attempt to include `%%` for a percent symbol in your error message, as misplaced percent symbols often break this function. If you wish to use these features of printf, it is recommended to print the error message to a buffer and pass that buffer to `mssError()`, as follows: + ```c + char err_buf[256]; + snprintf(err_buf, sizeof(err_buf), + "Incorrect values detected: %u, %g (%lf), '%c'", + unsigned_int_value, double_value, char_value + ); + if (mssError(1, "EXMPL", "%s", err_buf) != 0) + { + fprintf(stderr, "ERROR! %s\n", err_buf); + } + return -1; + ``` + + + +## mssErrorErrno() +```c +int mssErrorErrno(int clr, char* module, char* message, ...); +``` +This function works the same way as [`mssError`](#mssError), except checks the current value of `errno` and includes a description of any error stored there. This is useful if a system call or other library function is responsible for this error. diff --git a/centrallix-sysdoc/Libraries/newmalloc.md b/centrallix-sysdoc/Libraries/newmalloc.md new file mode 100644 index 00000000..c77d90f1 --- /dev/null +++ b/centrallix-sysdoc/Libraries/newmalloc.md @@ -0,0 +1,168 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +# Memory Management in Centrallix + +**Author**: Greg Beeley + +**Date**: January 13, 1999 + +**Updated**: December 11, 2025 + +**License**: Copyright (C) 2001-2025 LightSys Technology Services. See LICENSE.txt for more information. + + +## Table of Contents +- [Memory Management in Centrallix](#objectsystem-driver-interface) + - [Introduction](#introduction) + - [nmMalloc()](#nmmalloc) + - [nmFree()](#nmfree) + - [nmStats()](#nmstats) + - [nmRegister()](#nmregister) + - [nmDebug()](#nmdebug) + - [nmDeltas()](#nmdeltas) + - [nmSysMalloc()](#nmsysmalloc) + - [nmSysRealloc()](#nmsysrealloc) + - [nmSysStrdup()](#nmsysstrdup) + - [nmSysFree()](#nmsysfree) + + +## Introduction +Centrallix has its own memory management wrapper that caches deallocated blocks of memory by size for faster reuse. This wrapper also detects double-freeing of blocks (sometimes), making debugging of memory problems just a little bit easier. + +In addition, the memory manager provides statistics on the hit ratio of allocated blocks coming from the lists vs. `malloc()`, and on how many blocks of each size/type are `malloc()`ed and cached. This information can be helpful for tracking down memory leaks. Empirical testing has shown an increase of performance of around 50% or more in programs that use newmalloc. + +One caveat is that this memory manager does not provide `nmRealloc()` function, only `nmMalloc()` and `nmFree()`. Thus, either `malloc()`, `free()`, and `realloc()` or [`nmSysMalloc()`](#nmsysmalloc), [`nmSysFree()`](#nmsysfree), and [`nmSysRealloc()`](#nmsysrealloc) should be used for blocks of memory that might vary in size. + +The newmalloc module can be accessed by adding `#include "cxlib/newmalloc.h"` to the include section of a .c file in centrallix, or `#include "newmalloc.h"` in centrallix-lib. + +- 📖 **Note**: This memory manager is usually the wrong choice for blocks of memory of arbitrary, inconsistent sizes. It is intended for allocating structures quickly that are of a specific size. For example, allocated space for a struct that is always the same size. + +- 🥱 **tl;dr**: Use `nmMalloc()` for structs, not for strings. + +- ⚠️ **Warning**: Do not mix and match, even though calling `free()` on a block obtained from `nmMalloc()` or calling `nmFree()` on a block obtained from `malloc()` might not crash the program immediately. However, it may result in either inefficient use of the memory manager, or a significant memory leak, respectively. These practices will also lead to incorrect results from the statistics and block count mechanisms. + +The newmalloc module provides the following functions: + + +## nmMalloc() +```c +void* nmMalloc(int size); +``` +This function allocates a block of the given `size`. It returns `NULL` if the memory could not be allocated. + + +## nmFree() +```c +void nmFree(void* ptr, int size); +``` +This function frees the block of memory. + +- ⚠️ **Warning**: The caller **must know the size of the block.** Getting this wrong is very bad!! For structures, this is trivial, simply use `sizeof()`, exactly the same as with `nmMalloc()`. + + +## nmStats() +```c +void nmStats(void); +``` +Prints statistics about the memory manager, for debugging and optimizing. + +For example: +``` +NewMalloc subsystem statistics: + nmMalloc: 20244967 calls, 19908369 hits (98.337%) + nmFree: 20233966 calls + bigblks: 49370 too big, 32768 largest size +``` + +- ⚠️ **Warning**: Centrallix-lib must be built with the configure option `--enable-debugging` for this function to work. Otherwise, all the stats will be zeros. + + +## nmRegister() +```c +void nmRegister(int size, char* name); +``` +Registers an inteligent name for block of the specified size. This allows the memory manager to give more information when reporting block allocation counts. A given size can have more than one name. This function is optional and not required for any production usecases, but using it can make tracking down memory leaks easier. + +This function is usually called in a module's `Initialize()` function on each of the structures the module uses internally. + + +## nmDebug() +```c +void nmDebug(void); +``` +Prints a listing of block allocation counts, giving (by size): +- The number of blocks allocated but not yet freed. +- The number of blocks in the cache. +- The total allocations for this block size. +- A list of names (from [`nmRegister()`](#nmregister)) for that block size. + + +## nmDeltas() +```c +void nmDeltas(void); +``` +Prints a listing of all blocks whose allocation count has changed, and by how much, since the last `nmDeltas()` call. This function is VERY USEFUL FOR MEMORY LEAK DETECTIVE WORK. + + +## nmSysMalloc() +```c +void* nmSysMalloc(int size); +``` +Allocates memory without using the block-caching algorithm. This is roughly equivalent to `malloc()`, but pointers returned by malloc and this function are not compatible - i.e., you cannot `free()` something that was [`nmSysMalloc()`](#nmsysmalloc)'ed, nor can you [`nmSysFree()`](#nmsysfree) something that was `malloc()`'ed. + +- 📖 **Note**: This function is much better to use on variable-sized blocks of memory. `nmMalloc()` is better for fixed-size blocks, such as for structs. + + +## nmSysRealloc() +```c +void* nmSysRealloc(void* ptr, int newsize); +``` +Changes the size of an allocated block of memory that was obtained from [`nmSysMalloc()`](#nmsysmalloc), [`nmSysRealloc()`](#nmsysrealloc), or [`nmSysStrdup()`](#nmsysstrdup). The new pointer may be different if the block needs to be moved. This is the rough equivalent of `realloc()`. + +- 📖 **Note**: If you are `realloc()`'ing a block of memory and need to store pointers to data somewhere inside the block, it is often better to store an offset rather than a full pointer. This is because a full pointer becomes invalid if a [`nmSysRealloc()`](#nmsysrealloc) causes the block to move. + + +## nmSysStrdup() +```c +char* nmSysStrdup(const char* str); +``` +Allocates memory using the [`nmSysMalloc()`](#nmsysmalloc) function and copies the string `str` into this memory. It is a rough equivalent of `strdup()`. The resulting pointer can be free'd using [`nmSysFree()`](#nmsysfree). + + +## nmSysFree() +```c +void nmSysFree(void* ptr); +``` +Frees a block of memory allocated by [`nmSysMalloc()`](#nmsysmalloc), [`nmSysRealloc()`](#nmsysrealloc), or [`nmSysStrdup()`](#nmsysstrdup). diff --git a/centrallix-sysdoc/Libraries/xarray.md b/centrallix-sysdoc/Libraries/xarray.md new file mode 100644 index 00000000..48004cc1 --- /dev/null +++ b/centrallix-sysdoc/Libraries/xarray.md @@ -0,0 +1,219 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +# The XArray Library + +**Author**: Greg Beeley + +**Date**: January 13, 1999 + +**Updated**: December 11, 2025 + +**License**: Copyright (C) 2001-2025 LightSys Technology Services. See LICENSE.txt for more information. + + +## Table of Contents +- [The XArray Library](#the-xarray-library) + - [Introduction](#introduction) + - [xaNew()](#xanew) + - [xaFree()](#xafree) + - [xaInit()](#xainit) + - [xaDeInit()](#xadeinit) + - [xaAddItem()](#xaadditem) + - [xaAddItemSorted()](#xaadditemsorted) + - [xaAddItemSortedInt32()](#xaadditemsortedint32) + - [xaGetItem()](#xagetitem) + - [xaFindItem()](#xafinditem) + - [xaFindItemR()](#xafinditemr) + - [xaRemoveItem()](#xaremoveitem) + - [xaClear()](#xaclear) + - [xaClearR()](#xaclearr) + - [xaCount()](#xacount) + - [xaInsertBefore()](#xainsertbefore) + - [xaInsertAfter()](#xainsertafter) + + +## Introduction +The xarray (xa) module is intended to manage sized growable arrays, similar to a light-weight arraylist implementation. It includes the `XArray`, which has the following fields: +- `nItems : int`: The number of items in the array. +- `nAlloc : int`: Internal variable to store the size of the allocated memory. +- `Items : void**`: The allocated array of items. + +- 📖 **Note**: Some code occasionally sets `nAlloc` to 0 after an XArray struct has been deinitialized to indicate that the relevant data is no longer allocated. Other than this, it is only used internally by the library. + +- ⚠️ **Warning**: Do not mix calls to [`xaNew()`](#xanew)/[`xaFree()`](#xafree) with calls to [`xaInit()`](#xainit)/[`xaDeInit()`](#xadeinit). Every struct allocated using new must be freed, and ever struct allocated using init must be deinitted. Mixing these calls can lead to memory leaks, bad frees, and crashes. + + +## xaNew() +```c +pXArray xaNew(int init_size); +``` +Allocates a new `XArray` struct on the heap (using [`nmMalloc()`](#nmmalloc) for caching) and returns a pointer to it, or returns `NULL` if an error occurs. + + +## xaFree() +```c +int xaFree(pXArray this); +``` +Frees a `pXArray` allocated using [`xaNew`](#xanew), returning 0 if successful or -1 if an error occurs. + + +## xaInit() +```c +int xaInit(pXArray this, int init_size); +``` +This function initializes an allocated (but uninitialized) xarray. It makes room for `init_size` items initially, but this is only an optimization. A typical value for `init_size` is 16. Remember to [`xaDeInit`](#xadeinit) this xarray, do **not** [`xaFree`](#xafree) it. + +This function returns 0 on success, or -1 if an error occurs. + + +## xaDeInit() +```c +int xaDeInit(pXArray this); +``` +This function de-initializes an xarray, but does not free the XArray structure itself. This is useful if the structure is a local variable allocated using [`xaInit()`](#xainit). + +This function returns 0 on success, or -1 if an error occurs. + +For example: +```c +XArray arr; +if (xaInit(&arr, 16) != 0) goto handle_error; + +/** Use the xarray. **/ + +if (arr.nAlloc != 0 && xaDeInit(&arr) != 0) goto handle_error; +arr.nAlloc = 0; +``` + + +## xaAddItem() +```c +int xaAddItem(pXArray this, void* item); +``` +This function adds an item to the end of the xarray. The item is assumed to be a `void*`, but this function will _not_ follow pointeres stored in the array. Thus, other types can be typecast and stored into that location (such as an `int`). + +This function returns 0 on success, or -1 if an error occurs. + + +## xaAddItemSorted() +```c +int xaAddItemSorted(pXArray this, void* item, int keyoffset, int keylen); +``` +This function adds an item to a sorted xarray while maintaining the sorted property. The value for sorting is expected to begin at the offset given by `keyoffset` and continue for `keylen` bytes. This function _will_ follow pointers are stored in the array so casting other types to store them is not allowed (as it is with [`xaAddItem()`](#xaadditem)). + + +## xaAddItemSortedInt32() +```c +int xaAddItemSortedInt32(pXArray this, void* item, int keyoffset) +``` + + + +## xaGetItem() +```c +void* xaGetItem(pXArray this, int index) +``` +This function returns an item given a specific index into the xarray, or `NULL` if the index is out of bounds. If the bounds check needs to be omitted for performance and the caller can otherwise verify that no out of bounds read is possible (e.g. because they are iterating from 0 to `xarray->nItems`), the caller should access `xarray->Items` directly. Either way, the result may need to be typecasted or stored in a variable of a specific type for it to be useable, and error checking for `NULL` values should be used. + + +## xaFindItem() +```c +int xaFindItem(pXArray this, void* item); +``` +This function returns array index for the provided item in the array, or -1 if the item could not be found. Requires an exact match, so two `void*` pointing to different memory with identical contents are not considered equal by this function. If the data is actually another datatype typecasted as a `void*`, all 8 bytes must be identical for a match. + +For example: +```c +void* data = &some_data; + +XArray xa; +xaInit(&xa, 16); + +... + +xaAddItem(&xa, data); + +... + +int item_id = xaFindItem(&xa, data); +assert(data == xa.Items[item_id]); +``` + + +## xaFindItemR() +```c +int xaFindItemR(pXArray this, void* item); +``` +This function works the same as [`xaFindItem()`](#xafinditem), however it iterates in reverse, giving a slight performance boost, especially for finding items near the end of the array. + + +## xaRemoveItem() +```c +int xaRemoveItem(pXArray this, int index) +``` +This function removes an item from the xarray at the given the index, then shifts all following items back to fill the gap created by the removal. XArray is not optimized for removing multiple items efficiently. This function returns 0 on success, or -1 if an error occurs. + + +## xaClear() +```c +int xaClear(pXArray this, int (*free_fn)(), void* free_arg); +``` +This function removes all elements from the xarray, leaving it empty. `free_fn()` is invoked on each element with a `void*` to the element to be freed as the first argument and `free_arg` as the second argument (the return value of `free_fn()` is always ignored). This function returns 0 on success (even if the `free_fn()` returns an error), or -1 if an error is detected. + + +## xaClearR() +```c +int xaClearR(pXArray this, int (*free_fn)(), void* free_arg); +``` +This function works the same as [`xaClear()`](#xaclear), except that it is slightly faster because the free function is evaluated on items in reverse order. + + +## xaCount() +```c +int xaCount(pXArray this); +``` +This function returns the number of items in the xarray, or -1 on error. It is equivalent to accessing `xarray->nItems` (although the latter expression will not return an error). + + +## xaInsertBefore() +```c +int xaInsertBefore(pXArray this, int index, void* item) +``` +This function inserts an item before the specified index, moving all following items forward to make space. The new item cannot be inserted past the end of the array. This function returns the index on success, or -1 if an error occurs. + + +## xaInsertAfter() +```c +int xaInsertAfter(pXArray this, int index, void* item) +``` +This function inserts an item after the specified index, moving all following items forward to make space. The new item cannot be inserted past the end of the array. This function returns the index on success, or -1 if an error occurs. diff --git a/centrallix-sysdoc/Libraries/xhash.md b/centrallix-sysdoc/Libraries/xhash.md new file mode 100644 index 00000000..5d4ca802 --- /dev/null +++ b/centrallix-sysdoc/Libraries/xhash.md @@ -0,0 +1,126 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +# The XHash Library + +**Author**: Greg Beeley + +**Date**: January 13, 1999 + +**Updated**: December 11, 2025 + +**License**: Copyright (C) 2001-2025 LightSys Technology Services. See LICENSE.txt for more information. + + +## Table of Contents +- [The XHash Library](#the-xhash-library) + - [Introduction](#introduction) + - [xhInitialize()](#xhinitialize) + - [xhInit()](#xhinit) + - [xhDeInit()](#xhdeinit) + - [xhAdd()](#xhadd) + - [xhRemove()](#xhremove) + - [xhLookup()](#xhlookup) + - [xhClear()](#xhclear) + - [xhForEach()](#xhforeach) + - [xhClearKeySafe()](#xhclearkeysafe) + + +## Introduction +The xhash (xh) module provides an extensible hash table interface. The hash table is a table of linked lists of items, so collisions and overflows are handled by this data structure (although excessive collisions still cause a performance loss). This implementation also supports variable-length keys for more flexible usecases. + +- ⚠️ **Warning**: All `xhXYZ()` function calls assume that the `pXHashTable this` arg points to a valid hashtable struct. All non-init functions assume that this struct has been validly initialized and has not yet been freed. If these conditions are not met, the resulting behavior is undefined. + + +## xhInitialize() +```c +int xhInitialize(); +``` +Initialize the random number table for hash computation, returning 0 on success or -1 if an error occurs. Normally, you can assume someone else has already called this during program startup. + + +## xhInit() +```c +int xhInit(pXHashTable this, int rows, int keylen); +``` +This function initializes a hash table, setting the number of rows and the key length. Specify a `keylen` of 0 for for variable length keys (aka. null-terminated strings). The `rows` should be an odd number, preferably prime (although that isn't required). `rows` **SHOULD NOT** be a power of 2. Providing this value allows the caller to optimize it based on how much data they expect to be stored in the hash table. If this value is set to 1, the hash search degenerates to a linear array search with extra overhead. Thus, the value should be large enough to comfortably accommodate the elements with minimal collisions. Typical values include 31, 251, or 255 (though 255 is not prime). + + +## xhDeInit() +```c +int xhDeInit(pXHashTable this); +``` +This function deinitializes a hash table struct, freeing all rows. Note that the stored data is not freed and neither are the keys as this data is assumed to be the responsibility of the caller. Returns 0 on success, or -1 if an error occurs. + + +## xhAdd() +```c +int xhAdd(pXHashTable this, char* key, char* data); +``` +Adds an item to the hash table, with a given key value and data pointer. Both data and key pointers must have a lifetime that exceeds the time that they item is hashed, as they are assumed to be the responsibility of the caller. This function returns 0 on success, or -1 if an error occurs. + + +## xhRemove() +```c +int xhRemove(pXHashTable this, char* key); +``` +This function removes an item with the given key value from the hash table. It returns 0 if the item was successfully removed, or -1 if an error occurs (including failing to find the item). + + +## xhLookup() +```c +char* xhLookup(pXHashTable this, char* key); +``` +This function returns a pointer to the data associated with the given key, or `NULL` if an error occurs (including failing to find the key). + + +## xhClear() +```c +int xhClear(pXHashTable this, int (*free_fn)(), void* free_arg); +``` +Clears all items from a hash table. If a `free_fn()` is provided, it will be invoked with each data pointer as the first argument and `free_arg` as the second argument as items are removed. The return value of the `free_fn()` is ignored. This function returns 0 on success (even if the `free_fn()` returns an error), or -1 if an error is detected. + + +## xhForEach() +```c +int xhForEach(pXHashTable this, int (*callback_fn)(pXHashEntry, void*), void* each_arg); +``` +This function executes an operation on each entry of the hash table entry. The provided callback function will be called with each entry (in an arbitrary order). This function is provided 2 parameters: the current hash table entry, and a `void*` argument specified using `each_arg`. If any invocation of the callback function returns a value other than 0, the `xhForEach()` will immediately fail, returning that value as the error code. + +This function returns 0 if the function executes successfully, 1 if the callback function is `NULL`, or n (where n != 0) if the callback function returns n. It does not return any error code other than 1 or any error codes returned by `callback_fn()`. + + +## xhClearKeySafe() +```c +int xhClearKeySafe(pXHashTable this, void (*free_fn)(pXHashEntry, void*), void* free_arg); +``` +This function clears all contents from the hash table. The free function is passed each hash entry struct and `free_arg`, allowing it to free both the value and key, if needed, and the free function is not allowed to return an error code. This function returns 0 for success as long as `free_fn()` is nonnull, otherwise it returns -1. diff --git a/centrallix-sysdoc/Libraries/xstring.md b/centrallix-sysdoc/Libraries/xstring.md new file mode 100644 index 00000000..4ecce289 --- /dev/null +++ b/centrallix-sysdoc/Libraries/xstring.md @@ -0,0 +1,298 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +# The XString Library + +**Author**: Greg Beeley + +**Date**: January 13, 1999 + +**Updated**: December 11, 2025 + +**License**: Copyright (C) 2001-2025 LightSys Technology Services. See LICENSE.txt for more information. + + +## Table of Contents +- [The XString Library](#the-xstring-library) + - [Introduction](#introduction) + - [xsNew()](#xsnew) + - [xsFree()](#xsfree) + - [xsInit()](#xsinit) + - [xsDeInit()](#xsdeinit) + - [xsCheckAlloc()](#xscheckalloc) + - [xsConcatenate()](#xsconcatenate) + - [xsCopy()](#xscopy) + - [xsStringEnd()](#xsstringend) + - [xsConcatPrintf()](#xsconcatprintf) + - [xsPrintf()](#xsprintf) + - [xsWrite()](#xswrite) + - [xsRTrim()](#xsrtrim) + - [xsLTrim()](#xsltrim) + - [xsTrim()](#xstrim) + - [xsFind()](#xsfind) + - [xsFindRev()](#xsfindrev) + - [xsSubst()](#xssubst) + - [xsReplace()](#xsreplace) + - [xsInsertAfter()](#xsinsertafter) + - [xsGenPrintf_va()](#xsgenprintf_va) + - [xsGenPrintf()](#xsgenprintf) + - [xsString()](#xsstring) + - [xsLength()](#xslength) + - [xsQPrintf_va(), xsQPrintf(), & xsConcatQPrintf()](#xsqprintf_va-xsqprintf--xsconcatqprintf) + + +## Introduction +The xstring (xs) module is used for managing growable strings. It is based on a structure containing a small initial string buffer to avoid string allocations for small strings. However, it can also perform `realloc()` operations to extend the string space for storing incrementally larger strings. This module allows for strings to contain arbitrary data, even NULL (`'\0'`) characters mid-string. Thus, it can also be used as an extensible buffer for arbitrary binary data. + +- 📖 **Note**: The contents of the XString can be easily referenced with the `xstring->String` field in the xstring struct. + +- ⚠️ **Warning**: Do not mix calls to [`xsNew()`](#xsnew)/[`xsFree()`](#xsfree) with calls to [`xsInit()`](#xsinit)/[`xsDeInit()`](#xsdeinit). Every struct allocated using new must be freed, and ever struct allocated using init must be deinitted. Mixing these calls can lead to memory leaks, bad frees, and crashes. + + +## xsNew() +```c +pXString xsNew() +``` +This function allocates a new XString structure to contain a new, empty string. It uses [`nmMalloc()`](#nmmalloc) because the XString struct is always a consistant size. This function returns a pointer to the new string if successful, or `NULL` if an error occurs. + + +## xsFree() +```c +void xsFree(pXString this); +``` +This function frees an XString structure allocated with [`xsNew()`](#xsnew), freeing all associated memory. + + +## xsInit() +```c +int xsInit(pXString this); +``` +This function initializes an XString structure to contain a new, empty string. This function returns 0 if successful, or -1 if an error occurs. + + +## xsDeInit() +```c +int xsDeInit(pXString this); +``` +This function deinitializes an XString structure allocated with [`xsInit()`](#xsinit), freeing all associated memory. This function returns 0 if successful, or -1 if an error occurs. + + +## xsCheckAlloc() +```c +int xsCheckAlloc(pXString this, int addl_needed); +``` +This function will optionally allocate more memory, if needed, given the currently occupied data area and the additional space required (specified with `addl_needed`). This function returns 0 if successful, or -1 if an error occurs. + + +## xsConcatenate() +```c +int xsConcatenate(pXString this, char* text, int len); +``` +This function concatenates the `text` string onto the end of the XString's value. If `len` is set, that number of characters are copied, including possible null characters (`'\0'`). If `len` is -1, all data up to the null-terminater is copied. This function returns 0 if successful, or -1 if an error occurs. + +- ⚠️ **Warning**: Do not store pointers to values within the string while adding text to the end of the string. The string may be reallocated to increase space, causing such pointers to break. Instead, use offset indexes into the string and calculate pointers on demand with `xs->String + offset`. + + For example, **DO NOT**: + ```c + XString xs; + if (xsInit(&xs) != 0) goto handle_error; + + if (xsConcatenate(&xs, "This is the first sentence. ", -1) != 0) goto handle_error; + char* ptr = xsStringEnd(&xs); /* Stores string pointer! */ + if (xsConcatenate(&xs, "This is the second sentence.", -1) != 0) goto handle_error; + + /** Print will probably read invalid memory. **/ + printf("A pointer to the second sentence is '%s'\n", ptr); + + ... + + if (xsDeInit(&xs) != 0) goto handle_error; + ``` + + Instead, use indexes and pointer arithmetic like this: + ```c + XString xs; + if (xsInit(&xs) != 0) goto handle_error; + + if (xsConcatenate(&xs, "This is the first sentence. ", -1) != 0) goto handle_error; + int offset = xsStringEnd(&xs) - xs->String; /* Stores index offset. */ + if (xsConcatenate(&xs, "This is the second sentence.", -1) != 0) goto handle_error; + + /** Print will probably work fine. **/ + printf("A pointer to the second sentence is '%s'\n", xs->String + offset); + + ... + + if (xsDeInit(&xs) != 0) goto handle_error; + ``` + + +## xsCopy() +```c +int xsCopy(pXString this, char* text, int len); +``` +This function copies the string `text` into the XString, overwriting any previous contents. This function returns 0 if successful, or -1 if an error occurs. + + +## xsStringEnd() +```c +char* xsStringEnd(pXString this); +``` +This function returns a pointer to the end of the string. This function is more efficient than searching for a null-terminator using `strlen()` because the xs module already knows the string length. Furthermore, since some string may contain nulls, using `strlen()` may produce an incorrect result. + + +## xsConcatPrintf() +```c +int xsConcatPrintf(pXString this, char* fmt, ...); +``` +This function prints additional data onto the end of the string. It is similar to printf, however, only the following features are supported: +- `%s`: Add a string (`char*`). +- `%d`: Add a number (`int`). +- `%X`: Add something? +- `%%`: Add a `'%'` character. +Attempting to use other features of printf (such as `%lf`, `%c`, `%u`, etc.) will cause unexpected results. + +This function returns 0 if successful, or -1 if an error occurs. + + +## xsPrintf() +```c +int xsPrintf(pXString this, char* fmt, ...); +``` +This function works the same as [`xsConcatPrintf()`](#xsconcatprintf), except that it overwrites the previous string instead of appending to it. This function returns 0 if successful, or -1 if an error occurs. + + +## xsWrite() +```c +int xsWrite(pXString this, char* buf, int len, int offset, int flags); +``` +This function writes data into the xstring, similar to using the standard fdWrite or objWrite API. This function can thus be used as a value for `write_fn`, for those functions that require this (such as the `expGenerateText()` function). This function returns `len` if successful, or -1 if an error occurs. + + +## xsRTrim() +```c +int xsRTrim(pXString this); +``` +This function trims whitespace characters (spaces, tabs, newlines, and line feeds) from the right side of the xstring. This function returns 0 if successful, or -1 if an error occurs. + + +## xsLTrim() +```c +int xsLTrim(pXString this); +``` +This function trims whitespace characters (spaces, tabs, newlines, and line feeds) from the left side of the xstring. This function returns 0 if successful, or -1 if an error occurs. + + +## xsTrim() +```c +int xsTrim(pXString this); +``` +This function trims whitespace characters (spaces, tabs, newlines, and line feeds) from both sides of the xstring. This function returns 0 if successful, or -1 if an error occurs. + + +## xsFind() +```c +int xsFind(pXString this, char* find, int findlen, int offset) +``` +This function searches for a specific string (`find`) in the xstring, starting at the provided `offset`. `findlen` is the length of the provided string, allowing it to include null characters (pass -1 to have the length calculated using `strlen(find)`). This function returns the index where the string was found if successful, or -1 if an error occurs (including the string not being found). + + +## xsFind() +```c +int xsFindRev(pXString this, char* find, int findlen, int offset) +``` +This function works the same as [`xsFind()`](#xsfind) except that it searches from the end of the string, resulting in better performance if the value is closer to the end of the string. This function returns the index where the string was found if successful, or -1 if an error occurs (including the string not being found). + + +## xsSubst() +```c +int xsSubst(pXString this, int offset, int len, char* rep, int replen) +``` +This function substitutes a string into a given position in an xstring. This does not search for matches as with [`xsReplace()`](#xsrepalce), instead the position (`offset`) and length (`len`) must be specified. Additionally, the length of the replacement string (`replen`) can be specified handle null characters. Both `len` and `replen` can be left blank to generate them using `strlen()`. This function returns 0 if successful, or -1 if an error occurs. + + +## xsReplace() +```c +int xsReplace(pXString this, char* find, int findlen, int offset, char* rep, int replen); +``` +This function searches an xString for the specified string (`find`) and replaces that string with another specified string (`rep`). Both strings can have their length specified (`findlen` and `replen` respectively), or left as -1 to generate it using `strlen()`. This function returns the starting offset of the replace if successful, or -1 if an error occurs (including the string not being found). + + +## xsInsertAfter() +```c +int xsInsertAfter(pXString this, char* ins, int inslen, int offset); +``` +This function inserts the specified string (`ins`) at offset (`offset`). The length of the string can be specified (`inslen`), or left as -1 to generate it using `strlen()`. This function returns the new offset after the insertion (i.e. `offset + inslen`), or -1 if an error occurs. + + +## xsGenPrintf_va() +```c +int xsGenPrintf_va(int (*write_fn)(), void* write_arg, char** buf, int* buf_size, const char* fmt, va_list va); +``` +This function performs a `printf()` operation to an `xxxWrite()` style function. + +In the wise words of Greg Beeley from 2002: +> This routine isn't really all that closely tied to the XString module, but this seemed to be the best place for it. If a `buf` and `buf_size` are supplied (`NULL` otherwise), then `buf` MUST be allocated with the `nmSysMalloc()` routine. Otherwise, **kaboom!** This routine will grow `buf` if it is too small, and will update `buf_size` accordingly. + +This function returns the printed length (>= 0) on success, or -(errno) if an error occurs. + + +## xsGenPrintf() +```c +int xsGenPrintf(int (*write_fn)(), void* write_arg, char** buf, int* buf_size, const char* fmt, ...); +``` +This function works the same as [`xsGenPrintf_va()`](#xsgenprintf_va), but with a more convenient signature for the developer. + + +## xsString() +```c +char* xsString(pXString this); +``` +This function returns the stored string after checking for various errors, or returns `NULL` if an error occurs. + + +## xsLength() +```c +xsLength(pXString this); +``` +This function returns the length of the string in constant time (since this value is stored in `this->Length`) checking for various errors, or returns `NULL` if an error occurs. + + + + +## xsQPrintf_va(), xsQPrintf(), & xsConcatQPrintf() +```c +int xsQPrintf_va(pXString this, char* fmt, va_list va); +int xsQPrintf(pXString this, char* fmt, ...); +int xsConcatQPrintf(pXString this, char* fmt, ...); +``` +These functions use the `QPrintf` to add data to an xstring. They return 0 on success, or some other value on failure. diff --git a/centrallix-sysdoc/OSDriver_Authoring.md b/centrallix-sysdoc/OSDriver_Authoring.md index c167fce2..4b90a117 100644 --- a/centrallix-sysdoc/OSDriver_Authoring.md +++ b/centrallix-sysdoc/OSDriver_Authoring.md @@ -1,794 +1,1237 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + # ObjectSystem Driver Interface -Author: Greg Beeley -Date: January 13, 1999 +**Author**: Greg Beeley + +**Date**: January 13, 1999 + +**Imported**: August 13, 2001 + +**Updated**: December 11, 2025 + +**License**: Copyright (C) 2001-2025 LightSys Technology Services. See LICENSE.txt for more information. -Updated: March 9, 2011 -License: Copyright (C) 2001-2011 LightSys Technology Services. See LICENSE.txt for more information. ## Table of Contents - [ObjectSystem Driver Interface](#objectsystem-driver-interface) - [Table of Contents](#table-of-contents) - [I Introduction](#i-introduction) - [II Interface](#ii-interface) - - [A. Initialization](#a--initialization) - - [B. Opening And Closing Objects](#b--opening-and-closing-objects) - - [C. Creating and Deleting Objects.](#c--creating-and-deleting-objects) - - [D. Reading and Writing Object Content.](#d--reading-and-writing-object-content) - - [E. Querying for Child Objects.](#e--querying-for-child-objects) - - [F. Managing Object Attributes](#f--managing-object-attributes) - - [G. Managing Object Methods](#g--managing-object-methods) + - [Abbreviation Prefix](#abbreviation-prefix) + - [Internal Functions](#internal-functions) + - [Function: Initialize()](#function-initialize) + - [Function: Open()](#function-open) + - [Function: OpenChild()](#function-openchild) + - [Function: Close()](#function-close) + - [Function: Create()](#function-create) + - [Function: Delete()](#function-delete) + - [Function: DeleteObj()](#function-deleteobj) + - [Function: Read()](#function-read) + - [Function: Write()](#function-write) + - [Function: OpenQuery()](#function-openquery) + - [Function: QueryDelete()](#function-querydelete) + - [Function: QueryFetch()](#function-queryfetch) + - [Function: QueryCreate()](#function-querycreate) + - [Function: QueryClose()](#function-queryclose) + - [Function: GetAttrType()](#function-getattrtype) + - [Function: GetAttrValue()](#function-getattrvalue) + - [Function: GetFirstAttr()](#function-getfirstattr--getnextattr) + - [Function: GetNextAttr()](#function-getfirstattr--getnextattr) + - [Function: SetAttrValue()](#function-setattrvalue) + - [Function: AddAttr()](#function-addattr) + - [Function: OpenAttr()](#function-openattr) + - [Function: GetFirstMethod()](#function-getfirstmethod--getnextmethod) + - [Function: GetNextMethod()](#function-getfirstmethod--getnextmethod) + - [Function: ExecuteMethod()](#function-executemethod) + - [Function: PresentationHints()](#function-presentationhints) + - [Function: Info()](#function-info) + - [Function: Commit()](#function-commit) + - [Function: GetQueryCoverageMask()](#function-getquerycoveragemask) + - [Function: GetQueryIdentityPath()](#function-getqueryidentitypath) - [III Reading the Node Object](#iii-reading-the-node-object) - - [pSnNode snReadNode(pObject obj)](#psnnode-snreadnodepobject-obj) - - [pSnNode snNewNode(pObject obj, char* content_type)](#psnnode-snnewnodepobject-obj-char-content_type) - - [int snWriteNode(pSnNode node)](#int-snwritenodepsnnode-node) - - [int snDeleteNode(pSnNode node)](#int-sndeletenodepsnnode-node) - - [int snGetSerial(pSnNode node)](#int-sngetserialpsnnode-node) - - [pStructInf stParseMsg(pFile inp_fd, int flags)](#pstructinf-stparsemsgpfile-inp_fd-int-flags) - - [pStructInf stParseMsgGeneric(void* src, int (*read_fn)(), int flags)](#pstructinf-stparsemsggenericvoid-src-int-read_fn-int-flags) - - [int stGenerateMsg(pFile out_fd, pStructInf info, int flags)](#int-stgeneratemsgpfile-out_fd-pstructinf-info-int-flags) - - [int stGenerateMsgGeneric(void* dst, int (*write_fn)(), pStructInf info, int flags)](#int-stgeneratemsggenericvoid-dst-int-write_fn-pstructinf-info-int-flags) - - [pStructInf stCreateStruct(char* name, char* type)](#pstructinf-stcreatestructchar-name-char-type) - - [pStructInf stAddAttr(pStructInf inf, char* name)](#pstructinf-staddattrpstructinf-inf-char-name) - - [pStructInf stAddGroup(pStructInf inf, char* name, char* type)](#pstructinf-staddgrouppstructinf-inf-char-name-char-type) - - [int stAddValue(pStructInf inf, char* strval, int intval)](#int-staddvaluepstructinf-inf-char-strval-int-intval) - - [pStructInf stLookup(pStructInf inf, char* name)](#pstructinf-stlookuppstructinf-inf-char-name) - - [int stAttrValue(pStructInf inf, int* intval, char** strval, int nval)](#int-stattrvaluepstructinf-inf-int-intval-char-strval-int-nval) - - [int stFreeInf(pStructInf this)](#int-stfreeinfpstructinf-this) - - [IV Memory Management in Centrallix](#iv-memory-management-in-centrallix) - - [void* nmMalloc(int size)](#void-nmmallocint-size) - - [void nmFree(void* ptr, int size)](#void-nmfreevoid-ptr-int-size) - - [void nmStats()](#void-nmstats) - - [void nmRegister(int size, char* name)](#void-nmregisterint-size-char-name) - - [void nmDebug()](#void-nmdebug) - - [void nmDeltas()](#void-nmdeltas) - - [void* nmSysMalloc(int size)](#void-nmsysmallocint-size) - - [void nmSysFree(void* ptr)](#void-nmsysfreevoid-ptr) - - [void* nmSysRealloc(void* ptr, int newsize)](#void-nmsysreallocvoid-ptr-int-newsize) - - [char* nmSysStrdup(const char* str)](#char-nmsysstrdupconst-char-str) - - [V Other Utility Modules](#v-other-utility-modules) - - [A. XArray (XA) - Arrays](#axarray-xa---arrays) - - [xaInit(pXArray this, int init_size)](#xainitpxarray-this-int-init_size) - - [xaDeInit(pXArray this)](#xadeinitpxarray-this) - - [xaAddItem(pXArray this, void* item)](#xaadditempxarray-this-void-item) - - [xaAddItemSorted(pXArray this, void* item, int keyoffset, int keylen)](#xaadditemsortedpxarray-this-void-item-int-keyoffset-int-keylen) - - [xaFindItem(pXArray this, void* item)](#xafinditempxarray-this-void-item) - - [xaRemoveItem(pXArray this, int index)](#xaremoveitempxarray-this-int-index) - - [B. XHash (XH) - Hash Tables](#bxhash-xh---hash-tables) - - [int xhInit(pXHashTable this, int rows, int keylen)](#int-xhinitpxhashtable-this-int-rows-int-keylen) - - [int xhDeInit(pXHashTable this)](#int-xhdeinitpxhashtable-this) - - [int xhAdd(pXHashTable this, char* key, char* data)](#int-xhaddpxhashtable-this-char-key-char-data) - - [int xhRemove(pXHashTable this, char* key)](#int-xhremovepxhashtable-this-char-key) - - [char* xhLookup(pXHashTable this, char* key)](#char-xhlookuppxhashtable-this-char-key) - - [int xhClear(pXHashTable this, int free_blk)](#int-xhclearpxhashtable-this-int-free_blk) - - [C. XString (XS) - Strings](#cxstring-xs---strings) - - [int xsInit(pXString this)](#int-xsinitpxstring-this) - - [int xsDeInit(pXString this)](#int-xsdeinitpxstring-this) - - [int xsConcatenate(pXString this, char* text, int len)](#int-xsconcatenatepxstring-this-char-text-int-len) - - [int xsCopy(pXString this, char* text, int len)](#int-xscopypxstring-this-char-text-int-len) - - [char* xsStringEnd(pXString this)](#char-xsstringendpxstring-this) - - [D. Expression (EXP) - Expression Trees](#dexpression-exp---expression-trees) - - [pExpression expCompileExpression(char* text, pParamObjects objlist, int lxflags, int cmpflags)](#pexpression-expcompileexpressionchar-text-pparamobjects-objlist-int-lxflags-int-cmpflags) - - [expFreeExpression(pExpression this)](#expfreeexpressionpexpression-this) - - [int expEvalTree(pExpression this, pParamObjects objlist)](#int-expevaltreepexpression-this-pparamobjects-objlist) - - [pParamObjects expCreateParamList()](#pparamobjects-expcreateparamlist) - - [int expFreeParamList(pParamObjects this)](#int-expfreeparamlistpparamobjects-this) - - [int expAddParamToList(pParamObjects this, char* name, pObject obj, int flags)](#int-expaddparamtolistpparamobjects-this-char-name-pobject-obj-int-flags) - - [int expModifyParam(pParamObjects this, char* name, pObject replace_obj)](#int-expmodifyparampparamobjects-this-char-name-pobject-replace_obj) - - [int expRemoveParamFromList(pParamObjects this, char* name)](#int-expremoveparamfromlistpparamobjects-this-char-name) - - [int expReverseEvalTree(pExpression tree, pParamObjects objlist)](#int-expreverseevaltreepexpression-tree-pparamobjects-objlist) - - [E. MTSession (MSS) - Basic Session Management](#emtsession-mss---basic-session-management) - - [char* mssUserName()](#char-mssusername) - - [char* mssPassword()](#char-msspassword) - - [int mssSetParam(char* paramname, char* param)](#int-msssetparamchar-paramname-char-param) - - [char* mssGetParam(char* paramname)](#char-mssgetparamchar-paramname) - - [int mssError(int clr, char* module, char* message, ...)](#int-msserrorint-clr-char-module-char-message-) - - [int mssErrorErrno(int clr, char* module, char* message, ...)](#int-msserrorerrnoint-clr-char-module-char-message-) - - [F. OSML Utility Functions](#fosml-utility-functions) - - [char* obj_internal_PathPart(pPathname path, int start, int length)](#char-obj_internal_pathpartppathname-path-int-start-int-length) - - [int obj_internal_AddToPath(pPathname path, char* new_element)](#int-obj_internal_addtopathppathname-path-char-new_element) - - [int obj_internal_CopyPath(pPathname dest, pPathname src)](#int-obj_internal_copypathppathname-dest-ppathname-src) - - [void obj_internal_FreePathStruct(pPathname path)](#void-obj_internal_freepathstructppathname-path) - - [VI Network Connection Functionality](#vi-network-connection-functionality) - - [pFile netConnectTCP(char* host_name, char* service_name, int flags)](#pfile-netconnecttcpchar-host_name-char-service_name-int-flags) - - [int netCloseTCP(pFile net_filedesc, int linger_msec, int flags)](#int-netclosetcppfile-net_filedesc-int-linger_msec-int-flags) - - [int fdWrite(pFile filedesc, char* buffer, int length, int offset, int flags)](#int-fdwritepfile-filedesc-char-buffer-int-length-int-offset-int-flags) - - [int fdRead(pFile filedesc, char* buffer, int maxlen, int offset, int flags)](#int-fdreadpfile-filedesc-char-buffer-int-maxlen-int-offset-int-flags) - - [VII Parsing Data](#vii-parsing-data) - - [pLxSession mlxOpenSession(pFile fd, int flags)](#plxsession-mlxopensessionpfile-fd-int-flags) - - [pLxSession mlxStringSession(char* str, int flags)](#plxsession-mlxstringsessionchar-str-int-flags) - - [int mlxCloseSession(pLxSession this)](#int-mlxclosesessionplxsession-this) - - [int mlxNextToken(pLxSession this)](#int-mlxnexttokenplxsession-this) - - [char* mlxStringVal(pLxSession this, int* alloc)](#char-mlxstringvalplxsession-this-int-alloc) - - [int mlxIntVal(pLxSession this)](#int-mlxintvalplxsession-this) - - [double mlxDoubleVal(pLxSession this)](#double-mlxdoublevalplxsession-this) - - [int mlxCopyToken(pLxSession this, char* buffer, int maxlen)](#int-mlxcopytokenplxsession-this-char-buffer-int-maxlen) - - [int mlxHoldToken(pLxSession this)](#int-mlxholdtokenplxsession-this) - - [int mlxSetOptions(pLxSession this, int options)](#int-mlxsetoptionsplxsession-this-int-options) - - [int mlxUnsetOptions(pLxSession this, int options)](#int-mlxunsetoptionsplxsession-this-int-options) - - [int mlxSetReservedWords(pLxSession this, char** res_words)](#int-mlxsetreservedwordsplxsession-this-char-res_words) - - [int mlxNoteError(pLxSession this)](#int-mlxnoteerrorplxsession-this) - - [int mlxNotePosition(pLxSession this)](#int-mlxnotepositionplxsession-this) - - [VIII Objectsystem Driver Testing](#viii-objectsystem-driver-testing) - - [A. Object opening, closing, creation, and deletion](#aobject-opening-closing-creation-and-deletion) - - [B. Object attribute enumeration, getting, and setting.](#bobject-attribute-enumeration-getting-and-setting) - - [C. Object querying (for subobjects)](#cobject-querying-for-subobjects) + - [Module: st_node](#module-st_node) + - [st_node: snReadNode()](#st_node-snreadnode) + - [st_node: snNewNode()](#st_node-snnewnode) + - [st_node: snWriteNode()](#st_node-snwritenode) + - [st_node: snDelete()](#st_node-sndeletenode) + - [st_node: snGetSerial()](#st_node-sngetserial) + - [st_node: snGetLastModification()](#st_node-sngetlastmodification) + - [Module: stparse](#module-stparse) + - [stparse: stStructType()](#stparse-ststructtype) + - [stparse: stLookup()](#stparse-stlookup) + - [stparse: stAttrValue()](#stparse-stattrvalue) + - [stparse: stGetExpression()](#stparse-stgetexpression) + - [stparse: stCreateStruct()](#stparse-stcreatestruct) + - [stparse: stAddAttr()](#stparse-staddattr) + - [stparse: stAddGroup()](#stparse-staddgroup) + - [stparse: stAddValue()](#stparse-staddvalue) + - [stparse: stFreeInf()](#stparse-stfreeinf) + - [stparse: Using Fields Directly](#stparse-using-fields-directly) + - [IV Module: Expression](#viii-module-expression) + - [expCompileExpression())](#expallocexpression) + - [expFreeExpression()](#expfreeexpression) + - [expCompileExpression()](#expcompileexpression) + - [expCompileExpressionFromLxs()](#expcompileexpressionfromlxs) + - [expPodToExpression()](#exppodtoexpression) + - [expExpressionToPod()](#expexpressiontopod) + - [expDuplicateExpression()](#expduplicateexpression) + - [expIsConstant()](#expisconstant) + - [expEvalTree()](#expevaltree) + - [expCreateParamList()](#expcreateparamlist) + - [expFreeParamList()](#expfreeparamlist) + - [expAddParamToList()](#expaddparamtolist) + - [expModifyParam()](#expmodifyparam) + - [expRemoveParamFromList()](#expremoveparamfromlist) + - [expSetParamFunctions()](#expsetparamfunctions) + - [expReverseEvalTree()](#expreverseevaltree) + - [V Path Handling Functions](#x-path-handling-functions) + - [obj_internal_PathPart()](#obj_internal_pathpart) + - [obj_internal_AddToPath()](#obj_internal_addtopath) + - [obj_internal_CopyPath](#obj_internal_copypath) + - [obj_internal_FreePathStruct()](#obj_internal_freepathstruct) + - [VI Parsing Data](#xii-parsing-data) + - [mlxOpenSession()](#mlxopensession) + - [mlxStringSession()](#mlxstringsession) + - [mlxCloseSession()](#mlxclosesession) + - [mlxNextToken()](#mlxnexttoken) + - [mlxStringVal()](#mlxstringval) + - [mlxIntVal()](#mlxintval) + - [mlxDoubleVal()](#mlxdoubleval) + - [mlxCopyToken()](#mlxcopytoken) + - [mlxHoldToken()](#mlxholdtoken) + - [mlxSetOptions()](#mlxsetoptions) + - [mlxUnsetOptions()](#mlxunsetoptions) + - [mlxSetReservedWords()](#mlxsetreservedwords) + - [mlxNoteError()](#mlxnoteerror) + - [mlxNotePosition()](#mlxnoteposition) + - [VII Driver Testing](#xiii-driver-testing) + - [Object opening, closing, creation, and deletion](#aobject-opening-closing-creation-and-deletion) + - [Object attribute enumeration, getting, and setting.](#bobject-attribute-enumeration-getting-and-setting) + - [Object querying (for subobjects)](#cobject-querying-for-subobjects) + + ## I Introduction -An objectsystem driver's purpose is to provide access to a particular type of local or network data/resource, and to organize that data in a tree- structured heirarchy that can be integrated into the Centrallix's ObjectSystem. This tree structure will vary based on the data being presented, but will fit the basic ObjectSystem model of a heirarchy of objects, each having attributes, perhaps some methods, and possibly content. +An objectsystem driver's purpose is to provide access to a particular type of local or network data/resource. Specific information about the resource to be accessed (such as credentials for a database, queries for selecting data, the auth token for an API, etc.) is stored in a file that is opened by the relevant driver. For example, the query driver (defined in `objdrv_query.c`) opens `.qy` files, which store one or more ObjectSQL queries used to fetch data. -Each objectsystem driver will implement this subtree structure rooted at what is called the "node" object. The node has a specifically recognizable object type which the ObjectSystem Management Layer uses to determine which OS Driver to pass control to. Normally, the 'node' object is a UNIX file either with a particular extension registered with the OSML, or a UNIX file residing in a directory containing a '.type' file, which contains the explicit object type for all objects in that directory without recognizable extensions. +When the object system starts up, each driver registers one or more type names that it supports (e.g. `"system/query"` for the query driver). When a file is opened, the object system uses the file's type name to select which driver to use. It finds this type name with one of two strategies. If the file has an extension (e.g. `example.qy`), that extension can be mapped to a type name using `types.cfg` (e.g. `.qy` maps to `"system/query"`). Althernatively, the file may reside in a directory containing a `.type` file which explicitly specifies the type name for all files in that directory without recognizable extensions. -Normally, objectsystem drivers will be able to manage any number of 'node' objects and the subtrees rooted at them. Each 'node' object will normally relate to a particular instance of a network resource, or in some cases, a group of resources that are easily enumerated. For example, a POP3 server would be a network resource that an OS driver could be written for. If the network had multiple POP3 servers, then that one OS driver would be able to access each of them using different node objects. However, if somehow the OS driver were able to easily enumerate the various POP3 servers on the network (i.e., they responded to some kind of hypothetical broadcast query), then the OS driver author could optionally design the driver to list the POP3 servers under a single node for the whole network. +Once a file is opened, the driver should organize provided data into a tree-structured hierarchy, which becomes part of the path used by Centrallix's ObjectSystem. For example, when opening `example.qy` in the ObjectSystem, the driver makes `/rows` and `/columns` available, allowing for paths such as `/apps/data/example.qy/rows`. The root of a driver's tree (`example.qy`) is called the driver's "node" object, and most paths traverse the node objects of multiple drivers. The root of the entire tree is a special driver called the root node which is used to begin traversal. Within its tree, a driver author is free to define any manner of hierarchical structures for representing available data. However, the structure should fit the basic ObjectSystem model of a hierarchy of objects, each having attributes, and optionally some methods and/or content. -The structure of the subtree beneath the node object is entirely up to the drivers' author to determine; the OSML does not impose any structural restrictions on such subtrees. +A driver can be opened multiple times, leading one driver to have multiple "node" objects, also called instances. Typically, each "node" object relates to a particular instance of a resource. For example, say you are designing a driver to access MySQL databases. You could design the driver file to describe a MySQL instance. Thus, the node object for this driver could have children for each database in that instance (e.g. `Kardia_DB`, `mysql`, and even the system databases used by MySQL to manage the database internals). Another design would be for each driver file to describe one MySQL database. Thus, you could make a `Kardia_DB` file to access that database, and the children of that node object would be each table in the database. A third design option would be for each driver file to describe a MySQL table. Thus, you make a `p_partner` file to access members of the partner table, a `p_contact_info` file to access contact info for parterners, etc. with each node object having children for the rows in the table. This last option would require the developer to create a _lot_ of files (and would probably also make joins hard to implement), so in this case, it's probably not the best. Ultimately, though, these design choices are up to the driver author. -Here is one example of an OS Driver's node object and subtree (this is for the Sybase OS Driver, objdrv_sybase.c): +an instance of a POP3 driver might represent a POP3 server on the network. If the network had multiple POP3 servers, this driver could be used to access each of them through different node objects (e.g. `dev.pop3`, `prod.pop3`, etc.). However, if somehow the OS driver were able to easily enumerate the various POP3 servers on the network (i.e., they responded to some kind of hypothetical broadcast query), then the OS driver author could also design the driver to list the POP3 servers under a single node for the whole network. -``` -OMSS_DB (type = application/sybase) +The structure of the subtree beneath the node object is entirely up to the drivers' author to determine; the OSML does not impose any structural restrictions on such subtrees. Each object within this structure (e.g. `/example.qy`) can have three types of readable data: +- Child objects (e.g. `/rows`) which can have their own data. +- Content, which can be read similar to reading a file. +- Query data, allowing the object to be queried for information. + +Thus, parent objects with child objects behave similarly to a directory, although they can still have separate readable data _and_ queryable data. This may seem foreign in the standard file system paradime, however, it is common for web servers, where opening a directory often returns `index.html` file in that directory, or some other form of information to allow further navigation. Querying an object was originally intended as a way to quickly traversal of its child objects, although queries are not required to be implemented this way. + +Below is an example of the Sybase driver's node object and its subtrees of child objects (defined in `objdrv_sybase.c`): + +```sh +Kardia_DB (type = "application/mysql") | - +--- JNetHelp (type = system/table) - | | - | +--- columns (type = system/table-columns) - | | | - | | +--- document_id (type = system/column) - | | | - | | +--- parent_id (type = system/column) - | | | - | | +--- title (type = system/column) - | | | - | | +--- content (type = system/column) - | | - | +--- rows (type = system/table-rows) - | | - | +--- 1 (type = system/row) - | | - | +--- 2 (type = system/row) + +----- p_partner (type = "system/table") + | | + | +----- columns (type = "system/table-columns") + | | | + | | +----- p_partner_key (type = "system/column") + | | | + | | +----- p_given_name (type = "system/column") + | | | + | | +----- p_surname (type = "system/column") + | | | + | | ... + | | + | +----- rows (type = "system/table-rows") + | | | + | | +----- 1 (type = "system/row") + | | | + | | +----- 2 (type = "system/row") + | | | + | | ... + | | + | ... | - +--- Partner (type = system/table) + +----- p_contact_info (type = "system/table") + | | + | ... + ... ``` (... and so forth) -In this case the node object would contain the information necessary to access the database, such as server name, database name, max connections to pool, and so forth. More about the node object and managing its parameters will be discussed later in this document. +In this case, the `OMSS_DB` file becomes the driver's node object. This file would contain the information necessary to access the database, such as server name, database name, max connections to pool, and so forth. + +OS Drivers support several primary areas of functionality: +- Opening and closing objects. +- Creating and deleting node objects (optional). +- Reading and writing object content (optional). +- Getting and (optionally) setting object attributes. +- Executing object methods (optional). +- Querying data attributes (optional). + +Using the example above, we can query from the database using a statement like `select :title from /OMSS_DB/JNetHelp/rows`, which will open a sybase driver instance, then open a query and repeatedly fetch rows, getting the `title` attribute from each row. + -OS Drivers support several primary areas of functionality: opening and closing objects, reading and writing object content (if the object has content), setting and viewing object attributes, executing object methods, and querying an object's child objects based on name and/or attribute values. Drivers will also support the creation and deletion of objects and/or a set of child objects. ## II Interface -This section describes the standard interface between the OSML and the ObjectSystem driver itself. +This section describes the standard interface between the OSML and the ObjectSystem driver itself. Every driver should implement certain required functions. (**Note**: Many drivers "implement" some required functions to simply fail with a not implemented or not supported error. For example, most database drivers "implement" `Read()` and `Write()` this way because database content should be queried, not read). Various optional functions are also available, which a driver is not required to implement. + +The driver should implement an `Initialize()` function, as well as the following (* indicates required functions): + +| Function Name | Description +| --------------------------------------------------------- | ------------ +| [Open](#function-open)* | Opens a new driver instance object on a given node object. +| [OpenChild](#function-openchild) | Opens a single child object of the provided object by name. +| [Close](#function-close)* | Close an open object created by either `Open()` or `QueryFetch()`. +| [Create](#function-create) | Create a new driver node object. (Not currently used because the OSML calls the driver Open with the `O_WRONLY \| O_CREAT \| O_EXCL` options instead. See [Open()](#function-open) below for more info.) +| [Delete](#function-delete) | Used for general object deletion. Drivers can implement `DeleteObj()` instead. +| [DeleteObj](#function-deleteobj)* | Replacement for `Delete()` which operates on an already-open object. +| [OpenQuery](#function-openquery)** | Start a new query for child objects of a given object. +| [QueryDelete](#function-querydelete) | Delete specific objects from a query's result set. +| [QueryFetch](#function-queryfetch)** | Open the next child object in the query's result set. +| [QueryCreate](#function-querycreate) | Currently just a stub function that is not fully implemented. +| [QueryClose](#function-queryclose)** | Close an open query. +| [Read](#function-read)* | Read content from the object. +| [Write](#function-write)* | Write content to the object. +| [GetAttrType](#function-getattrtype)* | Get the type of a given object's attribute. +| [GetAttrValue](#function-getattrvalue)* | Get the value of a given object's attribute. +| [GetFirstAttr](#function-getfirstattr--getnextattr)* | Get the name of the object's first attribute. +| [GetNextAttr](#function-getfirstattr--getnextattr)* | Get the name of the object's next attribute. +| [SetAttrValue](#function-setattrvalue)* | Set the value of an object's attribute. +| [AddAttr](#function-addattr) | Add a new attribute to an object. +| [OpenAttr](#function-openattr) | Open an attribute as if it were an object with content. +| [GetFirstMethod](#function-getfirstmethod--getnextmethod) | Get the name of an object's first method. +| [GetNextMethod](#function-getfirstmethod--getnextmethod) | Get the name of an object's next method. +| [ExecuteMethod](#function-executemethod) | Execute a method with a given name and optional parameter string. +| [PresentationHints](#function-presentationhints) | Get info about an object's attributes. +| [Info](#function-info)* | Get info about an object instance. +| [Commit](#function-commit) | Commit changes made to an object, ensuring that all modifications in the current transaction are completed and the transaction is closed before returning. +| [GetQueryCoverageMask](#function-getquerycoveragemask) | Should be left `NULL` outside the MultiQuery module. +| [GetQueryIdentityPath](#function-getqueryidentitypath) | Should be left `NULL` outside the MultiQuery module. + +_*Function is always required._ + +_**Function is always required, but can always return NULL if queries are not supported._ + + +--- +### Abbreviation Prefix +Each OS Driver will have an abbreviation prefix, such as `qy` for the query driver or `sydb` for the sybase database driver. This prefix should be prepended to the start of every public function name within the OS driver for consistency and scope management (e.g. `qyInitialize()`, `sydbQueryFetch()`, etc.). Normally, a driver's abbreviation prefix is two to four characters, all lowercase and may be the same as a file extension the driver supports. However, this is not an absolute requirement (see the cluster driver in `objdrv_cluster.c` which supports `.cluster` files using an abbreviation prefix of `cluster`). + +This document uses `xxx` to refer to an unspecified abbreviation prefix. + +- 📖 **Note**: Once an abbreviation prefix has been selected, the driver author should add it to the [Prefixes.md](Prefixes.md) file. + + +### Internal Functions +It is highly likely that driver authors will find shared functionality in the following functions, or wish to abstract out functionality from any of them for a variety of reasons. When creating additional internal functions in this way, they should be named using the convention of `xxx_internal_FunctionName()`, or possibly `xxxi_FunctionName()` for short. + +--- +### Function: Initialize() +```c +/*** @returns 0 if successful, or + *** -1 if an error occurred. + ***/ +int xxxInitialize(void) +``` +- ⚠️ **Warning**: For compiled drivers, the success/failure of this function is ignored by the caller. However, for drivers loaded as modules, the return value is checked in order to determine whether to keep the module loaded. In either case, `mssError()` should be called for any failure (other than memory allocation failures). +- 📖 **Note**: Unlike other functions defined in the driver, each driver author must manually add this call to the start up code, found in the `cxDriverInit()` function in `centrallix.c`. -### A. Initialization -Each OS Driver will have an initialization function, normally named xxxInitialize() where 'xxx' is the driver's abbreviative prefix. This prefix should be attached to each and every function within the OS driver for consistency and project management. Normally 'xxx' is two to four characters, all lowercase. This initialization function is called when the Centrallix starts up, and at least at the present time, this initial call to the OS driver must be manually added to the appropriate startup code, currently found in 'centrallix.c'. +The initialization function is called when the Centrallix starts up, and should register the driver with the OSML and initialize necessary global variables. It is recommended to place global variables in a single global 'struct' that is named with the driver's prefix in all uppercase. Global variables should **NOT** be accessed from outside the driver. Instead, the driver should define functions to access them, allowing it to abstract details away from other drivers. -Within the initialization function, the driver should initialize all necessary global variables and register itself with the OSML. Global variables should all be placed inside a single global 'struct', which is normally named similarly to the driver's prefix, except normally in all uppercase. Under no circumstances should global variables be accessed outside of the module, except via the module's functions. +To register itself with the OSML, the driver should first allocate an ObjDriver structure and initialize its contents: -To register with the OSML, the driver must first allocate an ObjDriver structure and fill in its contents. +```c +pObjDriver drv = (pObjDriver)nmMalloc(sizeof(ObjDriver)); +if (drv == NULL) goto error_handling; +memset(drv, 0, sizeof(ObjDriver)); +... +``` - pObjDriver drv; +To initialize this struct, the driver must: +- Provide a name (in `drv->Name`). +- Provide an array of supported root types (in `drv->RootContentTypes`). +- Provide capability flags (in `drv->Capabilities`). +- Provide function pointers to implemented functions (see [II Interface](#ii-interface) for a list). - drv = (pObjDriver)nmMalloc(sizeof(ObjDriver)); +#### Name +The `name` field is a 64 character buffer (allowing names up to 63 characters, with a null terminator). It usually follows the format of the driver abbreviation prefix (in all uppercase), followed by a dash, followed by a descriptive name for the driver. -This involves setting a large number of fields to the appropriate entry points within the OS Driver, as well as telling the OSML what object type(s) are handled by the driver and giving the OSML a description of the driver. A list of the required entry point functions / fields follows: +For example: +```c +if (strcpy(drv->Name, "SYBD - Sybase Database Driver") == NULL) goto error_handling; +``` -| Function/Field | Description -| -------------------- | ------------ -| Open | Function that the OSML calls when the user opens an object managed by this driver. -| Close | Close an open object. -| Create | Create a new object. -| Delete | Delete an existing object. -| OpenQuery | Start a query for child objects. -| QueryDelete | Delete all objects in the query result set. -| QueryFetch | Open the next child object in the query's result set. -| QueryClose | Close an open query. -| Read | Read content from the object. -| Write | Write content to the object. -| GetAttrType | Get the type of an object's attribute. -| GetAttrValue | Get the value of an object's attribute. -| GetFirstAttr | Get the first attribute associated with the object. -| GetNextAttr | Get the next attribute associated with the object. -| SetAttrValue | Set the value of an attribute. -| AddAttr | Add a new attribute to an object. -| OpenAttr | Open an attribute as if it were an object with content. -| GetFirstMethod | Get the first method of the object. -| GetNextMethod | Get the next method of an object. -| ExecuteMethod | Execute a method with an optional string parameter. +#### RootContentTypes +The `RootContentTypes` field is an XArray containing a list of strings, representing the type names that the driver can open. This should only include types the driver will open as node objects at the root of its tree, not other objects created by the driver within that tree. Thus, the sybase driver would include `"application/sybase"`, but not `"system/table"`. -The only method that can be set to NULL is the QueryDelete method, in which case the OSML will call QueryFetch() and Delete() in succession. However, if the underlying network resource has the capability of intelligently deleting objects matching the query's criteria, this method should be implemented (as with a database server). +For example: +```c +if (xaInit(&(drv->RootContentTypes), 2) != 0) goto error_handling; +if (xaAddItem(&(drv->RootContentTypes), "application/sybase") < 0) goto error_handling; +if (xaAddItem(&(drv->RootContentTypes), ""system/query"") < 0) goto error_handling; +``` -Another field in the driver structure is the Capabilities field. This field is a bitmask, and can currently contain zero or more of the following options: +- 📖 **Note**: To make a specific file extension (like `.qy`) open in a driver, edit `types.cfg` to map that file extension to an available root content type supported by the driver (such as `"system/query"`). -- OBJDRV_C_FULLQUERY: Indicates that this objectsystem driver will intelligently process the query's expression tree specified in the OpenQuery call, and will only return objects that match that expression. If this flag is missing, the OSML will filter objects returned by QueryFetch so that the calling user does not get objects that do not match the query. Typically this is set by database server drivers. +#### Capabilities +The capabilities field is a bitmask which can contain zero or more of the following flags: - THE ABOVE IS OUT-OF-DATE. From now on, a driver can determine whether to handle the Where and OrderBy on a per-query basis, by setting values in the ObjQuery structure used when opening a new query. This is because a driver may be able to handle Where and OrderBy for some object listings but not for others. +- `OBJDRV_C_FULLQUERY`: Indicates that this objectsystem driver will intelligently process the query's expression tree specified in the `OpenQuery()` call, and will only return objects that match that expression. If this flag is missing, the OSML will filter objects returned by `QueryFetch()` so that the calling user does not get objects that do not match the query. Typically this is set by database server drivers. + - > **THE ABOVE IS OUT-OF-DATE** (May 16th, 2022): A driver can now determine whether to handle the `Where` and `OrderBy` on a per-query basis, by setting values in the ObjQuery structure used when opening a new query. This allows a driver to handle `Where` and `OrderBy` selectively for some object listings but not others. -- OBJDRV_C_TRANS: Indicates that this objectsystem driver requires transaction management by the OSML's transaction layer (the OXT layer). OS drivers that require this normally are those that for some reason cannot complete operations in independence from one another. For example, with a database driver, the creation of a new row object and the setting of its attributes must be done as one operation, although the operation requires several calls from the end user's process. The OXT allows for the grouping of objectsystem calls so that the os driver does not have to complete them independently, but instead can wait until several calls have been made before actually completing the operation. +- `OBJDRV_C_TRANS`: Indicates that this objectsystem driver requires transaction management by the OSML's transaction layer (the OXT layer). OS drivers that require this normally are those that for some reason cannot complete operations in independence from one another. For example, with a database driver, the creation of a new row object and the setting of its attributes must be done as one operation, although the operation requires several calls from the end user's process. The OXT allows for the grouping of objectsystem calls so that the os driver does not have to complete them independently, but instead can wait until several calls have been made before actually completing the operation. -The 'Name' field should be filled in with a description of the OS driver, with a maximum length of 63 characters (plus the string null terminator). Normally, the 2-4 letter prefix of the driver is included at the beginning of 'Name', such as "UXD - UNIX filesystem driver". +#### Registering the Driver Struct +When all values within the structure have been initialized, the driver should call the OSML to register itself, using the `objRegisterDriver()` function: -Finally, the 'RootContentTypes' field is an XArray containing a list of strings, each of which specifies the node object types that the driver will handle. Such types are added to this XArray using the normal XArray utility functions, such as: +```c +if (objRegisterDriver(drv) != 0) goto error_handling; +``` - xaInit(&drv->RootContentTypes, 16); - xaAddItem(&drv->RootContentTypes, "system/file"); - xaAddItem(&drv->RootContentTypes, "system/directory"); -When the structure has been filled out, the os driver should call the OSML to register itself, using the objRegisterDriver function: +--- +### Function: Open() +```c +void* xxxOpen(pObject parent, int mask, pContentType sys_type, char* usr_type, pObjTrxTree* oxt); +``` - objRegisterDriver(drv); +The `Open()` function opens a given file to create a new driver instance. This procedure normally includes the following steps: -The initialization function should return 0 to indicate success, or -1 on failure. Currently, initialization success/failure is not verified by lsmain.c. +1. Access or create the node object, depending on specified flags and whether or not it already exists. +2. Parse additional contents of the path after the driver node object. +3. Allocate a structure that will represent the open object, including a pointer to the node object. +4. Perform other opening operations (such as reading database table information, etc., when a db table's row is being accessed). +5. Return a pointer to the node instance as a void pointer. This pointer will be passed as `void* inf_v` to the driver in subsequent calls involving this object (except the Query functions, discussed below). -The driver should NOT nmFree() the allocated driver structure unless the objRegisterDriver() routine fails (returns -1). +- 📖 **Note - Transactions**: If the os driver specified the `OBJDRV_C_TRANS` capability, it must respect the current state of the user's transaction. If a new object is being created, an object is being deleted, or other modifications/additions are being performed, and if the OXT layer indicates a transaction is in process, the driver must either complete the current transaction and then complete the current call, or else add the current delete/create/modify call to the transaction tree (in which case the tree item is preallocated; all the driver needs to do is fill it in). This is handled using the transaction tree parameter (`oxt : pObjTrxTree*`). -Note that the RootContentTypes handled by the driver should only include the types of the objects this driver will handle as node objects. For instance, the Sybase database access driver uses "application/sybase" as its top level type. It won't register such things as "system/table". +#### Accessing the Node Object +If `O_CREAT` and `O_EXCL` are both specified in `parent->Mode`, the driver should **only** create a new file and fail if the file already exists (refusing to open and read it). Otherwise, the driver should read an existing file, or create one if it does not exist and `O_CREAT` is specified, failing if no file can be read or created. -### B. Opening And Closing Objects -As an overview, the normal procedure for the open routine to follow is this: +#### Parsing Path Contents +The task of parsing the provided path into the subtree beneath its node object is one of the more complex operations for a driver. For example, the path to a driver's node object might be `/datasources/Kardia_DB` and the user opens an object called `/datasources/Kardia_DB/p_partner/rows/1`. In this case, the OS driver must parse the meaning of the subtree path `p_partner/rows/1`, storing the data targetted by the user into the driver instance to allow later method calls to access the correct data. -1. Access the node object, or create it, depending on whether the object already exists as well as the open mode flags indicated by the end-user. -2. Upon successful node object access, determine what additional components of the pathname are to be handled by this driver, and verify that they can be opened, depending on the object's open mode (CREAT, EXCL, etc.) -3. If it hasn't been already, allocate a structure that will represent this open object and contain information about it and how we're to handle it. It should include a pointer to the node object. -4. Perform any operations inherent in the open process that have not already been performed (such as reading database table information, etc., when a db table's row is being accessed). -5. Return a pointer to the structure allocated in (3) as a void pointer. The OSML will pass this pointer back to the driver on subsequent calls that involve this object. +#### Parameters +The `Open()` routine is called with five parameters: -The first basic part of the OS driver consists of the Open and Close routines, normally named 'xxxOpen' and 'xxxClose' within the driver, where 'xxx' is the driver's prefix. The Close routine is normally fairly simple, but the Open routine is one of the most complicated routines in a typical OS driver, for the Open routine must parse the subtree pathname beneath the node object. For example, if the node object had a pathname like: +- `parent : pObject`: A pointer to the Object structure maintained by the OSML. This structure includes some useful fields: + + - `parent->Mode : int`: A bitmask of the OBJ_O_* flags, which include: `OBJ_O_RDONLY` (read only), `OBJ_O_WRONLY` (write only), `OBJ_O_RDWR` (read/write), `OBJ_O_CREAT` (create), `OBJ_O_TRUNC` (truncate), and `OBJ_O_EXCL` (exclusive, see above). + + - `parent->Pathname : pPathname`: A pointer to a Pathname struct (defined in `include/obj.h`) which contains the complete parsed pathname for the object. This provides a buffer for the pathname as well as an array of pointers to the pathname's components. The function `obj_internal_PathPart()` can be used to obtain at will any component or series of components of the pathname. - /datasources/OMSS_DB + - `parent->Pathname->OpenCtl : pStruct[]`: Parameters for the open() operation, as defined by the driver author. These are specified in the path in a similar way to URLs (`example.qy?param1=value¶m2=other_value`). Drivers typically only use `parent->Pathname->OpenCtl[parent->SubPtr]` (see SubPtr below) to retrieve their own parameters, ignoring parameters passed to other drivers in the path. -and the user opened an object called: + - `parent->SubPtr : short`: The number of components in the path that are a part of the path to the driver's node object, including the `.` for the top level directory and the driver's node object. For example, in the above path of `/data/file.csv`, the path would be internally represented as `./ data/ file.csv`, so SubPtr is 3. + + - For example, use `obj_internal_PathPart(parent->Pathname, parent->SubPtr - 1, 1)` to get the name of the file being openned, and use `obj_internal_PathPart(parent->Pathname, 0, parent->SubPtr)` to get the path. - /datasources/OMSS_DB/JNetHelp/rows/1 + - `parent->SubCnt : short`: _The driver should set this value_ to show the number of components it controls. This includes the driver's node object, so `SubCnt` will always be at least 1. For example, when opening `/data/file.csv/rows/1`, the CSV driver will read the `SubPtr` of 3 (see above), representing `./ data/ file.csv`. It will then set a `SubCnt` of 3, representing that it controls `file.csv /rows /1`. (The driver only sets `SubCnt`; `SubPtr` is provided.) -the OS driver would have to determine what the subtree pathname 'JNetHelp/rows/1' means, since this path will mean different things to different os drivers. + - `parent->Prev : pObject`: The underlying object as opened by the next-lower-level driver. The file can be accessed and parsed by calling functions and passing this pointer to them (such as the st_parse functions, see below). **DO NOT attempt to open the file directly with a call like `fopen()`,** as this would require hard coding the path to the root directory of the object system, which *will* break if the code runs on another machine. -The Open routine also must determine whether the object already exists or not, and if not, whether to create a new object. This logic is largely dependent on the obj->Mode flags, as if O_CREAT is included, the driver must attempt to create the object if it does not already exist, and if O_EXCL is included, the driver must refuse to open the object if it already exists, as with the UNIX open() system call semantics. + - `parent->Prev->Flags : short`: Contains some useful flags about the underlying object, such as: + - `OBJ_F_CREATED`: The underlying object was just created by this open() operation. In that case, this driver is expected to create the node with `snNewNode()` (see later in this document) as long as `parent->Mode` contains `O_CREAT`. -Finally, if the os driver specified a capability of OBJDRV_C_TRANS, it must pay attention to the current state of the end-user's trans- action. If a new object is being created, an object is being deleted, or other modifications/additions are being performed, and if the OXT layer indicates a transaction is in process, the driver must either complete the current transaction and then complete the current call, or else add the current delete/create/modify call to the transaction tree (in which case the tree item is preallocated; all the driver needs to do is fill it in). The transaction layer will be discussed in depth later in this document. +- `mask : int`: The permission mask to be given to the object, if it is being created. Typically, this will only apply to files and directories, so most drivers can ignore it. The values are the same as the UNIX [octal digit permissions](https://en.wikipedia.org/wiki/Chmod#:~:text=Octal%20digit%20permission) used for the `chmod()` command. -As a part of the Open process, the OS driver will normally allocate an internal structure to represent the current open object, and will return that structure as a void* data type in the return value. This pointer will be then passed to each of the other driver entry point functions, with the exception of QueryFetch, QueryDelete, and Query- Close, which will be discussed later. +- `sys_type : pContentType`: Indicates the content type of the node object as determined by the OSML. The ContentType structure is defined in `include/obj.h`. `sys_type->Name` lists the name of the content type (e.g. `"system/query"` for the query driver). This is also the type used to select which driver should open the node object, so it will be one of the types registered in the `Initialize()` function. -The Open() routine is called with five parameters: +- `usr_type : char*`: The object type requested by the user. This is normally used when creating a new object, though some drivers also use it when opening an existing object. For example, the reporting driver generates HTML report text or plaintext reports if `usr_type` is `"text/html"` or `"text/plain"` (respectively). -- obj (pObject) - This is a pointer to the Object sturcture maintained by the OSML. This structure will contain some important fields for processing the open() request. +- `oxt : pObjTrxTree*`: The transaction tree, used when the driver specifies the `OBJDRV_C_TRANS` capability. More on this field later. Non-transaction-aware drivers can safely ignore this field. - obj->Mode is a bitmask of the O_* flags, which include O_RDONLY, O_WRONLY, O_RDWR, O_CREAT, O_TRUNC, and O_EXCL. + - 📖 **Note**: Yes, this param *is* a pointer to a pointer. Essentially, a pointer passed by reference. This allows the driver to create a new transaction tree even if none is in progress. - obj->Pathname is a Pathname structure which contains the complete parsed pathname for the object. This structure is defined in the file include/obj.h, and has a buffer for the pathname as well as an array of pointers to the pathname's components. The function obj_internal_PathPart() can be used to obtain at will any component or series of components of the pathname. - obj->Pathname->OpenCtl[] contains parameters to the open() operation. Frequently these params provide additional information on how to open the object. The use of these parameters is determined by the author of the objectsystem driver. The parameters are those passed in normal URL fasion (?param=value, etc.). Typically, the only OpenCtl of interest is going to be obj->Pathname->OpenCtl[obj->SubPtr] (see below for SubPtr meaning). +The `Open()` routine should return a pointer to an internal driver structure on success, or `NULL` on failure. It is normal to allocate one such structure per `Open()` call, and for one of the structure fields to point to shared data describing the node object. Accessing the node object is described later in this document. - obj->SubPtr is the number of components in the path that are a part of the node object's path. For example, in the above path of '/datasources/OMSS_DB', the path would be internally represented as './datasources/ OMSS_DB', and the SubPtr would be 3. +While driver instance structures may vary, some fields are common in most drivers (`inf` is the pointer to the structure here): - obj->SubCnt reflects the number of components of the path which are under the control of the current driver. This includes the node object, so SubCnt will always be at least 1. For example, when opening '/data/file.csv/rows/1', and the driver in question is the CSV driver, SubPtr would be 3 (includes an "invisible" first component), from '/data/file.csv', and SubCnt would be 3, from 'file.csv/rows/1'. The driver will need to SET THE SUBCNT value in its Open function. SubPtr is already set. +| Field | Type | Description | +|-----------|---------|-------------------------------------------------| +| inf->Obj | pObject | A copy of the `obj` pointer passed to `Open()`. | +| inf->Mask | int | The `mask` argument passed to `Open()`. | +| inf->Node | pSnNode | A pointer to the node object. | - obj->Prev is the underlying object as opened by the next-lower-level driver. It is the duty of this driver to parse the content of that object and do something meaningful with it. +The driver's node pointer typically comes from `snNewNode()` or `snReadNode()` (for structure files), but it can also be other node struct information. - obj->Prev->Flags contains some critical infor- mation about the underlying object. If it contains the flag OBJ_F_CREATED, then the underlying object was just created by this open() operation. In that case, this driver is expected to create the node with snNewNode() (see later in this document) as long as obj->Mode contains O_CREAT. +--- +### Function: OpenChild() +*(Optional)* +```c +void* xxxOpenChild(void* inf_v, pObject obj, char* child_name, int mask, pContentType sys_type, char* usr_type, pObjTrxTree* oxt); +``` +Opens a single child object of the provided object by name. Conceptually, this is similar to querying the object for all children where the name attribute equals the passed `child_name` parameter and fetching only the first result. This function is used to open children of a driver that do not map well into the driver's node object tree. For example, the query file driver uses this function to allow the caller to open a temporary collection declared in that query file. -- mask (int) - Indicates the security mask to be given to the object if it is being created. Typically, this will only apply to files and directories. The values are the same as UNIX chmod() type values. +The `OpenChild()` function is called with two parameters: -- systype (pContentType) - This param indicates the content type of the node object as determined by the OSML. The ContentType structure is defined in include/ obj.h, and includes among other things the name of the content type. For example, for the reporting driver, this type would be "system/report". +| Param | Type | Description | +|------------|--------------|---------------------------------------------------------------------------| +| inf_v | void* | A driver instance pointer (returned from `Open()` or `QueryFetch()`). | +| obj | pObject | An object? | +| child_name | char* | The value for the name attribute of the child object to be openned. | +| mask | int | The permission mask to be given to the object (if created).* | +| sys_type | pContentType | Indicates the content type of the node object as determined by the OSML.* | +| usr_type | char* | The object type requested by the user.* | +| oxt | pObjTrxTree* | The transaction tree pointer for the `OBJDRV_C_TRANS` capability. | -- usrtype (char*) - This param is the requested object type by the user and is normally used when creating a new object, though under some circumstances it may change the way the open operates on an existing object. For example, the reporting driver can change whether it generates HTML report text or plaintext reports based on usrtype being either "text/html" or "text/plain". + -- oxt (pObjTrxTree*) - This param is only used by object drivers that specified a capability of OBJDRV_C_TRANS. More on this field later. For non-transaction-aware drivers, this field can be safely ignored. +*See [`Open()`](#function-open) above for more info. - Yes, this param *is* a pointer to a pointer. Essentially, a pointer passed by reference. +The `OpenChild()` function should a pointer to the node object for the newly openned child on success or `NULL` on failure. -The Open routine should return its internal structure pointer on success, or NULL on failure. It is normal to allocate one such structure per Open call, and for the structure to point, among other things, to shared data describing the node object. Accessing the node object is described later in this document. +--- +### Function: Close() +```c +int xxxClose(void* inf_v, pObjTrxTree* oxt); +``` +The close function closes a driver instance, freeing all allocated data and releasing all shared memory such as open connections, files, or other driver instances. The driver must ensure that all memory allocated by originally opening the object (or allocated by other functions that may be called on an open object) is properly deallocated. This includes the internal structure returned by `Open()`, or by `QueryFetch()`, which is passed in as `inf_v`. The driver may also need to decrement the Open Count (`node->OpenCnt--`) if it had to increment this value during `Open()`. Before doing so, it should also perform a `snWriteNode()` to write any modified node information to the node object. -It is important to know what kinds of fields normally are placed in the allocated data structure returned by Open. These fields are all determined by the driver author, but here are a few typical ones that are helpful to have ("inf" is the pointer to the structure here): +- 📖 **Note**: Remember that the passed driver instance may originally be from a call to `Open()` or a call to `QueryFetch()`. -| Field | Type | Description -| ---------- | --------- | ------------ -| inf->Obj | pObject | This is a copy of the 'obj' pointer passed to the Open routine. -| inf->Mask | int | The 'mask' argument passed to Open. -| inf->Node | pSnNode | A pointer to the node object, as returned from snNewNode() or snReadNode(), or if structure files aren't being used as the node content type, a pointer to whatever structure contains information about the node object. +- 📖 **Note**: Even if close fails, the object should still be closed in whatever way is possible. The end-user should deal with the resulting situation by reviewing the `mssError()` messages left by the driver. -The Close() routine is called with two parameters: +- 📖 **Note**: Information may be left unfreed if it is stored in a cache for later use. -| Param | Type | Description -| ------ | ------------ | ------------ -| inf_v | void* | This param is the pointer that the Open routine returned. Normally the driver will cast the void* parameter to some other structure pointer to access the object's information. -| oxt | pObjTrxTree* | The transaction tree pointer. +The `Close()` function is called with two parameters: -The Close routine should return 0 on success or -1 on failure. The os driver must make sure it properly deallocates the memory used by originally opening the object, such as the internal structure returned by open and passed in as inf_v. +| Param | Type | Description | +|-------|--------------|-----------------------------------------------------------------------| +| inf_v | void* | A driver instance pointer (returned from `Open()` or `QueryFetch()`). | +| oxt | pObjTrxTree* | The transaction tree pointer for the `OBJDRV_C_TRANS` capability. | -Note the semantics of a Close failure - the object should still be closed in whatever way is still meaningful. The end-user must deal with the situation by reviewing the returned mssError messages. +The `Close()` function should return 0 on success or -1 on failure. -Before exiting, the Close routine should make sure it decrements the Open Count (node->OpenCnt--). Before doing so, it should also perform a snWriteNode() to write any modified node information back to the node object. -### C. Creating and Deleting Objects. -The Create and Delete functions are used for creating and deleting objects. Normally, the os driver will process the Pathname in the same manner for Create and Delete as for Open, thus such functionality could be placed in another function. +### Function: Create() +```c +int xxxCreate(pObject obj, int mask, pContentType sys_type, char* usr_type, pObjTrxTree* oxt); +``` +The `Create()` function is used to create a new object, and uses the same parameters and return value as `Open()` (documented in detail above). This often means adding a new file to the file system to represent the object. Many drivers do not implement this and recommend that driver end-users create files using a standard text editor or programatically using more general means, such as general structure file generation. If implemented, this function frequently requires very similar path parsing functionality to `Open()`. -As a side note, within Centrallix, the standard function naming convention is to use xxx_internal_FunctionName for functions that are more or less internal to the module and not a part of any standard interface. +- 📖 **Note**: For many drivers, the `Create()` function calls the driver's `Open()` function with `O_CREAT`, then calls its `Close()` function, although some drivers may manage this differently. -The Create routine has parameters identical to the Open routine. It should return 0 on success and -1 on error. -The Delete routine is passed the following parameters: +### Function: Delete() +```c +int clusterDelete(pObject obj, pObjTrxTree* oxt); +``` +The `Delete()` function is used to delete an object, which often means removing a file from the file system. The Delete routine is passed the following parameters: -| Param | Type | Description -| ------ | ------------- | ------------ -| obj | pObject | The Object structure pointer, used in the same way as in Open and Delete. -| oxt | pObjTrxTree* | The transaction tree pointer. +| Param | Type | Description | +|-------|--------------|---------------------------------------------------------------------------| +| obj | pObject | The Object structure pointer, used in the same way as in Open and Delete. | +| oxt | pObjTrxTree* | The transaction tree pointer for the `OBJDRV_C_TRANS` capability. | -Delete should return 0 on success and -1 on failure. +`Delete()` should return 0 on success and -1 on failure. -For many objectsystem drivers, the Create function simply calls the driver's internal Open() with O_CREAT and then its internal Close, although some drivers could manage Create differently from Open. -### D. Reading and Writing Object Content. -Some, but not all, objects will have content. If the object does or can have content, the driver should handle these functions as is appropriate. Otherwise, the driver should return a failure code (-1) from these functions. +### Function: DeleteObj() +```c +int xxxDeleteObj(void* inf_v, pObjTrxTree* oxt); +``` +**No documentation provided.** -The Read routine reads content from the object, as if reading from a file. The parameters passed are almost identical to those used in the fdRead command in MTASK: -| Parameter | Type | Description -| --------- | ------------- | ------------ -| inf_v | void* | The generic pointer to the structure returned from Open(). -| buffer | char* | The destination buffer for the data being read in. -| maxcnt | int | The maximum number of bytes to read into the buffer. -| flags | int | Either 0 or FD_U_SEEK, in which case the user is specifying the seek offset for the read in the 5th argument. Of course, not all objects will be seekable, and furthermore, some of the objects handled by the driver may have full or limited seek functionality, even though others may not. -| arg | int | Extra argument, currently only used to specify an optional seek offset. -| oxt | pObjTrxTree* | The transaction tree pointer. +### Function: Read() +```c +int xxxRead(void* inf_v, char* buffer, int max_cnt, int offset, int flags, pObjTrxTree* oxt); +``` -The Write routine is very similar, except that instead of 'maxcnt', the third argument is 'cnt', and specifies how much data is in the buffer waiting to be written. +The `Read()` function reads content from objects that have content, similar to reading content from a file. If the object does or can have content, the driver should handle these functions as is appropriate. Otherwise, the driver should return a failure code (-1) and call `mssError()` in these functions. -Each of these routines should return -1 on failure and return the number of bytes read/written on success. At end of file or on device hangup, 0 should be returned once, and then subsequent calls should return -1. +The parameters passed are intentionally similar to the `fdRead()` function in `mtask.c`: -### E. Querying for Child Objects. -Many objects will have the capability of having sub-objects beneath them, called child objects. In such a case, the parent object becomes a directory of sorts, even though the parent object may also have content, something which is somewhat foreign in the standard filesystem world, but is common for web servers, where opening a directory returns the file 'index.html' on many occasions. +| Parameter | Type | Description | +|-----------|--------------|------------------------------------------------------------------------------------------------------------------------------| +| inf_v | void* | A driver instance pointer (returned from `Open()` or `QueryFetch()`). | +| buffer | char* | The buffer where read data should be stored. | +| max_cnt | int | The maximum number of bytes to read into the buffer. | +| offset | int | An optional seek offset. | +| flags | int | Either `0` or `FD_U_SEEK`. If `FD_U_SEEK` is specified, the caller should specify a seek offset in the 5th argument (`arg`). | +| oxt | pObjTrxTree* | The transaction tree pointer for the `OBJDRV_C_TRANS` capability. | -To enumerate a parent object's child objects, the query functions are used. A query may have a specific criteria so that only objects having certain attributes will be listed. As mentioned earlier in this document, a driver may or may not choose to intelligently handle those criteria. The driver has the option of always enumerating all child objects via its query functions, and allowing the OSML filter them and only return to the user the objects that match the criteria. But it also can do the filtering itself or, more typically, pass the filtering on to the source of the data the driver manages, as with a database server. +- 📖 **Note**: Not all objects can be seekable and some of the objects handled by the driver may have limited seek functionality, even if others do not. -The query mechanism can also be used to delete a set of child objects, optionally matching a certain criteria. The QueryDelete method may be left NULL in the ObjDriver structure if the driver does not implement full query support, in which case the OSML will iterate through the query results and delete the objects one by one. +Each of these routines should return -1 on failure and return the number of bytes read/written on success. At end of file or on device hangup, 0 should be returned once, and then subsequent calls should return -1. -The first main function for handling queries is OpenQuery. This function is passed three arguments: +- 📖 **Note**: There is no separate seek command to help mitigate [Time-of-check to time-of-use attacks](https://en.wikipedia.org/wiki/Time-of-check_to_time-of-use). To seek without reading data, specify a buffer size of zero. -- inf_v (void*) The value returned from Open for this object. -- query (pObjQuery) The query structure setup by the OSML. It will contain several key fields: +### Function: Write() +```c +int xxxWrite(void* inf_v, char* buffer, int cnt, int offset, int flags, pObjTrxTree* oxt); +``` +The `Write()` function is very similar to the `Read()` function above, allowing the caller to write data to objects of supporting drivers with content. However, the third argument (`max_cnt`) is replaced with `cnt`, specifying the number of bytes of data in the buffer that should be written. - query->QyText: the text of the criteria (i.e., the WHERE clause, in Centrallix SQL syntax) - query->Tree: the compiled expression tree, which evaluates to nonzero for true or zero for false as the WHERE clause condition. +### Function: OpenQuery() +```c +void* xxxOpenQuery(void* inf_v, pObjQuery query, pObjTrxTree* oxt); +``` +The `OpenQuery()` function opens a new query instance struct for fetching query results from a specific driver instance. Queries are often used to enumerate an object's child objects, although this is not a requirement. Queries may include specific criteria, and the driver may decide to intelligently handle them (either manually or, more often, by passing them on to a lower level driver or database) or simply to enumerating all results with its query functions. In the latter case, the OSML layer will filter results and only return objects that match the criteria to the user. - query->SortBy[]: an array of expressions giving the various components of the sorting criteria. +`OpenQuery()` is passed three parameters: - query->Flags: the driver should set and/or clear the flags OBJ_QY_F_FULLQUERY and OBJ_QY_F_FULLSORT if need be. The former indicates that the driver is willing to handle the full WHERE clause (the query->Tree). The latter indicates that the driver is willing to handle the sorting of the data as well (in query->SortBy[]). If the driver can easily have the sorting/selection done (as when querying an RDBMS), it should set these flags. Otherwise, it should let the OSML take care of the ORDER BY and WHERE conditions. +| Parameter | Type | Description | +|-----------|--------------|-----------------------------------------------------------------------| +| inf_v | void* | A driver instance pointer (returned from `Open()` or `QueryFetch()`). | +| query | pObjQuery | A query structure created by the object system. | +| oxt | pObjTrxTree* | The transaction tree pointer for the `OBJDRV_C_TRANS` capability. | -- oxt (pObjTrxTree*) The transaction tree pointer. +The `query : pObjQuery` parameter contains several useful fields: +| Parameter | Type | Description +| --------------- | ----------------------- | ------------ +| query->QyText | char* | The text specifying the criteria (i.e., the WHERE clause, in Centrallix SQL syntax). +| query->Tree | void* (pExpression) | The compiled expression tree. This expression evaluates to a nonzero value for `true` if the where clause is satisfied, or zero for `false` if it is not. +| query->SortBy[] | void*[] (pExpression[]) | An array of expressions giving the various components of the sorting criteria. +| query->Flags | int | The driver should set and/or clear the `OBJ_QY_F_FULLQUERY` and `OBJ_QY_F_FULLSORT` flags, if needed. -The OpenQuery function should return a void* value, which will within the driver point to a structure used for managing the query. This structure will normally have a pointer to the inf_v value returned by Open as well, since inf_v is never passed to QueryFetch, QueryDelete or QueryClose. OpenQuery should return NULL if the object does not support queries or if some other error condition occurs that will prevent the execution of the query. +The `OBJ_QY_F_FULLQUERY` flag indicates that the driver will handle the full `where` clause specified in `query->Tree`. Even if this flag is not specified, the driver is still free to use the provided `where` clause to pre-filter data, which improves performance when the Object System does its final filtering. However, setting this flag disables the Object System filtering because it promises that the driver will _always_ handle _all_ filtering for _every_ valid queries. -Once the query is underway with OpenQuery, the user will either start fetching the results with QueryFetch, or will issue a delete operation with QueryDelete. +The `OBJ_QY_F_FULLSORT` flag indicates that the driver will handle all sorting for the data specified in `query->SortBy[]`. -The QueryFetch routine should return an inf_v pointer to the child object, or NULL if no more child objects are to be returned by the query. Some drivers may be able to use their internal Open function to generate the newly opened object, although others will directly allocate the inf_v structure and fill it in based on the current queried child object. QueryFetch will be passed these parameters: +If the driver can easily handle sorting/selection (as when querying an database), it should set these flags. Otherwise, it should let the OSML handle the ORDER BY and WHERE conditions to avoid unnecessary work for the driver author. -| Parameter | Type | Description -| ---------- | -------------- | ------------ -| qy_v | void* | The value returned by OpenQuery. -| obj | pObject | The newly-created object structure that the OSML is using to track the newly queried child object. -| mode | int | The open mode for the new object, as with obj->Mode in Open(). -| oxt | pObjTrxTree* | The transaction tree pointer. +The `OpenQuery()` function returns a `void*` for the query instance struct, which will be passed to the other query functions (`QueryDelete()`, `QueryFetch()`, and `QueryClose()`). This structure normally points to the driver instance struct to allow easy access to queried data. `OpenQuery()` returns `NULL` if the object does not support queries or if an error occurs, in which case `mssError()` should be called before returning. -All object drivers will need to add an element to the obj->Pathname structure to indicate the path to the child object being returned. This will involve a process somewhat like this: (given that new_name is the new object's name, qy is the current query structure, which contains a field 'Parent' that points to the inf_v originally returned by Open, and where the inf_v contains a field Obj that points to the Object structure containing a Pathname structure) - int cnt; - pObject obj; - char* new_name; - pMyDriversQueryInf qy; +### Function: QueryDelete() +*(Optional)* +```c +int xxxQueryDelete(void* qy_v, pObjTrxTree* oxt); +``` +Deletes results in the query result set, optionally matching a certain criteria. `QueryDelete()` is passed two parameters: - /** Build the filename. **/ - cnt = snprintf(obj->Pathname->Pathbuf, 256, "%s/%s", - qy->Parent->Obj->Pathname->Pathbuf,new_name); - if (cnt < 0 || cnt >= 256) return NULL; - obj->Pathname->Elements[obj->Pathname->nElements++] = - strrchr(obj->Pathname->Pathbuf,'/')+1; +| Parameter | Type | Description +| --------- | ------------- | ------------ +| qy_v | void* | A query instance pointer (returned from `QueryOpen()`). +| oxt | pObjTrxTree* | The transaction tree pointer for the `OBJDRV_C_TRANS` capability. -QueryDelete is passed the qy_v void* parameter, and an oxt parameter. It should return 0 on successful deletion, and -1 on failure. +`QueryDelete()` returns 0 to indicate a successful deletion, or -1 to indicate failure, in which case `mssError()` should be called before returning. -QueryClose is also passed qy_v and oxt. It should close the query, whether or not QueryFetch has been called enough times to enumerate all of the query results. +If a delete is needed and this method is not implemented, the OSML will iterate through the query results and delete the objects one by one. -### F. Managing Object Attributes -All objects will have at least some attributes. Five attributes are mandatory: 'name', 'content_type', 'inner_type', 'outer_type', and 'annotation'. All compliant drivers must implement these five attributes, all of which have a data type of DATA_T_STRING. -Currently, the OS specification includes support for the following data types: +### Function: QueryFetch() +```c +void* xxxQueryFetch(void* qy_v, pObject obj, int mode, pObjTrxTree* oxt); +``` +The `QueryFetch()` function fetches a driver instance pointer (aka. an `inf_v` pointer) to a child object, or `NULL` if there are no more child objects. It may be helpful to think of `QueryFetch()` as similar to an alternate form of `Open()`, even if your driver does not implement the functionality to `Open()` every object that can be found with `QueryFetch()`. In fact, some drivers may use an internal `Open()` function to generate the opened objects. + +`QueryFetch()` takes four parameters: + +| Parameter | Type | Description +| ---------- | ------------- | ------------ +| qy_v | void* | A query instance struct (returned by `OpenQuery()`). +| obj | pObject | An object structure that the OSML uses to track the newly queried child object. +| mode | int | The open mode for the new object, the same as `obj->Mode` in `Open()`. +| oxt | pObjTrxTree* | The transaction tree pointer for the `OBJDRV_C_TRANS` capability. + +The driver should add an element to the `obj->Pathname` structure to indicate the path of the returned child object. This will involve a process somewhat like this, where: +- `new_name : char*` is the new object's name. +- `qy : pMyDriversQueryInf` is the current query structure. +- `qy->Parent->Obj->Pathname : pPathname` points to the affected Pathname struct. + +```c +int count; +pObject obj; +char* new_name; +pMyDriversQueryInf qy; + +/** Build the new filename. **/ +count = snprintf(obj->Pathname->Pathbuf, 256, "%s/%s", qy->Parent->Obj->Pathname->Pathbuf, new_name); +if (count < 0 || 256 <= count) goto error_handling; +obj->Pathname->Elements[obj->Pathname->nElements++] = strrchr(obj->Pathname->Pathbuf, '/') + 1; +``` -- DATA_T_INTEGER - 32-bit signed integer. -- DATA_T_STRING - Zero-terminated ASCII string. -- DATA_T_DOUBLE - Double-precision floating point. -- DATA_T_DATETIME - date/time structure. -- DATA_T_MONEY - money data type. +### Function: QueryCreate() +```c +void* xxxQueryCreate(void* qy_v, pObject new_obj, char* name, int mode, int permission_mask, pObjTrxTree *oxt); +``` +The `QueryCreate()` function is just a stub function that is not fully implemented yet. Simply not providing it (aka. setting the location in the driver initialization struct to `NULL`) is fine. -True/false or on/off attributes should be treated as DATA_T_INTEGER for the time being with values of 0 and 1. -Here is a description of the functionality of the five mandatory attributes: +### Function: QueryClose() +```c +int xxxQueryClose(void* qy_v, pObjTrxTree* oxt); +``` +The `QueryClose()` function closes a query instance, freeing all allocated data and releasing all shared memory such as open connections, files, or other driver instances. This function operates very similarly to `Close()`, documented in detail above. The query should be closed, whether or not `QueryFetch()` has been called enough times to enumerate all of the query results. -| Attribute | Description -| -------------- | ------------ -| 'name' | This attribute indicates the name of the object, just as it should appear in any directory listing. The name of the object must be unique for the directory it is in. -| 'content_type' | This is the type of the object's content, given as a MIME-type. -| 'annotation' | This is an annotation for the object. While users may not assign annotations to all objects, each object should be able to have an annotation. Normally the annotation is a short description of what the object is. For the Sybase driver, annotations for rows are created by assigning an 'expression' to the table in question, such as 'first_name + last_name' for a people table. -| 'inner_type' | An alias for 'content_type'. Both should be supported. -| 'outer_type' | This is the type of the object itself (the container). -A sixth attribute is not mandatory, but is useful if the object might have content that could in turn be a node object (be interpreted by another driver). This attribute is 'last_modification', of type DATA_T_DATETIME, and should indicate when the object's content was last updated or modified. +### Object Attributes +All objects can have attributes, and there are five required attributes that all drivers must implement (explained below). -The first function to be aware of is the GetAttrType function. This routine takes the inf_v pointer, the name of the attribute in question, and the oxt* pointer. It should return the DATA_T_xxx value for the data type of the attribute. +Currently, the OS specification includes support for the following data types: -Next is the GetAttrValue function, which takes four parameters: the inf_v pointer, the name of the attribute, a void pointer pointing to where the attribute's value will be put, and the oxt* pointer. The way the value pointer is handled depends on the data type. For DATA_T_INTEGER types, the value pointer is assumed to be pointing to a 32-bit integer where the integer value can be written. For DATA_T_ STRING types, the value pointer is assumed to be pointing to an empty pointer location where a pointer to the string can be stored. For DATA_T_DATETIME types, the value pointer is assumed to be pointing to an empty pointer where a pointer to a date time structure (from obj.h) can be stored. And for double values, the value pointer points to a double value where the double will be stored. In this way, integer and double values are returned from GetAttrValue by value, and string or datetime values are returned from GetAttrValue by reference. Items returned by reference must be guaranteed to be valid until the object is closed, or another GetAttrValue or SetAttrValue call is made. This function should return -1 on a non-existent attribute, 0 on success, and 1 if the value is NULL or unset. +| Name | Description +| ----------------- | ------------ +| `DATA_T_INTEGER` | 32-bit signed integer. +| `DATA_T_STRING` | Null-terminated ASCII string. +| `DATA_T_DOUBLE` | Double-precision floating point number. +| `DATA_T_DATETIME` | Date/time structure. +| `DATA_T_MONEY` | Money structure. -UPDATE ON GETATTR/SETATTR: These functions now, instead of taking a void* pointer for the value, take a pObjData pointer, which points to an ObjData structure. The POD(x) macro can be used to typecast appropriate pointers to a pObjData pointer. The ObjData structure is a UNION type of structure, allowing easy manipulation of data of various types. See 'datatypes.h'. Note that this is binary compatible with the old way of using a typecasted void pointer. +See `datatypes.h` for more information. -The SetAttrValue function works much the same way as GetAttrValue, just with the information moving in the opposite direction. The third parameter, void* value, is treated in the same manner. +For `true`/`false` or `on`/`off` attributes, use `DATA_T_INTEGER` where 0 indicates `false` and 1 indicates `true`. -The GetFirstAttr and GetNextAttr functions each take two parameters, the inf_v pointer and the oxt* pointer, and are used to iterate through the non-mandatory attributes for the object. GetFirstAttr should return a string naming the first attribute, and GetNextAttr should iterate through subsequent attributes. When the attributes are exhausted, these functions should return NULL. The attributes 'name', 'annotation', and 'content_type' should not be returned. If the object has no other attributes, GetFirstAttr should return NULL. +The following five attributes are required (all are of type `DATA_T_STRING`): -AddAttr is used to add a new attribute to an existing object. Not all objects support this, and many will refuse the operation. The parameters are as follows: void* inf_v, char* attrname, int type, void* value, and pObjTrxTree* oxt. +| Attribute | Description +| ------------ | ------------ +| name | The name of the object, just as it appears in any directory listing. The name of the object must always be unique for its level in the tree (e.g. a unique file name in a directory, the primary key of a database row, etc.). +| annotation | A short description of the object. While users may not assign annotations to all objects, each object should be able to have an annotation. For example, in the Sybase driver, annotations for rows are created by assigning an 'expression' to the table in question, such as `first_name + last_name` for a people table. This attribute should _never_ be null, however, it can be an empty string (`""`) if the driver has no meaningful way to provide an annotation. +| content_type | The type of the object's content, given as a MIME-type. Specify `"system/void"` if the object does not have content. +| inner_type | An alias for 'content_type'. Both should be supported. +| outer_type | This is the type of the object itself (the container). -OpenAttr is used to open an attribute for objRead/objWrite as if it were an object with content. Not all object drivers will support this; this routine should return an inf_v pointer for the new descriptor, and takes four parameters: void* inf_v, char* attrname, int mode, and pObjTrxTree* oxt. The mode is used in the same manner as the Open function. +The `last_modification : DATA_T_DATETIME` attribute is a sixth, optional attribute that may be useful in some situations. This attribute should indicate the last time that the object's content was modified or updated. -### G. Managing Object Methods -Objects may optionally have methods associated with them. Each method is given a unique name within the object, and can take a single string parameter. Three functions exist for managing methods. -The first two functions, GetFirstMethod and GetNextMethod, work identically to their counterparts dealing with attributes. The third function, ExecuteMethod, starts a method executing. This function takes four parameters: the inf_v pointer, the name of the method, the optional string parameter, and the oxt* pointer. +### Function: GetAttrType() +```c +int xxxGetAttrType(void* inf_v, char* attr_name, pObjTrxTree* oxt); +``` +The `GetAttrType()` function returns DATA_T_xxx value for the datatype of the requested. It takes three parameters: -## III Reading the Node Object -The Node object has content which controls what resource(s) this driver will actually access, so it is important for the driver to access the node object's content. If the driver's node objects are structure files (which is normally the case when dealing with a remote network resource), then the SN module can make opening the node object much more painless. It also performs caching automatically to improve performance. +| Parameter | Type | Description +| --------- | ------------- | ------------ +| inf_v | void* | A driver instance pointer (returned from `Open()` or `QueryFetch()`). +| attr_name | char* | The name of the attribute to be queried. +| oxt | pObjTrxTree* | The transaction tree pointer for the `OBJDRV_C_TRANS` capability. -Note that the Node object will technically ALREADY BE OPEN as an object in the objectsystem. The OSML does that for you. If your driver will not use the SN/ST modules, then it should read the node object via the normal objRead() function, and write it via objWrite(). Your driver should NEVER objClose() the node object! The OSML does that for you. +This function should return `DATA_T_UNAVAILABLE` if the requested attribute does not exist on the driver instance. It should return -1 to indicate an error, in which case `mssError()` should be called before returning. -An objectsystem driver will commonly configure itself by reading a text file at the root of its object subtree. There are two main modules available for making this easier. +For example, calling the following on any driver should return `DATA_T_STRING`. +```c +int datatype = driver->GetAttrType(inf_v, 'name', oxt); +``` -The normal way to manage object parameters is to use a structure file. Structure files are a little more complicated, but allow for arrays of values for a given attribute name, as well as allowing for tree- structured hierarchies of attributes and values. Structure files are accessed via the stparse and st_node modules. The stparse module provides access to the individual attributes and groups of attributes, and the st_node module loads and saves the structure file heirarchies as a whole. The st_node module also provides node caching to reduce disk activity and eliminate repeated parsing of one file. -For example, if two sessions open two files, '/test1.rpt' and '/test2.rpt' the st_node (SN) module will cache the internal representations of these node object files, and for successive uses of these node objects, the physical file will not be re-parsed. The file will be re-parsed if its timestamp changes. +### Function: GetAttrValue() +```c +int xxxGetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val, pObjTrxTree* oxt); +``` +The `GetAttrValue()` function takes four parameters: -If the underlying object does not support the attribute "last_modification" (assumed to be the timestamp), then SN prints a warning. In essence, this warning indicates that changes to the underlying object will not trigger the SN module to re-read the structure file defining the node object. Otherwise, the SN module keeps track of the timestamp, and if it changes, the node object is re-read and re-parsed. +| Parameter | Type | Description +| --------- | ------------- | ------------ +| inf_v | void* | A driver instance pointer (returned from `Open()` or `QueryFetch()`). +| attr_name | char* | The name of the attribute to be queried. +| datatype | int | The expected datatype for the requested value. +| val | pObjData | A pointer to a location where the value of the attribute should be stored. +| oxt | pObjTrxTree* | The transaction tree pointer for the `OBJDRV_C_TRANS` capability. + +The value pointer points to a union struct which can hold one of several types of data in the same memory location. Which type of data is expected depends on the value of the `datatype` parameter. +| Field | Datatype | Description +| ----------- | ------------------ | ----------- +| `Integer` | `DATA_T_INTEGER` | An int where the value should be written. +| `String` | `DATA_T_STRING` | A `char*` where a pointer to the string should be written. +| `Double` | `DATA_T_DOUBLE` | A double where the double should be written. +| `DateTime` | `DATA_T_DATETIME` | A `pDateTime` where a pointer to the `DateTime` struct (see [`datatypes.h`](../centrallix/include/datatypes.h)) should be written. +| `IntVec` | `DATA_T_INTVEC` | A `pIntVec` where a pointer to the `IntVec` struct (see [`datatypes.h`](../centrallix/include/datatypes.h)) should be written. +| `StringVec` | `DATA_T_STRINGVEC` | A `pStringVec` where a pointer to the `StringVec` struct (see [`datatypes.h`](../centrallix/include/datatypes.h)) should be written. +| `Money` | `DATA_T_MONEY` | A `pMoneyType` where a pointer to the `MoneyType` struct (see [`datatypes.h`](../centrallix/include/datatypes.h)) should be written. +| `Generic` | ? | A `void*` to somewhere where something should be written should be written (usually implementation dependant). + +In this way, `int`s and `double`s can be returned by value while other types are returned by reference. Items returned by reference must be guaranteed to be valid until either the object is closed, or another `GetAttrValue()` or `SetAttrValue()` call is made on the same driver (which ever happens first). + +This function should return 0 on success, 1 if the value is `NULL` or undefined / unset, or -1 on a non-existent attribute or other error. + +- 📖 **Note**: The caller can use the `POD(x)` macro to typecast appropriate pointers to the `pObjData` pointer. For example: + ```c + char* name; + if (xxxGetAttrValue(obj, "name", DATA_T_STRING, POD(&name)) != 0) + goto error_handling; + printf("Object name: \"%s\"\n", name); + ``` + +- 📖 **Note**: In legacy code, a typecasted `void*` was used instead of a `pObjData` pointer used today. This method was binary compatible the current solution because of the union struct implementation (See [`datatypes.h`](../centrallix/include/datatypes.h) for more information). + + +### Function: SetAttrValue() +```c +int xxxSetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val, pObjTrxTree* oxt); +``` +The `SetAttrValue()` function is the same as `GetAttrValue()`, however it sets the value by reading it from the `val` parameter instead of getting the value by writing it to the `val` parameter. The return value is also identical, and `mssError()` should be invoked on failure, or if setting attributes programatically is not implemented. -The driver's first course of action to obtain node object data is to open the node object with the SN module. The SN module's functions are listed below: -### pSnNode snReadNode(pObject obj) -This function reads a Structure File from the already-open node object which is passed in the "obj" parameter in the xxxOpen() routine. The "obj" parameter has an element, obj->Prev, which is a link to the node object as opened by the previous driver in the OSML's chain of drivers for handling this open(). All you need to know to get the parsed node object is the following: +### Function: GetFirstAttr() & GetNextAttr() +```c +char* xxxGetFirstAttr(void* inf_v, pObjTrxTree* oxt); +char* xxxGetNextAttr(void* inf_v, pObjTrxTree* oxt); +``` +These functions return the names of attributes that can be queried on an object. They both take the same two parameters. - pSnNode node; +| Parameter | Type | Description +| --------- | ------------- | ------------ +| inf_v | void* | A driver instance pointer (returned from `Open()` or `QueryFetch()`). +| oxt | pObjTrxTree* | The transaction tree pointer for the `OBJDRV_C_TRANS` capability. - node = snReadNode(obj->Prev); +These functions should only return the names of significant values, so `name`, `annotation`, etc. should not be returned from these functions, even though they are required to be valid values for any object. Typically, this is implemented by `GetFirstAttr()` resetting some internal value in the driver `inf_v`, then returning the result of `GetNextAttr()`. `GetNextAttr()` extracts a string from an array or other list of valid attribute names for the object and increments the internal counter. Once the attributes are exhausted, `GetNextAttr()` returns `NULL` and `GetFirstAttr()` can be used to restart and begin querying elements from the start of the list again. If an object has no significant attributes, `GetFirstAttr()` and `GetNextAttr()` both return NULL. -The returned node structure is managed by the SN module and need not be nmFree()ed. The only thing that must be done is that the driver should increment the node structure's link count like this: - node->OpenCnt++; +### Function: AddAttr() +```c +int clusterAddAttr(void* inf_v, char* attr_name, int type, pObjData val, pObjTrxTree* oxt); +``` +The `AddAttr()` function adds a new attribute to an existing object. Not all objects support this, and many will refuse the operation. The parameters are the same as those of `GetAttrValue()` and `SetAttrValue()`, documented in detail above. -When closing an object (and thus releasing a reference to the Node structure), the driver should decrement the link count. -### pSnNode snNewNode(pObject obj, char* content_type) -This function creates a new node object with a given content type. The open link count should be incremented as appropriate, as before with snReadNode(). +### Function: OpenAttr() +```c +void* clusterOpenAttr(void* inf_v, char* attr_name, int mode, pObjTrxTree* oxt); +``` +The `OpenAttr()` function is used to open an attribute for `objRead()`/`objWrite()` as if it were an object with content. Not all object drivers will support this, and many will refuse the operation. - pSnNode node; +This function takes 4 parameters. `inf_v`, `attr_name`, and `oxt` are the same as they are for `GetAttrValue()` and `SetAttrValue()`. `mode` is the same as it is for `Open()`. This function should return an `inf_v` pointer for the new descriptor (similar to `Open()` and `QueryFetch()` above). - node = snNewNode(obj->Prev, "system/structure"); -The "system/structure" argument is the type that will be assigned to the newly created node object. Note that the underlying object must already exist in order for this to create a node object as that object's content. Normally the OSML does this for you by commanding the previous driver (handling obj->Prev) to create the underlying object in question. +### Function: ExecuteMethod() +```c +int clusterExecuteMethod(void* inf_v, char* method_name, pObjData param, pObjTrxTree* oxt); +``` +The `ExecuteMethod()` function is used to execute a method on an object. This feature is rarely used, but some drivers have created methods for actions like dropping their cache or printing debug information. Each method has a unique name within that object, and can take a single string parameter. -### int snWriteNode(pSnNode node) -This function writes a node's internal representation back out to the node file. The node's status (node->Status) should be set to SN_NS_DIRTY in order for the write to actually occur. Otherwise, snWriteNode() does nothing. +The `ExecuteMethod()` function takes four parameters: -### int snDeleteNode(pSnNode node) -This function deletes a node file. At this point, does not actually delete the file but instead just removes the node's data structures from the internal node cache. +| Parameter | Type | Description +| ----------- | ------------- | ------------ +| inf_v | void* | A driver instance pointer (returned from `Open()` or `QueryFetch()`). +| method_name | char* | The name of the method to be executed. +| param | pObjData | A pointer to a location where the string value of the param is stored. +| oxt | pObjTrxTree* | The transaction tree pointer for the `OBJDRV_C_TRANS` capability. -### int snGetSerial(pSnNode node) -This function returns the serial number of the node. Each time the node is re-read because of modifications to the file or is written via snWriteNode because of modifications to the internal structure, the serial number is increased. This is a good way for a driver to refresh internal information that it caches should it determine a node object has changed. +- 📖 **Note**: The `pObjData` type of the `param` parameter makes it possible that other types of parameters could be supported in the future, however, this is not currently implemented. -The stparse module is used to examine the parsed contents of the node file. A node file using the stparse module (and thus st_node module) has a structure file format; see StructureFile.txt. The file format is a tree structure with objects, subobjects, and attributes. The internal parsed representation is a tree, with each tree node being an object in the structure file, and each node having attributes, each of which is also a tree node. Thus, there are three different node types in the tree representation: the top-level ST_T_STRUCT element, which can contain subgroups and attributes; a mid-level ST_T_SUBGROUP tree node, which has a content type, name, and can contain attributes and other subgroups, and lastly a ST_T_ATTRIB node which contains an attribute name and attribute values, either integer or string, and optional lists of such up to 64 items in length. To use this module, include the file stparse.h. +The function returns 0 on success, and -1 to indicate an error, in which case `mssError()` should be called before returning. -The following functions are used to manage a parsed structure file: -### pStructInf stParseMsg(pFile inp_fd, int flags) -This function is internal-use-only and is used by the st_node module to parse a structure file. +### Function: GetFirstMethod() & GetNextMethod() +```c +char* xxxGetFirstMethod(void* inf_v, pObjTrxTree* oxt); +char* xxxGetNextMethod(void* inf_v, pObjTrxTree* oxt); +``` +These functions work the same as `GetFirstAttr()` and `GetNextAttr()` (respectively), except that they return the method names instead of the attribute names. -### pStructInf stParseMsgGeneric(void* src, int (*read_fn)(), int flags) -This function is also internal-use-only (unless you want to parse the file manually without st_node's help) and is used to parse the structure file when the structure file isn't being read from an MTASK pFile descriptor. This is always the case, as the structure file data is being read from a pObject pointer. In such a case, src is the pObject pointer and read_fn is objRead(). -### int stGenerateMsg(pFile out_fd, pStructInf info, int flags) -This function, also internal-use only, is used by the st_node module to write a structure file whose internal representation is given in the 'info' parameter. +### Function: PresentationHints() +```c +pObjPresentationHints xxxPresentationHints(void* inf_v, char* attr_name, pObjTrxTree* oxt); +``` +The `PresentationHints()` function allows the caller to request extra information about a specific attribute on a specific driver instance object. Most of this information is intended to be used for displaying the attribute in a user interface, although it can also be useful for general data validation. As such, many drivers may not implement this function. -### int stGenerateMsgGeneric(void* dst, int (*write_fn)(), pStructInf info, int flags) -This function is stParseMsgGeneric's converse. +The `PresentationHints()` function takes three parameters: -### pStructInf stCreateStruct(char* name, char* type) -This function creates a new top-level tree item of type ST_T_STRUCT, with a given name and content-type. +| Parameter | Type | Description +| --------- | ------------- | ------------ +| inf_v | void* | A driver instance pointer (returned from `Open()` or `QueryFetch()`). +| attr_name | char* | The name of the requested attribute. +| oxt | pObjTrxTree* | The transaction tree pointer for the `OBJDRV_C_TRANS` capability. + +The returns a new pObjPresentationHints struct on success, or NULL to indicate an error, in which case `mssError()` should be called before returning. This struct should be allocated using `nmMalloc()`, and memset to zero, like this: +```c +pObjPresentationHints hints = nmMalloc(sizeof(ObjPresentationHints)); +if (hints == NULL) goto error_handling; +memset(hints, 0, sizeof(ObjPresentationHints)); +``` -### pStructInf stAddAttr(pStructInf inf, char* name) -This function adds a node of type ST_T_ATTRIB to either a ST_T_STRUCT or ST_T_SUBGROUP type of node, with a given name and no values associated with that name (see AddValue, below). The new attribute tree node is linked under the 'inf' node passed, and is returned. +The return value, `hints : ObjPresentationHints`, contains the following useful fields which the function should set to give various useful information about the attribute. +- `hints->Constraint : void*`: An expression for determining if a value is valid. +- `hints->DefaultExpr : void*`: An expression defining the default value. +- `hints->MinValue : void*`: An expression defining the minimum valid value. +- `hints->MaxValue : void*`: An expression defining the maximum valid value. +- `hints->EnumList : XArray`: If the attribute is a string enum, this XArray lists the valid string values. +- `hints->EnumQuery : char*`: A query string which enumerates the valid values a string enum attribute. +- `hints->Format : char*`: A presentation format for datetime or money types, such as `"dd MMM yyyy HH:mm"` or `"$0.00"`. See `obj_datatypes.c` (near line 100) for more information creating a presentation format. +- `hints->AllowChars : char*`: An array of all valid characters for a string attribute, NULL to allow all characters. +- `hints->BadChars : char*`: An array of all invalid characters for a string attribute. If a character appears in both `hints->BadChars` and `hints->AllowChars`, the character should be rejected. +- `hints->Length : int`: The maximum length of data that can be included in a string attribute. +- `hints->VisualLength : int`: The length that the attribute should be displayed if it is show to the user. +- `hints->VisualLength2 : int`: The number of lines to use in a multi-line edit box for the attribute. +- `hints->BitmaskRO : unsigned int`: If the value is an integer that represents a bit mask, _this_ bit mask shows which bits of that bitmask are read-only. +- `hints->Style : int`: Style flags, documented below. +- `hints->StyleMask : int`: A mask for which style flags were set and which were left unset / undefined. +- `hints->GroupID : int`: Used to assign attributes to groups. Use -1 if the attribute is not in a group. +- `hints->GroupName : char*`: The name of the group to which this attribute belongs, or NULL if it is ungrouped or if the group is named elsewhere. +- `hints->OrderID : int`: Used to specify an attribute order. +- `hints->FriendlyName : char*`: Used to specify a "display name" for an attribute (e.g. `n_rows` might have a friendly name of `"Number of Rows"`). Should be [`nmSysMalloc()`](#nmsysmalloc)ed, often using [`nmSysStrdup()`](#nmsysstrdup). + +- ⚠️ **Warning**: Behavior is undefined if: + - The data is longer than length. + +The `hints->Style` field can be set with several useful flags. To specify that a flag is not set (e.g. to specify explicitly that a field does allow `NULL`s), set the coresponding bit in the `hints->StyleMask` field while leaving the the bit in the `hints->Style` field set to 0. + +The following macros are provided for setting style flags: +- `OBJ_PH_STYLE_BITMASK`: The items in `hints->EnumList` or `hints->EnumQuery` are bitmasked. +- `OBJ_PH_STYLE_LIST`: List-style presentation should be used for the values of an enum attribute. +- `OBJ_PH_STYLE_BUTTONS`: Radio buttons or check boxes should be used for the presentation of enum attribute values. +- `OBJ_PH_STYLE_NOTNULL`: The attribute does not allow `NULL` values. +- `OBJ_PH_STYLE_STRNULL`: An empty string (`""`) should be treated as a `NULL` value. +- `OBJ_PH_STYLE_GROUPED`: The GroupID should be checked and so that fields can be grouped together. +- `OBJ_PH_STYLE_READONLY`: The user is not allowed to modify this attribute. +- `OBJ_PH_STYLE_HIDDEN`: This attribute should be hidden and not presented to the user. +- `OBJ_PH_STYLE_PASSWORD`: Values in this attribute should be hidden, such as for passwords. +- `OBJ_PH_STYLE_MULTILINE`: String values should allow multiline editting. +- `OBJ_PH_STYLE_HIGHLIGHT`: This attribute should be highlighted when presented to the user. +- `OBJ_PH_STYLE_LOWERCASE`: This attribute only allows lowercase characters. +- `OBJ_PH_STYLE_UPPERCASE`: This attribute only allows uppercase characters. +- `OBJ_PH_STYLE_TABPAGE`: Prefer the tab-page layout for grouped fields. +- `OBJ_PH_STYLE_SEPWINDOW`: Prefer separate windows for grouped fields. +- `OBJ_PH_STYLE_ALWAYSDEF`: Always reset the default value when this attribute is modified. +- `OBJ_PH_STYLE_CREATEONLY`: This attribute is writeable only when created, after that it is read only. +- `OBJ_PH_STYLE_MULTISEL`: This enum attribute can accept more than one value from the list of valid values. Think of using checkboxes instead of radio buttons (although the flag does requirement this UI decision). +- `OBJ_PH_STYLE_KEY`: This attribute is a primary key. +- `OBJ_PH_STYLE_APPLYCHG`: Presentation hints should be applied on DataChange instead of on DataModify. + + +### Function: Info() +```c +int xxxInfo(void* inf_v, pObjectInfo info); +``` +The `Info()` function allows the caller to request extra information about a specific driver instance object. It takes two parameters: -### pStructInf stAddGroup(pStructInf inf, char* name, char* type) -This function adds a node of type ST_T_SUBGROUP to either a ST_T_SUBGROUP or ST_T_STRUCT tree node, with a given name and content type (content type such as 'report/query'). +| Parameter | Type | Description +| --------- | ------------- | ------------ +| inf_v | void* | A driver instance pointer (returned from `Open()` or `QueryFetch()`). +| info | pObjectInfo | A driver info struct allocated by the caller which the driver sets with information. + +The `pObjectInfo` struct has two fields: `Flags` and `nSubobjects`. This function should set `info->Flags` to 0 (to ensure no uninitialized noise gets into the data), then & it with all of the following flags that apply to that object. +- `OBJ_INFO_F_CAN_HAVE_SUBOBJ` / `OBJ_INFO_F_CANT_HAVE_SUBOBJ`: Indicates that the object can or cannot have subobjects. +- `OBJ_INFO_F_HAS_SUBOBJ` / `OBJ_INFO_F_NO_SUBOBJ`: Indicates that the object has or does not have subobjects. +- `OBJ_INFO_F_SUBOBJ_CNT_KNOWN`: Indicates that we know the number of subobjects. If set, the count should be stored in `info->nSubobjects`. +- `OBJ_INFO_F_CAN_HAVE_CONTENT` / `OBJ_INFO_F_CANT_HAVE_CONTENT`: Indicates that the object can or cannot have content (see `Read()` / `Write()`). +- `OBJ_INFO_F_HAS_CONTENT` / `OBJ_INFO_F_NO_CONTENT`: Indicates that this object does or does not have content (see `Read()` / `Write()`). +- `OBJ_INFO_F_CAN_SEEK_FULL`: Seeking is fully supported (both forwards and backwards) on the object. +- `OBJ_INFO_F_CAN_SEEK_REWIND`: Seeking is only supported with an offset of `0`. +- `OBJ_INFO_F_CANT_SEEK`: Seeking is not supported at all. +- `OBJ_INFO_F_CAN_ADD_ATTR` / `OBJ_INFO_F_CANT_ADD_ATTR`: Indicates that the object does or does not allow attributes to be added with the [AddAttr()](#function-addattr) function. +- `OBJ_INFO_F_SUPPORTS_INHERITANCE`: Indicates that the object supports inheritance through attributes such as `cx__inherit`. +- `OBJ_INFO_F_FORCED_LEAF`: Indicates that the object is forced to be a 'leaf' unless ls__type used. +- `OBJ_INFO_F_TEMPORARY`: Indicates that this is a temporary object without a valid pathname. + +The function returns 0 on success, and -1 to indicate an error, in which case `mssError()` should be called before returning. + + +### Function: Commit() +```c +int xxxCommit(void* inf_v, pObjTrxTree *oxt); +``` +The `Commit()` function immediately completes the current transaction, ensuring that all writes are applied to the affected data before returning. For example, if the current transaction involves creating a database row, this call will ensure that the row is created and the transaction is closed before returning. This allows the caller to ensure that actions in a transaction have been completed without needing to close the object, which they may wish to continue using. -### int stAddValue(pStructInf inf, char* strval, int intval) -This function adds a value to an attribute, and can be called multiple times on an attribute to add a list of values. If 'strval' is not null, a string value is added, otherwise an integer value is added. The string is NOT copied, but is simply pointed-to. If the string is non-static, and has a lifetime less than the ST_T_ATTRIB tree node, then the following procedure must be used: +The `Commit()` function takes two parameters: - char* ptr; - char* nptr; - pStructInf attr_inf; +| Parameter | Type | Description +| --------- | ------------- | ------------ +| inf_v | void* | A driver instance pointer (returned from `Open()` or `QueryFetch()`). +| oxt | pObjTrxTree* | The transaction tree pointer for the `OBJDRV_C_TRANS` capability. - attr_inf = stAddAttr(my_parent_inf, "myattr"); - nptr = (char*)malloc(strlen(ptr)+1); - if (!nptr) go_report_the_error_and_return; - strcpy(nptr, ptr); - stAddValue(attr_inf, nptr, 0); - attr_inf->StrAlloc[0] = 1; +The function returns 0 on success, and -1 to indicate an error, in which case `mssError()` should be called before returning. -By following this method (making a copy of the string and then setting the StrAlloc value for that string), when the StructInf tree node is freed by the stparse module, the string will auto- matically be freed as well. -### pStructInf stLookup(pStructInf inf, char* name) -This routine examines all sub-tree-nodes, both group and attribute nodes, for a group or attribute with the given name. If it finds one, it returns a pointer to the sub-node, otherwise NULL. +### Function: GetQueryCoverageMask() +```c +int xxxGetQueryCoverageMask(pObjQuery this); +``` +This function is only intended to be used by the MultiQuery module. Any other driver should not provide this function by setting the appropriate struct field to `NULL`. -### int stAttrValue(pStructInf inf, int* intval, char** strval, int nval) -This function returns the value of the given attribute in an ST_T_ATTRIB tree node. If a string value is being returned, pass a pointer to the string pointer. If an integer value is being returned, pass a pointer to an integer. The pointer not being used must be left NULL. 'nval' can normally be 0, but if the attribute has several values, setting nval to 1,2,3, etc., returns the 2nd, 3rd, 4th item, respectively. This routing returns -1 if the attribute value did not exist or if the wrong type was requested. It also returns -1 if 'inf' was NULL. -It is common practice to use the stLookup and stAttrValue functions together to retrieve values, and search for an attribute StructInf and retrieve its value in one operation: +### Function: GetQueryIdentityPath() +```c +int xxxGetQueryIdentityPath(pObjQuery this, char* pathbuf, int maxlen); +``` +This function is only intended to be used by the MultiQuery module. Any other driver should not provide this function by setting the appropriate struct field to `NULL`. - pStructInf inf; - char* ptr; - if (stAttrValue(stLookup(inf, "myattr"),NULL,&ptr,0) == 0) - { - printf("%s is the value\n", ptr); - } -### int stFreeInf(pStructInf this) -This function is used to free a StructInf tree node. It will free any sub-nodes first, so if that is not desired, be sure to disconnect them by removing them from the SubInf array and appropriately adjusting the nSubInf counter, and setting the SubInf array position to NULL. This function also disconnects the tree node from its parent, if any, so if the parent is already free()'d, be sure to set the node's Parent pointer to NULL. Any strings marked allocated with the StrAlloc flags will be free()'d. +## III Reading the Node Object +A driver will commonly configure itself by reading text content from its node object file, at the root of its object subtree. This content may define what resource(s) a driver should provide, how it should access or compute them, and other similar information. Most drivers use the structure file format for their node objects because SN module makes parsing, reading, and writing these files easier. It also performs caching automatically to improve performance. -It is also common practice to bypass the stXxx() functions entirely and access the elements of the StructInf structures themselves. This is not forbidden, and may be done. See the file stparse.h for a description of the structure. For example, +- 📖 **Note**: The node object will **already be open** as an object in the ObjectSystem: The OSML does this for each driver. If a driver does not use the SN/ST modules, then it should read and write the node object directly with `objRead()` and `objWrite()`. A driver should **NEVER** `objClose()` the node object! The OSML handles that. - pStructInf inf; - int i; +Although using the structure file format may be complex, it allows significant flexibility, as well as greater consistency across drivers. The use of this shared syntax across different drivers makes learning to use a new driver far easier than it would be if they all used unique, custom syntax for specifying properties. In the structure file syntax, data is structured in hierarchies where each sub-object can have named attributes as well as sub-objects. Centrallix has many examples of this, including any `.qy`, `.app`, `.cmp`, or `.cluster` file. - for(i=0;inSubInf;i++) - { - if (inf->SubInf[i]->Type == ST_T_ATTRIB) - { - /** do stuff with attribute... **/ - } - } +Structure files are accessed via the st_node (SN) and stparse (SP) modules. The st_node module loads and saves the structure file heirarchies as a whole. It also manages caching to reduce disk activity and eliminate repeated parsing of the same file. The stparse module provides access to the individual attributes and groups of attributes within a node structure file. -## IV Memory Management in Centrallix -Centrallix has its own memory manager that caches freshly-deallocated blocks of memory in lists according to size so that they can be quickly reallocated. This memory manager also catches double-freeing of blocks, making debugging of memory problems a little easier. +For example, if two sessions open two files, `/test1.rpt` and `/test2.rpt` the st_node module will cache the internal representations of these node object files, and for successive uses of these node objects, the physical file will not be re-parsed. The file will be re-parsed if its timestamp changes. -In addition the memory manager provides statistics on the hit ratio of allocated blocks coming from the lists vs. malloc(), and information on how many blocks of each size/type are allocated out and cached. This information can be invaluable in tracking down memory leaks. +If the underlying object does not support the attribute "last_modification" (assumed to be the timestamp), then st_node prints a warning. In essence, this warning indicates that changes to the underlying object will not trigger the st_node module to re-read the structure file defining the node object. Otherwise, the st_node module keeps track of the timestamp, and if it changes, the node object is re-read and re-parsed. -One caveat is that this memory manager does not provide a realloc() function, so the standard malloc(), free(), and realloc() must be used for blocks of memory that might grow in size. This memory manager is also perhaps not the best to use for blocks of memory of arbitrary sizes, but rather is best for allocating structures quickly that are of a specific size and belong to specific objects, such as the StructInf structure or the SnNode structure, and others. In short, use it for structures, but not for strings. +### Module: st_node +To obtain node object data, the driver should first open the node object with the st_node module. To use this module, include the file `st_node.h`, which provides the following functions (read `st_node.c` for more functions and additional information): -Empirical testing has shown an increase of performance of around 50% or more in programs with the newmalloc module in use. -The following are the functions for the newmalloc module: +### st_node: snReadNode() +```c +pSnNode snReadNode(pObject obj); +``` +The `snReadNode()` function reads a Structure File from the `obj` parameter, which should be a previously opened object. In a driver's `Open()` function, this is `obj->Prev` (the node object as opened by the previous driver in the OSML's chain of drivers). -### void* nmMalloc(int size) -This function allocates a block of the given 'size'. It returns NULL if the memory could not be allocated. +**Usage:** +```c +pSnNode node = snReadNode(obj->Prev); +if (node == NULL) goto error_handling; +``` -### void nmFree(void* ptr, int size) -This function frees the block of memory. NOTE THAT THE CALLING FUNCTION MUST KNOW THE SIZE OF THE BLOCK. Getting this wrong is very bad. For structures, this is trivial, just use sizeof() just like with nmMalloc(). +The returned node structure is managed by the SN module and does not need to be `nmFree()`ed. Instead, the driver should increment the node structure's link count for as long as it intends to use this structure, using `node->OpenCnt++;`. When the structure is no longer needed (e.g. when the driver instance is closed), the driver should decrement the link count. -### void nmStats() -Prints out statistics on how well the memory manager is doing. -### void nmRegister(int size, char* name) -Registers a name with a block size. This allows the memory manager to be intelligent when reporting block allocation counts. The first argument is the size of the block, the second, an intelligent name for that size of block. A size can have more than one name. This function is optional and need not be used except when tracking down memory leaks, but can be used freely. +### st_node: snNewNode() +```c +pSnNode snNewNode(pObject obj, char* content_type); +``` +The `snNewNode()` function creates a new node object of the given content type. The open link count should be incremented and decremented when appropriate, as with `snReadNode()`. -Typically this function is called in a module's Initialize() function on each of the structures the module uses internally. +**Usage:** +```c +pSnNode node = snNewNode(obj->Prev, "system/structure"); +if (node == NULL) goto error_handling; +``` -### void nmDebug() -Prints out a listing of block allocation counts, giving (by size): 1) number of blocks allocated but not yet freed, 2) number of blocks in the cache, 3) total allocations for this block size, and a list of names (from nmRegister()) for that block size. +In this case, the new structure file will have the type: `"system/structure"`. -### void nmDeltas() -Prints a listing of all blocks whose allocation count has changed, and by how much, since the last nmDeltas() call. This function is VERY USEFUL FOR MEMORY LEAK DETECTIVE WORK. +- 📖 **Note**: This function only creates node object content, so the underlying object file must already exist. The OSML should do this for you because the previous driver (`obj->Prev`) creates the underlying object. -### void* nmSysMalloc(int size) -Allocates memory without using the block-caching algorithm. This is roughly equivalent to malloc(), but pointers returned by malloc and this function are not compatible with each other - i.e., you cannot free() something that was nmSysMalloc'ed, nor can you nmSysFree() something that was malloc'ed. -This function is much better to use on variable-sized blocks of memory. nmMalloc is better for fixed-size blocks, such as for data structures. +### st_node: snWriteNode() +```c +int snWriteNode(pSnNode node); +``` +The `snWriteNode()` function writes a node's internal data back out to the node file, if the node's status (`node->Status`) is set to `SN_NS_DIRTY`. Otherwise, `snWriteNode()` does nothing. -### void nmSysFree(void* ptr) -Frees a block of memory allocated by nmSysMalloc, nmSysStrdup, or nmSysRealloc. -### void* nmSysRealloc(void* ptr, int newsize) -Changes the size of an allocated block of memory that was obtained via nmSysMalloc or nmSysRealloc or nmSysStrdup. The new pointer may be different if the block had to be moved. This is the rough equivalent of realloc(). Usage Note: If you are realloc'ing a block of memory, and need to store pointers to data somewhere inside the block, it is often better to store the offset rather than a full pointer, as a pointer would become invalid if a nmSysRealloc caused the block to move. +### st_node: snDelete() +```c +int snDelete(pSnNode node); +``` +The `snDelete()` function deletes a node by removing the node's data from the internal node cache. -### char* nmSysStrdup(const char* str) -Allocates memory for a copy of the string str by using the nmSysMalloc function, and then makes a copy of the string str. It is a rough equivalent of strdup(). The resulting pointer can be free'd using nmSysFree(). +- 📖 **Note**: This does not actually delete the node file. -Calling free() on a block obtained from nmMalloc() or calling nmFree() on a block obtained from malloc() will not crash the program. Instead, it will result in either inefficient use of the memory manager, or a huge memory leak, respectively. These practices will also render the statistics and block count mechanisms useless. -## V Other Utility Modules -There are many other utility modules useful in Centrallix. These include the xarray module, used for managing growable arrays; the xhash module, used for managing hash tables with no overflow problems and variable-length keys, the xstring module used for managing growable strings; the expression module used for compiling and evaluating expressions; and the mtsession module, used for managing session-level variables and reporting errors. +### st_node: snGetSerial() +```c +int snGetSerial(pSnNode node); +``` +The `snGetSerial()` function returns the serial number of the node. -### A. XArray (XA) - Arrays -The first is the xarray (XA) module. +Each time the node is re-read because of modifications to the node file or is written with because `snWriteNode()` was called after modifications to the internal structure, the serial number is increased. This is a good way for a driver to determine if the node file has changed so it can refresh internal cached data. -#### xaInit(pXArray this, int init_size) -This function initializes an allocated-but-uninitialized xarray. It makes room for 'init_size' items initially, but this is only an optimization. A typical value for init_size is 16. -#### xaDeInit(pXArray this) -This de-initializes an xarray, but does not free the XArray structure itself. +### st_node: snGetLastModification() +```c +pDateTime snGetLastModification(pSnNode node); +``` +The `snGetLastModification()` function returns the date and time that a file was last modified. This pointer will remain valid as long as the passed `pSnNode` struct remains valid. It is managed by the `st_node` module, so the caller should not free the returned pointer. This function promises not to fail and return `NULL`. -#### xaAddItem(pXArray this, void* item) -This adds an item to the array. The item can be a pointer or an integer (but ints will need a typecast on the function call). -#### xaAddItemSorted(pXArray this, void* item, int keyoffset, int keylen) -This adds an item to the xarray, and keeps the array sorted. The value for sorting is expected to begin at offset 'keyoffset' and continue for 'keylen' bytes. This only works when pointers are stored in the array, not integers. +### Module: stparse +The stparse module is used to examine the parsed contents of the node file using the structure file format; see [StructureFile.txt](../centrallix-doc/StructureFile.txt). This format is a tree structure with node objects that can each have sub-objects and named attributes. Thus, stparse uses three distinct node types: +- `ST_T_STRUCT`: The top-level node, containing the subtrees and attributes in the file. +- `ST_T_SUBGROUP`: A mid-level type for subobjects within the top-level node. Each subgroup has a content type, name, and may contain attributes and other subgroups. +- `ST_T_ATTRIB`: A bottom-level type for each named attribute. Each attribute has a name and values, either of type integer or string, and optional lists of such up to 64 items in length. -#### xaFindItem(pXArray this, void* item) -This returns the offset into the array's items of the given value. An exact match is required. The array's items are given below: +To use this module, include the file `stparse.h`, which includes the following functions (read `stparse.c` for more functions and additional information): - XArray xa; - pStructInf inf; - int item_id; - xaInit(&xa, 16); +### stparse: stStructType() +```c +int stStructType(pStructInf this); +``` +The `stStructType()` function returns the struct type of the past `pStructInf` parameter, which is either `ST_T_ATTRIB` or `ST_T_SUBGROUP` (see above). - [...] +- ⚠️ **Warning**: The node object root of type `ST_T_STRUCT` will return `ST_T_SUBGROUP` from this function. In most cases, treating this node as ust another subgroup simplifies logic for the caller. However, if you wish to avoid this behavior, read `inf->Type` (see [stparse: Using Fields Directly](#stparse-using-fields-directly) for more info). - xaAddItem(&xa, inf); - [...] +### stparse: stLookup() +```c +pStructInf stLookup(pStructInf inf, char* name); +``` +The `stLookup()` function searches all sub-tree nodes for a group or attribute of the given name and returns a pointer to it or returns `NULL` if no group or attribute was found. - item_id = xaFindItem(&xa, inf); - inf == xa.Items[item_id]; -#### xaRemoveItem(pXArray this, int index) -This function removes an item from the xarray at the given index. +### stparse: stAttrValue() +```c +int stAttrValue(pStructInf inf, int* intval, char** strval, int nval); +``` +This function gets the value of the given attribute in an `ST_T_ATTRIB` node. If the value is an integer, the caller should pass a pointer to an integer where it can be stored. If the value is a string, the caller should pass a pointer to string (aka. a `char*`) where char* for the string can be stored. The unused alternate pointer must be left `NULL`. `nval` can normally be 0, but if the attribute has several values, setting nval to 1, 2, 3, etc., returns the 2nd, 3rd, 4th item, respectively. -### B. XHash (XH) - Hash Tables -The xhash module provides an extensible hashing table interface. The hash table is a table of linked lists of items, so collisions and overflows are not a problem as in conventional hash tables. +This function returns -1 if the attribute value did not exist, if the wrong type was requested, or if 'inf' was `NULL`. -### int xhInit(pXHashTable this, int rows, int keylen) -This initializes a hash table, giving it the given number of rows, and setting the key length. For variable length keys (null- terminated strings), use a key length of 0 (zero). The 'rows' should be an odd number, preferably prime, but does not need to be. It SHOULD NOT be a power of 2. It's value is an optimization depending on how much data you expect to be in the hash table. If its value is set to 1, the hash search degenerates to a linear array search. The value should be large enough to comfortably accomodate the elements. Typical values might be 31 or 255 (though 255 is not prime). +It is common practice to use `stLookup()` and `stAttrValue()` or `stGetExpression()` (see below) together to retrieve values, for example (where `inf` is a `pStructInfo` variable from somewhere): -#### int xhDeInit(pXHashTable this) -De-initializes a hash table. +```c +char* ptr; +if (stAttrValue(stLookup(inf, "my_attr"), NULL, &ptr, 0) != 0) + goto error_handling; +printf("The value is: %s\n", ptr); +``` -#### int xhAdd(pXHashTable this, char* key, char* data) -Adds an item to the hash table, with a given key value and data pointer. Both data and key pointers must have a lifetime that exceeds the time that they item is hashed. -#### int xhRemove(pXHashTable this, char* key) -Removes an item with the given key value from the hash table. +### stparse: stGetExpression() +```c +pExpression stGetExpression(pStructInf this, int nval); +``` +Returns a pointer to an expression that represents the value of the nval-th element of the given struct. -#### char* xhLookup(pXHashTable this, char* key) -Returns the data pointer for a given key, or NULL if the item is not found. -#### int xhClear(pXHashTable this, int free_blk) -Clears all items from a hash table. If free_blk is set to 1, the items are free()'d as they are removed. +### stparse: stCreateStruct() +```c +pStructInf stCreateStruct(char* name, char* type); +``` +This function creates a new top-level tree item of type `ST_T_STRUCT`, with a given name and content-type. -### C. XString (XS) - Strings -The xstring (XS) module is used for managing growable strings. It is based on a structure containing a small initial string buffer to avoid string allocations for small strings, but with the capability of performing realloc() operations to extend the string space for storing incrementally larger strings. The interface to this module allows for strings to contain arbitrary data, even null '\0' characters mid-string. Thus it is useful as an extensible buffer module as well. -#### int xsInit(pXString this) -Initializes an XString structure, to an empty string. +### stparse: stAddAttr() +```c +pStructInf stAddAttr(pStructInf inf, char* name); +``` +This function adds a node of type `ST_T_ATTRIB` to either an `ST_T_STRUCT` or an `ST_T_SUBGROUP` type of node, with a given name and no values (see AddValue, below). The new attribute tree node is linked under the `inf` node passed, and is returned. -#### int xsDeInit(pXString this) -Deinitializes an XString structure. -#### int xsConcatenate(pXString this, char* text, int len) -Concatenates the string 'text' onto the end of the XString's value. If len is -1, all data up to the null terminater is copied. If len is set, all data up to length 'len' is copied, including possible '\0' characters. +### stparse: stAddGroup() +```c +pStructInf stAddGroup(pStructInf inf, char* name, char* type); +``` +This function adds a node of type `ST_T_SUBGROUP` to either an `ST_T_SUBGROUP` or an `ST_T_STRUCT` tree node, with a given name and content type (content type such as `"report/query"`). -#### int xsCopy(pXString this, char* text, int len) -Copies the string 'text' into the XString. Like xsConcatenate, except that the previous string contents are overwritten. -#### char* xsStringEnd(pXString this) -Returns a pointer to the end of the string. Useful for finding the end of the string without performing: +### stparse: stAddValue() +```c +int stAddValue(pStructInf inf, char* strval, int intval); +``` +This function adds a value to an attribute, and can be called multiple times on an attribute to add a list of values. If `strval` is not null, a string value is added, otherwise an integer value is added. The string is NOT copied, but is simply pointed-to. - pXString xs; + - xs->String + strlen(xs->String) +### stparse: stFreeInf() +```c +int stFreeInf(pStructInf this); +``` +This function is used to free a `StructInf` tree node. This also recursively frees sub-tree nodes, so these should be disconnected before calling if they are still needed. To do this, remove them from the SubInf array by appropriately adjusting the nSubInf counter and setting the SubInf array position to `NULL`. This function also disconnects the tree node from its parent, if any, so if the parent is already `free()`'d, prevent this behavior by setting the node's Parent pointer to `NULL` before calling this function. Any strings marked allocated with the StrAlloc flags will also be `free()`'d by this function, so update that flag if necessary. + + +### stparse: Using Fields Directly +It is also common practice to bypass the stparse functions entirely and access the elements of the `StructInf` struct directly, which is allowed. (See `stparse.h` for more information about this structure.) + +For example (assuming `inf` is a `pStructInfo` variable in scope): +```c +for (unsigned int i = 0u; i < inf->nSubInf; i++) + { + switch (inf->SubInf[i]->Type) + { + case ST_T_ATTRIB: + /** Do stuff with attribute... **/ + break; + + case ST_T_SUBGROUP: + /** Do stuff with group... **/ + break; + + ... + } + } +``` -since the xs module already knows the string length and does not have to search for the null terminator. Furthermore, since the string can contain nulls, the above statement could produce incorrect results in those situations. -The contents of the XString can be easily referenced via: - pXString xs; +## IV Module: Expression +The expression (EXP) module is used for compiling, evaluating, reverse-evaluating, and managing parameters for expression strings. The expression strings are compiled and stored in an expression tree structure. - printf("This string is %s\n", xs->String); +Expressions can be stand-alone expression trees, or they can take parameter objects. A parameter object is an open object (from `objOpen()`) whose values (attributes) are referenced within the expression string. By using such parameter objects, one expression can be compiled and then evaluated for many different objects with diverse attribute values. -IMPORTANT NOTE: Do not store pointers to values within the string while you are still adding text to the end of the string. If the string ends up realloc()ing, your pointers will be incorrect. Instead, if data in the middle of the string needs to be pointed to, store offsets from the beginning of the string, not pointers to the string. +Expression evaluation results in the top-level expression tree node having the final value of the expression, which may be `NULL`, and may be an integer, string, datetime, money, or double data type. For example, the final value of `:myobject:oneattribute == 'yes'` is the integer 1, `true`, if the attribute's value is indeed `'yes'` (and the integer 0, `false`, otherwise). -For example, this is WRONG: +Expression reverse-evaluation takes a given final value and attempts to assign values to the parameter object attributes based on the structure of the expression tree. It is akin to 'solving for X' in algebraic work, but isn't nearly that 'smart'. For example, with the previous expression, if the final value was set to 1 (`true`), then an `objSetAttrValue()` function would be called to set myobject's `oneattribute` to `yes`. Trying this with a final value of 0 (`false`) would result in no assignment to the attribute, since there would be no way of determining the proper value for that attribute (anything other than `yes` would work). - pXString xs; - char* ptr; +Reverse evaluation is typically very useful in updateable joins and views. - xsInit(&xs); - xsConcatenate(&xs, "This is the first sentence. ", -1); - ptr = xsStringEnd(&xs); - xsConcatenate(&xs, "This is the second sentence.", -1); - printf("A pointer to the second sentence is '%s'\n", ptr); +The expression module includes the following functions: -Instead, use pointer aritmetic and do this: +### expAllocExpression() +```c +pExpression expAllocExpression(); +``` +This function allocates space to store a new expression tree, returning a pointer to the allocated memory or `NULL` if an error occurs. - pXString xs; - int offset; +### expFreeExpression() +```c +int expFreeExpression(pExpression this); +``` +This function frees an expression tree allocated using `expAllocExpression()`, returning 0 if successful or -1 if an error occurs. - xsInit(&xs); - xsConcatenate(&xs, "This is the first sentence. ", -1); - offset = xsStringEnd(&xs) - xs->String; - xsConcatenate(&xs, "This is the second sentence.", -1); - printf("A pointer to the second sentence is '%s'\n",xs->String+offset); +### expCompileExpression() +```c +pExpression expCompileExpression(char* text, pParamObjects objlist, int lxflags, int cmpflags); +``` +This function compiles a textual expression into an expression tree. The `objlist` lists the parameter objects that are allowed in the expression (see below for param objects maintenance functions). +The `lxflags` parameter is a bitmask that provides flags which will be passed to the lexer. These flags alter the manner in which the input string is tokenized. For information about these flags, see [`mlxOpenSession()`](#mlxopensession). -### D. Expression (EXP) - Expression Trees -The expression (EXP) module is used for compiling, evaluating, reverse- evaluating, and passing parameters to expression strings. The expression strings are compiled and stored in an expression tree structure. +The `cmpflags` parameter is a bitmask that provides flags which will be passed to the expression compiler. It can contain the following values: -Expressions can be stand-alone expression trees, or they can take parameter objects. A parameter object is an open object (from objOpen()) whose values (attributes) are referenced within the expression string. By using such parameter objects, one expression can be compiled and then evaluated for many different objects with diverse attribute values. +| Value | Description +| -------------------- | ------------ +| `EXPR_CMP_ASCDESC` | Recognize `asc`/`desc` following a value as flags to indicate sort order. +| `EXPR_CMP_OUTERJOIN` | Recognize the `*=` and `=*` syntax for left and right outer joins. +| `EXPR_CMP_WATCHLIST` | A list (`"value,value,value"`) is expected first in the expression. +| `EXPR_CMP_LATEBIND` | Allow late object-name binding. +| `EXPR_CMP_RUNSERVER` | Compile as a `runserver` expression (for dynamic binding). +| `EXPR_CMP_RUNCLIENT` | Compile as a `runclient` expression (for client-side binding). +| `EXPR_CMP_REVERSE` | Lookup names in the reverse order. + +### expCompileExpressionFromLxs() +```c +pExpression expCompileExpressionFromLxs(pLxSession s, pParamObjects objlist, int cmpflags); +``` +This function is similar to [`expCompileExpression()`](#expcompileexpression), excpet that it compiles from a provided lexer session instead of from a string. -Expression evaluation results in the top-level expression tree node having the final value of the expression, which may be NULL, and may be an integer, string, datetime, money, or double data type. For example, the final value of +### expPodToExpression() +```c +pExpression expPodToExpression(pObjData pod, int type, pExpression provided_exp) +``` +This function builds an expression node from a single piece of data, passed using the `pObjData` of the given datatype. This function can be used to initialize a provided expression (`provided_exp`), or it will allocate a new one if none is provided (aka. `provided_exp` is `NULL`). - :myobject:oneattribute == 'yes' +For example, the following code creates an expression representing the integer 1. +```c +int value = 1; +pExpression exp = expPodToExpression(POD(value), DATA_T_INTEGER, NULL); +``` -would be integer 1 (true) if the attribute's value is indeed 'yes'. +This function returns a pointer to the expression if successful, or `NULL` if an error occurs. -Reverse expression evaluation takes a given final value and attempts to assign values to the parameter object attributes based on the structure of the expression tree. It is akin to 'solving for X' in algebraic work, but isn't nearly that 'smart'. For example, with the previous expression, if the final value was set to 1 (true), then an objSetAttrValue() function would be issued to set myobject's 'oneattribute' to 'yes'. Trying this with a final value of 0 (false) would result in no assignment to the attribute, since there would be no way of determining the proper value for that attribute (anything other than 'yes' would work). +- 📖 **Note**: There is also a `expPtodToExpression()` function for working with the `Ptod` (pointer to object data) struct. -Reverse evaluation is typically very useful in updateable joins and views. +### expExpressionToPod() +```c +int expExpressionToPod(pExpression this, int type, pObjData pod); +``` +This function reverses the functionality of [`expPodToExpression()`](#exppodtoexpression) to instead read data from an evaluated expression. Be careful, this does not evaluate the expression if it is not already evaluated. This function returns 0 if successful, 1 if the expression is NULL, or -1 if an error occurs. -Here are the basic expression functions: +- 📖 **Note**: The source code for this function can be a useful reference when interacting with expression structures, such as when implementing the c code for an exp_function. -#### pExpression expCompileExpression(char* text, pParamObjects objlist, int lxflags, int cmpflags) -This function compiles a textual expression into an expression tree. The 'objlist' lists the parameter objects that are allowed in the expression (see below for param objects maintenance functions). +- 📖 **Note**: There is also a `expExpressionToPtod()` function for working with the `Ptod` (pointer to object data) struct. -The 'lxflags' parameter gives a set of lexical analyzer flags for the compilation. These flags alter the manner in which the input string is tokenized. A bitmask; possible values are: +### expDuplicateExpression() +```c +pExpression expDuplicateExpression(pExpression this); +``` +This function creates a recursive deep copy of the expression and associated expression tree, returning a pointer to this new copy if successful and `NULL` if an error occurs. -| Value | Description -| ---------------- | ------------ -| MLX_F_ICASEK | automatically convert all keywords (non-quoted strings) to lowercase. -| MLX_F_POUNDCOMM | allow comment lines that begin with a # sign. -| MLX_F_CCOMM | allow c-style comments /* */ -| MLX_F_CPPCOMM | allow c-plus-plus comments // -| MLX_F_SEMICOMM | allow semicolon comments ;this is a comment -| MLX_F_DASHCOMM | allow double-dash comments --this is a comment -| MLX_F_DASHKW | keywords can include the dash '-'. Otherwise, the keyword is treated as two keywords with a minus sign between them. -| MLX_F_FILENAMES | Treat a non-quoted string beginning with a slash '/' or dot-slash './' as a filename, and allow slashes and dots in the string without quotes needed. -| MLX_F_ICASER | automatically convert all reserved words to lowercase. The use of this flag is highly recommended, and in some cases, required. -| MLX_F_ICASE | same as MLX_F_ICASER | MLX_F_ICASEK. +### expIsConstant() +```c +int expIsConstant(pExpression this); +``` +This function returns a truthy value if the provided expression is of a type that is always the same, such as an integer, string, double, etc. Otherwise, it returns a falsy value. -The 'cmpflags' is a bitmask parameter controlling the compilation of the expression. It can contain the following values: +### expEvalTree() +```c +int expEvalTree(pExpression this, pParamObjects objlist); +``` +This function evaluates the expression using the provided list of parameter objects. It returns 0 if successful or 1 if the result is `NULL`, and -1 if an error occurs. -| Value | Description -| ------------------- | ------------ -| EXPR_CMP_WATCHLIST | A list "value,value,value" is expected first in the expression. -| EXPR_CMP_ASCDESC | Recognize 'asc' and 'desc' following a value as flags to indicate sort order. -| EXPR_CMP_OUTERJOIN | Recognize the *= and =* syntax as outer joins. +### expCreateParamList() +```c +pParamObjects expCreateParamList(); +``` +This function allocates and returns a new parameter object list containing no parameters, or returns `NULL` if an error occurs. -#### expFreeExpression(pExpression this) -Frees an expression tree. +### expFreeParamList() +```c +int expFreeParamList(pParamObjects this); +``` +This function frees a parameter object list, returning 0 if successful and -1 if an error occurs. -#### int expEvalTree(pExpression this, pParamObjects objlist) -Evaluates an expression against a list of parameter objects. If the evaluation is successful, returns 0 or 1, otherwise -1. +### expAddParamToList() +```c +int expAddParamToList(pParamObjects this, char* name, pObject obj, int flags); +``` +This function adds a parameter to the parameter object list. The `obj` pointer may be left `NULL` during the expCompileExpression state of operation but must be set to a value before expEvalTree is called. Otherwise the attributes that reference that parameter object will result in `NULL` values in the expression. (Although this _technically_ is not an error, it's usually not intended behavior). Flags can be `EXPR_O_CURRENT` if the object is to be marked as the current one, or `EXPR_O_PARENT` if it is to be marked as the parent object. Current and Parent objects can be referenced in an expression like this: + +``` +:currentobjattr +::parentobjattr +``` -#### pParamObjects expCreateParamList() -Allocates a new parameter object list, with no parameters. +### expModifyParam() +```c +int expModifyParam(pParamObjects this, char* name, pObject replace_obj); +``` +This function is used to update a parameter object with a new open pObject, possibly one returned from `objOpen()` or `objQueryFetch()`. This function returns 0 if successful and -1 if an error occurs. -#### int expFreeParamList(pParamObjects this) -Frees a parameter object list. +### expRemoveParamFromList() +```c +int expRemoveParamFromList(pParamObjects this, char* name); +``` +This function removes a parameter object from the list, returning 0 if successful and -1 if an error occurs. -#### int expAddParamToList(pParamObjects this, char* name, pObject obj, int flags) -Adds a parameter to the parameter object list. The 'obj' pointer may be left NULL during the expCompileExpression state of operation but must be set to a value before expEvalTree is called. Otherwise the attributes that reference that parameter object will result in NULL values in the expression (it's technically not an error). Flags can be EXPR_O_CURRENT if the object is to be marked as the current one, or EXPR_O_PARENT if it is to be marked as the parent object. Current and Parent objects can be referenced in an expression like this: +- 📖 **Note**: There is also a `expRemoveParamFromListById()` function. - :currentobjattr - ::parentobjattr +### expSetParamFunctions() +```c +int expSetParamFunctions(pParamObjects this, char* name, int (*type_fn)(), int (*get_fn)(), int (*set_fn)()); +``` +This function sets the param accessor functions used to access params on a specific name. Some example function signatures for the `type_fn()`, `get_fn()`, and `set_fn()` are provided below: -and is thus a shortcut to typing the full object name. +```c +static int ci_GetParamType(void* v, char* attr_name); +static int ci_GetParamValue(void* v, char* attr_name, int datatype, pObjData val); +static int ci_SetParamValue(void* v, char* attr_name, int datatype, pObjData val); +``` -#### int expModifyParam(pParamObjects this, char* name, pObject replace_obj) -This function is used to update a parameter object with a new open pObject returned from objOpen or objQueryFetch. +- `v : void*` is the object provided in `expAddParamToList()` (or a similar function). +- `attr_name : char*` is the string name for the requested attribute. +- `datatype : int` is the data type for the requested attribute. +- `val : pObjectData` is either a buffer in which to store the requested data (`ci_GetParamValue()`) or a buffer containing data that will be copied to the parameter `ci_SetParamValue()`. -#### int expRemoveParamFromList(pParamObjects this, char* name) -This function removes a parameter object from the list. +All three of these functions return 0 for success, 1 if the attribute is `NULL`, or -1 if an error occurs. The `expSetParamFunctions()` function returns 0 if the functions were set successfully, or -1 if an error occurs. -#### int expReverseEvalTree(pExpression tree, pParamObjects objlist) +### expReverseEvalTree() +```c +int expReverseEvalTree(pExpression tree, pParamObjects objlist)l +``` This function reverse-evaluates a tree. The results of an expression evaluation can be accessed by examining the @@ -807,228 +1250,259 @@ top-level tree node. The following properties are useful: There are several other EXP functions used to deal with aggregates and a few other obscure features as well. Aggregates are mostly handled internally by Centrallix so further explanation should not be necessary here. -### E. MTSession (MSS) - Basic Session Management -The next utility module to be described here is the mtsession module (MSS). This module is used for session authentication, error reporting, and for storing session-wide variables such as the currently used date format, current username, and current password (for issuing a login request to a remote server). Care should be taken in the use of Centrallix that its coredump files are NOT in a world-readable location, as the password will be visible in the core file (or just ulimit the core file size to 0). - -#### char* mssUserName() -This function returns the current user name. - -#### char* mssPassword() -This function returns the password used to login to the Centrallix - -#### int mssSetParam(char* paramname, char* param) -This function sets a session parameter. The parameter MUST be a string value. - -#### char* mssGetParam(char* paramname) -Returns the value of a session parameter. Common ones are: - -- dfmt - current date format. -- mfmt - current money format. -- textsize - current max text size from a read of an object's content via objGetAttrValue(obj, "objcontent", POD(&str)) - -#### int mssError(int clr, char* module, char* message, ...) -Formats and caches an error message for return to the user. If 'clr' is set to 1, the assumption is that the error was JUST discovered and no other module has had reason to do an mssError on the current problem. Setting 'clr' to 1 clears all error messages from the current error message list and adds the current message. - -'module' is a two-to-five letter abbreviation of the module reporting the error. Typically it is all upper-case. - -'message' is a string for the error message. As this function will accept a variable-length argument list, the strings '%d' and '%s' can be included in 'message', and will be substituted with the appropriate integer or string arguments, in a similar way to how printf() works. - -#### int mssErrorErrno(int clr, char* module, char* message, ...) -Works much the same way as mssError, except checks the current value of 'errno' and includes a description of any error stored there. Used primarily when a system call was at fault for an error occurring. -Errors that occur inside a session context are normally stored up and not printed until other MSS module routines are called to fetch those errors. Errors occurring outside a session context (such as in Centrallix's network listener) are printed to Centrallix's standard output immediately. +## V Path Handling Functions +The OSML provides a set of utility functions that make it easier to handle path structs when writing drivers. Most of them are named `obj_internal_XxxYyy()` or similar. -These mssError routines need not be called at every function nesting level when an error happens. For example, if the expression compiler returns -1 indicating that a compilation error occurred, it probably has set one or more error messages in the error list. The calling function only needs to provide context information (e.g. _what_ expression failed compilation?) so that the user has enough information to locate the error. And once the user is told the full context of the expression compilation error, no more information need be returned. - -Another example of this is the memory manager, which sets an error message indicating when an nmMalloc() failed. The user probably does not care what kind of structure failed allocation -- he/she only needs to know that the hardware ran out of resources. Thus, upon receiving a NULL from nmMalloc, in most cases another mssError need not be issued. - -The mssError() routines do not cause the calling function to return. The function must still clean up after itself and return an appropriate value (like -1 or NULL) to indicate failure. - -### F. OSML Utility Functions -The OSML provides a set of utility functions that make it easier to write -drivers. Most of them are named obj_internal_XxxYyy or similar. - -#### char* obj_internal_PathPart(pPathname path, int start, int length) -The Pathname structure breaks down a pathname into path elements, which are text strings separated by the directory separator '/'. This function takes the given Pathname structure, and returns the number of path elements requested. For instance, if you have a path: - - /apps/kardia/data/Kardia_DB/p_partner/rows/1 - -that path would be stored internally in Centrallix as: - - ./apps/kardia/data/Kardia_DB/p_partner/rows/1 - -To just return "Kardia_DB/p_partner", you could call: +### obj_internal_PathPart() +```c +char* obj_internal_PathPart(pPathname path, int start, int length); +``` +The Pathname structure breaks down a pathname into path elements, which are text strings separated by the directory separator `'/'`. This function takes the given Pathname structure and returns the number of path elements requested (using `length`) after skipping to the `start`th element (where element 0 is the starting `.` that begins any Centrallix path). - obj_internal_PathPart(pathstruct, 4, 2); +For example, given the path: +```bash +/apps/kardia/data/Kardia_DB/p_partner/rows/1 +``` +Centrallix stores the path internally as the following (see [Parsing Path Contents](#parsing-path-contents) and [Parameters](#parameters) above): +```bash +./apps/kardia/data/Kardia_DB/p_partner/rows/1 +``` +Thus, calling `obj_internal_PathPart(pathstruct, 4, 2);` will return `"Kardia_DB/p_partner"` because the `.` is the 0th element, making `Kardia_DB` the 4th element, and we have requested two elements. -Note that return values from obj_internal_PathPart are only valid until the next call to PathPart on the given pathname structure. +- 📖 **Note**: The values returned from `obj_internal_PathPart()` use an internal buffer, so they are only valid until the next call to a PathPart function on the given pathname structure. -#### int obj_internal_AddToPath(pPathname path, char* new_element) +### obj_internal_AddToPath() +```c +int obj_internal_AddToPath(pPathname path, char* new_element); +``` This function lengthens the path by one element, adding new_element on to the end of the path. This function is frequently useful for drivers in the QueryFetch routine where the new child object needs to be appended onto the end of the given path. -This function returns < 0 on failure, or the index of the new element in the path on success. - -#### int obj_internal_CopyPath(pPathname dest, pPathname src) -Copies a pathname structure. - -#### void obj_internal_FreePathStruct(pPathname path) -Frees a pathname structure. - -## VI Network Connection Functionality -Sometimes a driver will need to initiate a network connection. This can be done via the MTASK module, which provides simple and easy TCP/IP connectivity. +This function returns the index of the new element in the path on success, or a value less than 0 on failure. -### pFile netConnectTCP(char* host_name, char* service_name, int flags) -This function connects to a server. The host name or ascii string for its ip address is in 'host_name'. The name of the service (from /etc/services) or its numeric representation in a string is the 'service_name'. Flags can normally be left 0. - -### int netCloseTCP(pFile net_filedesc, int linger_msec, int flags) -This function closes a network connection, and optionally waits up to 'linger_msec' milliseconds (1/1000 seconds) for any data written to the connection to make it to the other end before performing the close. If linger_msec is set to 0, the connection is aborted (reset). The linger time can be set to 1000 msec or so if no writes were performed on the connection prior to the close. If a large amount of writes were performed immediately perior to the close, offering to linger for a few more seconds (perhaps 5 or 10, 5000 or 10000 msec), might be a good idea. - -### int fdWrite(pFile filedesc, char* buffer, int length, int offset, int flags) -This function writes data to a file descriptor, from a given buffer and length, and to an optional seek offset and with some optional flags. Flags can be the following: +### obj_internal_CopyPath() +```c +int obj_internal_CopyPath(pPathname dest, pPathname src); +``` +This function copies a pathname structure from the `src` to the `dest`, returning 0 if successful or -1 if an error occurs. -- FD_U_NOBLOCK - If the write can't be performed immediately, don't perform it at all. -- FD_U_SEEK - The 'offset' value is valid. Seek to it before writing. Not valid for network connections. -- FD_U_PACKET - ALL of the data of 'length' in 'buffer' must be written. Normal write() semantics in UNIX state that not all data has to be written, and the number of bytes actually written is returned. Setting this flag makes sure all data is really written before returning. +### obj_internal_FreePathStruct() +```c +void obj_internal_FreePathStruct(pPathname path); +``` +This function frees a pathname structure. -#### int fdRead(pFile filedesc, char* buffer, int maxlen, int offset, int flags) -The complement to the above routine. Takes the same flags as the above routine, except FD_U_PACKET means that all of 'maxlen' bytes must be read before returning. This is good for reading a packet that is known to be exactly 'maxlen' bytes long, but which might be broken up into fragments by the network (TCP/IP has a maximum frame transmission size of about 1450 bytes). -## VII Parsing Data -Centrallix provides a lexical analyzer library that can be used for parsing many types of data. This module, mtlexer (MLX) can either parse data from a pFile descriptor or from a string value. This lexical analyzer is used by the expression compiler as well. It is basically a very fancy string tokenizer. -### pLxSession mlxOpenSession(pFile fd, int flags) -This function opens a lexer session from a file source. See the 'expression' module description previous in this document for more information on the flags. Some flags of use here but not mentioned in that section are: +## VI Parsing Data +The mtlexer (MLX) module is a lexical analyzer library provided by Centrallix for parsing many types of data. It can parse data from either a `pFile` descriptor or from a string value. This lexical analyzer is also used by the [expression compiler](#viii-module-expression). In simple terms, it's a very fancy string tokenizer. -| Flag | Description -| ------------------- | ------------ -| MLX_F_EOL | Return end-of-line as a token. Otherwise, the end of a line is just considered whitespace. -| MLX_F_EOF | Return end-of-file as a token. Otherwise, if end of file is reached it is an error. -| MLX_F_IFSONLY | Only return string values separated by tabs, spaces, newlines, and carriage returns. For example, normally the brace in "this{brace" is a token and that string will result in three tokens, but in IFSONLY mode it is just one token. -| MLX_F_NODISCARD | This flag indicates to the lexer that the calling function expects to be able to read data normally using fdRead() or another lexer session after the last token is read and the session is closed. The lexer will then attempt to "unread" bytes that it buffered during the lexical analysis process (it does fdRead() operations in 2k or so chunks). If this flag is not specified, up to 2k of information after the last token will be discarded and further fdRead()s on the file descriptor will start at an undefined place in the file. -| MLX_F_ALLOWNUL | Allow NUL characters ('\0') in the input stream. If this flag is not set, then NUL characters result in an error condition. This prevents unwary callers from mis-reading a token returned by mlxStringVal if the token contains a NUL. If ALLOWNUL is turned on, then the caller must ensure that it is safely handling values with NULs. +### mlxOpenSession() +```c +pLxSession mlxOpenSession(pFile fd, int flags); +``` +This function opens a lexer session, using a file descripter as its source. Some of the more useful values for `flags` include: + +| Value | Description +| ----------------- | ------------ +| `MLX_F_ICASEK` | Automatically convert all keywords (non-quoted strings) to lowercase. +| `MLX_F_ICASER` | Automatically convert all reserved words to lowercase. This flag is highly recommended, and in some cases, required. +| `MLX_F_ICASE` | Same as MLX_F_ICASER | MLX_F_ICASEK. +| `MLX_F_POUNDCOMM` | Respect # comment at the start of the line (`#comment`). +| `MLX_F_CCOMM` | Respect c-style comments (`/*comment*/`). +| `MLX_F_CPPCOMM` | Respect c-plus-plus comments (`//comment`). +| `MLX_F_SEMICOMM` | Respect semicolon comments (`;comment`). +| `MLX_F_DASHCOMM` | Respect double-dash comments (`--comment`). +| `MLX_F_EOL` | Return end-of-line as a token. Otherwise, this is considered whitespace. +| `MLX_F_EOF` | Return end-of-file as a token. Otherwise, reaching end of file is an error. +| `MLX_F_ALLOWNUL` | Allow null characters (`'\0'`) in the input stream, which otherwise cause an error. If this flag is set, the caller must ensure that null characters are handled safely. +| `MLX_F_IFSONLY` | Only return string values separated by tabs, spaces, newlines, and carriage returns. For example, normally the brace in `"this{brace"` is a token and that string will result in three tokens, but in `IFSONLY` mode it is just one token. +| `MLX_F_DASHKW` | Keywords can include the dash (`-`). Otherwise, the keyword is treated as two keywords with a minus sign between them. +| `MLX_F_FILENAMES` | Treat a non-quoted string beginning with a slash (`/`) or dot-slash (`./`) as a filename, and allow slashes and dots in the string without requiring quotes. +| `MLX_F_NODISCARD` | Attempt to unread unused buffered data rather than discarding it, allowing the calling function to continue reading with `fdRead()` or another lexer session after the last token is read and the session is closed. The lexer `fdRead()`s in 2k or so chunks for performance, and normally discards this data when done, causing future file decriptors to start at an undefined file location. +| `MLX_F_DBLBRACE` | Treat `{{` and `}}` as double brace tokens, not two single brace tokens. +| `MLX_F_NOUNESC` | Do not remove escapes in strings. +| `MLX_F_SSTRING` | Differentiate between strings values using `""` and `''`. + +This function returns a pointer to the new lexer session if successful, or `NULL` if an error occurs. + +### mlxStringSession() +```c +pLxSession mlxStringSession(char* str, int flags); +``` +This function opens a lexer session, using a text string as its source. The flags are the same as [`mlxOpenSession()`](#mlxopensession) above, except that `MLX_F_NODISCARD` has no effect. -### pLxSession mlxStringSession(char* str, int flags) -This function opens a lexer session from a text string. Same as the above function except that the flag MLX_F_NODISCARD makes no sense for the string. +This function returns a pointer to the new lexer session if successful, or `NULL` if an error occurs. -### int mlxCloseSession(pLxSession this) -Closes a lexer session. +### mlxCloseSession() +```c +int mlxCloseSession(pLxSession this); +``` +This function closes a lexer session, freeing all associated data. This does not also close the file descriptor used to open the lexer session, as this is assumed to be managed by the caller. This function returns 0 if successful, and -1 if an error occurs. -### int mlxNextToken(pLxSession this) +### mlxNextToken() +```c +int mlxNextToken(pLxSession this); +``` Returns the type of the next token in the token stream. Valid token types are: -| Token | Meaning | -|----------------------|---------------------------------------| -| MLX_TOK_STRING | String value, as in a "string". | -| MLX_TOK_INTEGER | Integer value. | -| MLX_TOK_EQUALS | = | -| MLX_TOK_OPENBRACE | { | -| MLX_TOK_CLOSEBRACE | } | -| MLX_TOK_ERROR | An error has occurred. | -| MLX_TOK_KEYWORD | An unquoted string. | -| MLX_TOK_COMMA | , | -| MLX_TOK_EOL | End-of-line. | -| MLX_TOK_EOF | End-of-file reached. | -| MLX_TOK_COMPARE | <> != < > >= <= == | -| MLX_TOK_COLON | : | -| MLX_TOK_OPENPAREN | ( | -| MLX_TOK_CLOSEPAREN | ) | -| MLX_TOK_SLASH | / | -| MLX_TOK_PERIOD | . | -| MLX_TOK_PLUS | + | -| MLX_TOK_ASTERISK | * | -| MLX_TOK_RESERVEDWD | Reserved word (special keyword). | -| MLX_TOK_FILENAME | Unquoted string starting with / or ./ | -| MLX_TOK_DOUBLE | Double precision floating point. | -| MLX_TOK_DOLLAR | $ | -| MLX_TOK_MINUS | - | - -### char* mlxStringVal(pLxSession this, int* alloc) -Gets the string value of the current token. If 'alloc' is NULL, only the first 255 bytes of the string will be returned, and the rest will be discarded. If 'alloc' is non-null and set to 0, the routine will set 'alloc' to 1 if it needed to allocate memory for a very long string, otherwise leave it at 0. If 'alloc' is non- null and set to 1, this routine will ALWAYS allocate memory for the string, whether long or short. - -This routine works no matter what the token type, and returns a string representation of the token if not MLX_TOK_STRING. +| Token | Required Flag | Meaning | +|-------------------------|-------------------|---------------------------------------------| +| `MLX_TOK_BEGIN` | - | Beginning of the input stream. | +| `MLX_TOK_STRING` | - | String value, e.g. `"string"`. | +| `MLX_TOK_INTEGER` | - | Integer value, e.g. `42`. | +| `MLX_TOK_EQUALS` | - | `=` | +| `MLX_TOK_OPENBRACE` | - | `{` | +| `MLX_TOK_CLOSEBRACE` | - | `}` | +| `MLX_TOK_ERROR` | - | An error has occurred. | +| `MLX_TOK_KEYWORD` | - | A keyword (unquoted string). | +| `MLX_TOK_COMMA` | - | `,` | +| `MLX_TOK_EOL` | `MLX_F_EOL` | End-of-line. | +| `MLX_TOK_EOF` | `MLX_F_EOF` | End-of-file reached. | +| `MLX_TOK_COMPARE` | - | `<>` `!=` `<` `>` `>=` `<=` `==` | +| `MLX_TOK_COLON` | - | `:` | +| `MLX_TOK_OPENPAREN` | - | `(` | +| `MLX_TOK_CLOSEPAREN` | - | `)` | +| `MLX_TOK_SLASH` | - | `/` | +| `MLX_TOK_PERIOD` | - | `.` | +| `MLX_TOK_PLUS` | - | `+` | +| `MLX_TOK_ASTERISK` | - | `*` | +| `MLX_TOK_RESERVEDWD` | - | Reserved word (special keyword). | +| `MLX_TOK_FILENAME` | `MLX_F_FILENAMES` | Unquoted string starting with / or ./ | +| `MLX_TOK_DOUBLE` | - | Double precision floating point. | +| `MLX_TOK_DOLLAR` | - | `$` | +| `MLX_TOK_MINUS` | - | `-` | +| `MLX_TOK_DBLOPENBRACE` | `MLX_F_DBLBRACE` | `{{` | +| `MLX_TOK_DBLCLOSEBRACE` | `MLX_F_DBLBRACE` | `}}` | +| `MLX_TOK_SYMBOL` | - | `+-=.,<>` etc. | +| `MLX_TOK_SEMICOLON` | - | `;` | +| `MLX_TOK_SSTRING` | `MLX_F_SSTRING` | Single quote string value, e.g. `'string'`. | +| `MLX_TOK_POUND` | - | `#` | +| `MLX_TOK_MAX` | - | Max token value (internal). | + +### mlxStringVal() +```c +char* mlxStringVal(pLxSession this, int* alloc); +``` +This function gets the string value of the current token. If `alloc` is `NULL`, only the first 255 bytes of the string will be returned, and the rest will be discarded. If `alloc` is non-null and set to 0, the routine will set `alloc` to 1 if it needed to allocate memory for a very long string, otherwise leave it as 0. If `alloc` is non-null and set to 1, this routine will _always allocate memory for the string, whether long or short. -This routine MAY NOT be called twice for the same token. +This routine works no matter what the token type, and returns a string representation of the token if not `MLX_TOK_STRING`. -Note that if MLX_F_ALLOWNUL is enabled, there is no way to tell from the return value of mlxStringVal() whether a NUL in the returned string is the end-of-string terminator, or whether it existed in the input data stream. Thus, this function should not be called when MLX_F_ALLOWNUL is being used. Use mlxCopyToken instead on MLX_TOK_STRING's, as it gives a definitive answer on the token length. (mlxStringVal can still be used on keywords since those will never contain a NUL, by definition). +This routine MAY NOT be called twice for the same token. -### int mlxIntVal(pLxSession this) -Returns the integer value of MLX_TOK_INTEGER tokens, or returns the compare type for MLX_TOK_COMPARE tokens. The compare type is a bitmask of the following flags: +- ⚠️ **Warning**: This function should not be called when `MLX_F_ALLOWNUL` is being used because it may return a null character, giving the caller no way to know whether it is the null-terminator or it simply existed in the input data stream. In this case, `mlxCopyToken()` should be used instead, as it gives a definitive answer on the token length. (`mlxStringVal()` can still be used on keywords, though, since they never contain a null, by definition). -- MLX_CMP_EQUALS -- MLX_CMP_GREATER -- MLX_CMP_LESS +### mlxIntVal() +```c +int mlxIntVal(pLxSession this); +``` +This function returns the integer value of `MLX_TOK_INTEGER` tokens, or returns the compare type for `MLX_TOK_COMPARE` tokens. The compare type is a bitmask of the `MLX_CMP_EQUALS`, `MLX_CMP_GREATER`, and `MLX_CMP_LESS` flags. For `MLX_TOK_DOUBLE` tokens, this function returns the whole part. -For MLX_TOK_DOUBLE tokens, returns the whole part. +### mlxDoubleVal() +```c +double mlxDoubleVal(pLxSession this); +``` +This function returns a double precision floating point number for either `MLX_TOK_INTEGER` or `MLX_TOK_DOUBLE` values. -### double mlxDoubleVal(pLxSession this) -Returns a double precision floating point number for either MLX_TOK_INTEGER or MLX_TOK_DOUBLE values. +### mlxCopyToken() +```c +int mlxCopyToken(pLxSession this, char* buffer, int maxlen); +``` +This function copies the contents of the current token to a string buffer, up to `maxlen` characters. It should be used instead of `mlxStringVal()`, _especially_ where null characters may be involved. This function returns the number of characters copied on success, or -1 on failure, and it can be called multiple times if more data needs to be read from the same token. -### int mlxCopyToken(pLxSession this, char* buffer, int maxlen) -For use instead of mlxStringVal, copies the contents of the current token to a string buffer, up to 'maxlen' characters. Returns the number of characters copied. This function can be called multiple times if more data needs to be read from the token. +### mlxHoldToken() +```c +int mlxHoldToken(pLxSession this); +``` +This function "puts back" a token, causing the next `mlxNextToken()` to return the current token again. This is useful when a function realizes after `mlxNextToken()` that it has read one-too-many. This function returns 0 on success, or -1 if an error occurs. -### int mlxHoldToken(pLxSession this) -Basically causes the next mlxNextToken() to do nothing but return the current token again. Used for when a routine realizes after mlxNextToken() that it has read one-too-many tokens and needs to 'put a token back'. +### mlxSetOptions() +```c +int mlxSetOptions(pLxSession this, int options); +``` +This function sets the options (`MLX_F_xxx`) for an active lexer session. The options that are valid here are `MLX_F_ICASE` and `MLX_F_IFSONLY`. This function returns 0 if successful, or -1 if an error occurs. -### int mlxSetOptions(pLxSession this, int options) -Sets options (MLX_F_xxx) in the middle of a lexer session. The options that are valid here are MLX_F_ICASE and MLX_F_IFSONLY. +### mlxUnsetOptions() +```c +int mlxUnsetOptions(pLxSession this, int options); +``` +Clears options set by [`mlxSetOptions()`](#mlxsetoptions). This function returns 0 if successful, or -1 if an error occurs. -### int mlxUnsetOptions(pLxSession this, int options) -Clears options (see above). +### mlxSetReservedWords() +```c +int mlxSetReservedWords(pLxSession this, char** res_words); +``` +This function sets the lexer to return the list of `res_words` as `MLX_TOK_RESERVEDWD` tokens instead of `MLX_TOK_KEYWORD` tokens. The list of words should be an array of character strings, with the last string in the list being `NULL`. This function returns 0 if successful, or -1 if an error occurs. -### int mlxSetReservedWords(pLxSession this, char** res_words) -Informs the lexer that a certain list of words are to be returned as MLX_TOK_RESERVEDWD instead of MLX_TOK_KEYWORD. The list of words should be an array of character strings, with the last string in the list NULL. mtlexer does not copy this list, so it must be static or have a lifetime greater than that of the lexer session. +- ⚠️ **Warning**: `mtlexer` does not copy this list! Ensure that it has a lifetime longer than that of the lexer session. -### int mlxNoteError(pLxSession this) -Generates an mssError() message of this form: +### mlxNoteError() +```c +int mlxNoteError(pLxSession this); +``` +This function generates an `mssError()` message of the form: +```bash +MLX: Error near '' +``` - MLX: Error near '' +- 📖 **Note**: The calling routine may have detected the error long after the actual place where it occurred. The MLX module just tries to come close :) -NOTE: the calling routine may have detected the error long after the actual place where it occurred. The MLX module just tries to come close :) +### mlxNotePosition() +```c +int mlxNotePosition(pLxSession this); +``` +This function generates an mssError() message of this form: +```bash +MLX: Error at line ## +``` -### int mlxNotePosition(pLxSession this) -Generates an mssError() message of this form: +- 📖 **Note**: If using a `StringSession` instead of a `pFile` session, this may not be accurate, as the string may have come from the middle of a file somewhere. Use with care. - MLX: Error at line ## -NOTE: If using a StringSession instead of a pFile session, this may not be accurate, as the string may have come from the middle of a file somewhere. Use with care. -## VIII Objectsystem Driver Testing -This section contains a list of things that can be done to test an objectsystem driver, to make sure that it is performing all basic operations normally. We will use the test_obj command line interface for testing here. For more information on test_obj commands, see the online Centrallix documentation at: http://www.centrallix.net/docs/docs.php +## VII Driver Testing +This section contains a list of things that can be done to test an objectsystem driver and ensure that it preforms all basic operations correctly, using the [test_obj command line interface](http://www.centrallix.net/docs/docs.php). -Testing for memory leaks for each of these items is strongly encouraged, by watching memory utilization using nmDeltas() during repetitive operations (e.g., nmDeltas(), open, close, nmDeltas(), open, close, and then nmDeltas() again). +It is strongly recommended to test for invalid reads, writes, frees, and memory leaks during each of these by watching memory utilization using nmDeltas() during repetitive operations (e.g., nmDeltas(), open, close, nmDeltas(), open, close, and then nmDeltas() again). -Testing for more general bugs using the "valgrind" tool is also strongly encouraged, via running these various tests in test_obj while test_obj is running under valgrind. +Testing for more general memory bugs using the "valgrind" tool is also strongly encouraged, via running these various tests in test_obj while test_obj is running under valgrind. To properly test under Valgrind, centrallix-lib must be compiled with the configure flag `--enable-valgrind-integration` turned on. This disables `nmMalloc()` block caching (so that valgrind can properly detect memory leaks and free memory reuse), and it provides better information to valgrind's analyzer regarding MTASK threads. -Magic number checking on data structures is encouraged. To use magic number checking, determine a magic number value for each of your structures, and code that as a constant #define in your code. The magic number should be a 32-bit integer, possibly with 0x00 in either the 2nd or 3rd byte of the integer. Many existing magic number values can be found in the file "magic.h" in centrallix-lib. The 32-bit integer is placed as the first element of the structure, and set using the macro SETMAGIC(), and then tested using the macros ASSERTMAGIC(), and less commonly, ASSERTNOTMAGIC(). ASSERTMAGIC() should be used any time a pointer to the structure crosses an interface boundary. It also may be used at the entry to internal methods/functions, or when traversing linked lists of data structures, or when retrieving data structures from an array. +Magic number checking on data structures is encouraged. To use magic number checking, determine a magic number value for each of your structures, and add a #define for that constant in your code. The magic number should be a 32-bit integer, possibly with 0x00 in either the 2nd or 3rd byte of the integer. Many existing magic number values can be found in [magic.h](../centrallix-lib/include/magic.h). The 32-bit integer is placed as the first element of the structure, and set using the `SETMAGIC()` macro, then tested using the macros `ASSERTMAGIC()` macro or, less commonly, `ASSERTNOTMAGIC()`. Common times to `ASSERTMAGIC()` include: +- Any time a pointer to the structure crosses an interface boundary. +- At the entry to internal methods/functions. +- When traversing linked lists of data structures. +- When retrieving data structures from an array. +- etc. -When used in conjunction with nmMalloc() and nmFree(), ASSERTMAGIC also helps to detect the reuse of already-freed memory, since nmFree() tags the first four bytes of the memory block with the constant MGK_FREEMEM. nmFree() also looks for the constant MGK_FREEMEM in the magic number slot to detect already-freed memory (so do not use that same constant for your own magic numbers). +When used in conjunction with `nmMalloc()` and `nmFree()`, `ASSERTMAGIC` also helps to detect the reuse of already-freed memory, since `nmFree()` tags the first four bytes of the memory block with the constant `MGK_FREEMEM`. `nmFree()` also looks for the constant `MGK_FREEMEM` in the magic number slot to detect already-freed memory. (**DO NOT** use that constant for your own magic numbers!) -To properly test under Valgrind, centrallix-lib must be compiled with the configure flag --enable-valgrind-integration turned on. This disables nmMalloc block caching (so that valgrind can properly detect memory leaks and free memory reuse), and it provides better information to valgrind's analyzer regarding MTASK threads. +The term "**MUST**", as used here, means that the driver will likely cause problems if the functionality is not present. -The term "MUST", as used here, means that the driver will likely cause problems if the functionality is not present. +The term "**SHOULD**" indicates behavior which is desirable, but that might not cause immediate problems if not fully implemented. -The term "SHOULD" indicates behavior which is desirable, but may not cause problems if not fully implemented. +The term "**MAY**" refers to optional, but permissible, behavior. -The term "MAY" refers to optional, but permissible, behavior. -### A. Object opening, closing, creation, and deletion +### A. Opening, closing, creating, and deleting -1. Any object in the driver's subtree, including the node object itself, MUST be able to be opened using objOpen() and then closed using objClose(). Although it does more than just open and close, the "show" command in test_obj can be useful for testing this. +1. Any object in the driver's subtree, including the node object itself, MUST be able to be opened using `xxxOpen()` and then closed using `xxxClose()`. Although it does more than just open and close, the "show" command in test_obj can be useful for testing this. 2. Objects MUST be able to be opened regardless of the location of the node object in the ObjectSystem. For example, don't just test the driver with the node object in the top level directory of the ObjectSystem - also try it in other subdirectories. -3. New objects within the driver's subtree SHOULD be able to be created using objOpen with OBJ_O_CREAT, or using objCreate(). The flags OBJ_O_EXCL and OBJ_O_TRUNC should also be supported, where meaningful. +3. New objects within the driver's subtree SHOULD be able to be created using `xxxOpen()` with `OBJ_O_CREAT`, or using `objCreate()`. The flags `OBJ_O_EXCL` and `OBJ_O_TRUNC` should also be supported, where meaningful. + +4. Where possible, `OBJ_O_AUTONAME` should be supported on object creation. With this, the name of the object will be set to `*` in the pathname structure, and `OBJ_O_CREAT` will also be set. The driver should automatically determine a suitable "name" for the object, and subsequent calls to objGetAttrValue on "name" should return the determined name. A driver MAY choose to return NULL for "name" until after certain object properties have been set and an `xxxCommit()` operation performed. A driver MUST NOT return `*` for the object name unless `*` is truly the name chosen for the object. -4. Where possible, OBJ_O_AUTONAME should be supported on object creation. With this, the name of the object will be set to `*` in the pathname structure, and OBJ_O_CREAT will also be set. The driver should automatically determine a suitable "name" for the object, and subsequent calls to objGetAttrValue on "name" should return the determined name. A driver MAY choose to return NULL for "name" until after certain object properties have been set and an objCommit operation performed. A driver MUST NOT return `*` for the object name unless `*` is truly the name chosen for the object. +5. A driver SHOULD support deletion of any object in its subtree with the exception of the node object itself. Deletion may be done directly with `xxxDelete()`, or on an already-open object using `xxxDeleteObj()`. A driver MAY refuse to delete an object if the object still contains deletable sub-objects. Some objects in the subtree might inherently not be deletable apart from the parent objects of said objects. In those cases, deletion should not succeed. -5. A driver SHOULD support deletion of any object in its subtree with the exception of the node object itself. Deletion may be done directly with objDelete(), or on an already-open object using objDeleteObj(). A driver MAY refuse to delete an object if the object still contains deletable sub-objects. Some objects in the subtree might inherently not be deletable apart from the parent objects of said objects. In those cases, deletion should not succeed. -### B. Object attribute enumeration, getting, and setting. -1. The driver MUST NOT return system attributes (name, inner_type, and so forth) when enumerating with objGetFirst/NextAttr. +### B. Attributes -2. The driver does not need to handle objGetAttrType on the system attributes. The OSML does this. +1. The driver MUST NOT return system attributes (name, inner_type, etc) when enumerating with `xxxGetFirst()`/`xxxNextAttr()`. -3. The driver SHOULD support the attribute last_modification if at all reasonable. Not all objects can have this property however. +2. The driver MAY choose not to handle `xxxGetAttrType` on the system attributes. The OSML handles this. + +3. The driver SHOULD support the attribute `last_modification` if at all reasonable. Not all objects can have this property however. 4. The driver SHOULD support the attribute "annotation" if reasonable to do so. Database drivers should have a configurable "row annotation expression" to auto-generate annotations from existing row content, where reasonable. The driver MAY permit the user to directly set annotation values. The driver MUST return an empty string ("") for any annotation values that are unavailable. @@ -1038,43 +1512,44 @@ The term "MAY" refers to optional, but permissible, behavior. 7. The "show" command in test_obj is a good way to display a list of attributes for an object. -8. Attribute enumeration, retrieval, and modification MUST work equally well on objects returned by objOpen() and objects returned by objQueryFetch(). +8. Attribute enumeration, retrieval, and modification MUST work equally well on objects returned by `xxxOpen()` and objects returned by `xxxQueryFetch()`. -9. If a driver returns an attribute during attribute enumeration, then that attribute MUST return a valid type via objGetAttrType. +9. If a driver returns an attribute during attribute enumeration, then that attribute MUST return a valid type via `xxxGetAttrType`. -10. A driver MUST return -1 and error with a "type mismatch" type of error from objGet/SetAttrValue, if the data type is inappropriate. +10. A driver MUST return -1 and error with a "type mismatch" type of error from `xxxGetAttrValue()`/`xxxSetAttrValue()`, if the data type is inappropriate. 11. A driver MAY choose to perform auto-conversion of data types on certain attributes, but SHOULD NOT perform such auto conversion on a widespread wholesale basis. -12. A driver MAY support the DATA_T_CODE attribute data type. +12. A driver MAY support the `DATA_T_CODE` attribute data type. + +13. Drivers MAY support `DATA_T_INTVEC` and `DATA_T_STRINGVEC`. -13. Drivers MAY support DATA_T_INTVEC and DATA_T_STRINGVEC. +14. Drivers MAY support `xxxAddAttr()` and `xxxOpenAttr()`. -14. Drivers MAY support objAddAttr and objOpenAttr. +15. Drivers MAY support methods on objects. Objects without any methods should be indicated by a `NULL` return value from the method enumeration functions. -15. Drivers MAY support methods on objects. Objects without any methods should be indicated by a NULL return value from the method enumeration functions. +16. When returning attribute values, the value MUST remain valid at least until the next call to `xxxGetAttrValue()`, `xxxSetAttrValue()`, or `xxxGetAttrType()`, or until the object is closed, whichever occurs first. Drivers MUST NOT require the caller to free attribute memory. -16. When returning attribute values, the value MUST remain valid at least until the next call to objGetAttrValue, objSetAttrValue, or objGetAttrType, or until the object is closed, whichever occurs first. Drivers MUST NOT require the caller to free attribute memory. +17. When `xxxSetAttrValue()` is used, drivers MUST NOT depend on the referenced value (in the POD) being valid past the end of the call to `xxxSetAttrValue()`. -17. When objSetAttrValue is used, drivers MUST NOT depend on the referenced value (in the POD) being valid past the end of the call to objSetAttrValue(). -### C. Object querying (for subobjects) +### C. Querying Subobjects -1. If an object cannot support queries for subobjects, the OpenQuery call SHOULD fail. +1. If an object cannot support queries for subobjects, `xxxOpenQuery()` call SHOULD fail. -2. If an object can support the existence of subobjects, but has no subobjects, the OpenQuery should succeed, but calls to QueryFetch MUST return NULL. +2. If an object can support the existence of subobjects, but has no subobjects, the `xxxOpenQuery()` should succeed, but calls to `xxxQueryFetch()` MUST return `NULL`. -3. Objects returned by QueryFetch MUST remain valid even after the query is closed using QueryClose. +3. Objects returned by `xxxQueryFetch()` MUST remain valid even after the query is closed using `xxxQueryClose()`. -4. Objects returned by QueryFetch MUST also be able to be passed to OpenQuery to check for the existence of further subobjects, though the OpenQuery call is permitted to fail as in (C)(1) above. +4. Objects returned by `xxxQueryFetch()` MUST also be able to be passed to `xxxOpenQuery()` to check for the existence of further subobjects, though the `xxxOpenQuery()` call is permitted to fail as in (C)(1) above. -5. Any name returned by objGetAttrValue(name) on a queried subobject MUST be able to be used to open the same object using objOpen(). +5. Any name returned by `xxxGetAttrValue(name)` on a queried subobject MUST be usable to open the same object using `xxxOpen()`. -6. Drivers which connect to resources which are able to perform sorting and/or selection (filtering) of records or objects SHOULD use the OBJ_QY_F_FULLSORT and OBJ_QY_F_FULLQUERY flags (see previous discussion) as well as pass on the sorting and filtering expressions to the remote resource so that resource can do the filtering and/or sorting. +6. Drivers which connect to resources which are able to perform sorting and/or selection (filtering) of records or objects SHOULD use the [`OBJ_QY_F_FULLSORT`](#function-openquery) and [`OBJ_QY_F_FULLQUERY`](#function-openquery) flags. Further, they SHOULD pass on the sorting and filtering expressions to the remote resource so that resource can optimize sorting and/or filtering as needed. -7. If the driver's remote resource can filter and/or sort, but can only do so imperfectly (e.g., the resource cannot handle the potential complexity of all sorting/selection expressions, but can handle parts of them), then OBJ_QY_F_FULLSORT and/or OBJ_QY_F_FULL- QUERY MUST NOT be used. However the remote resource MAY still provide partial sorting and/or selection of data. +7. If the driver's remote resource can filter and/or sort, but can only do so imperfectly (e.g., the resource cannot handle the potential complexity of all sorting/selection expressions, but can handle parts of them), then `OBJ_QY_F_FULLSORT` and/or `OBJ_QY_F_FULL`- QUERY MUST NOT be used. However the remote resource MAY still provide partial sorting and/or selection of data. -8. Drivers SHOULD NOT use OBJ_QY_F_FULLSORT and OBJ_QY_F_FULLQUERY if there is no advantage to letting the resource perform these operations (usually, however, if the resource provides such functionality, there is advantage to letting the resource perform those operations. However, the coding burden to provide the filtering and sorting expressions to the resource, and in the correct format for the resource, may be not worth the work). +8. Drivers SHOULD NOT use `OBJ_QY_F_FULLSORT` and `OBJ_QY_F_FULLQUERY` if there is no advantage to letting the resource perform these operations (usually, however, if the resource provides such functionality, there is advantage to letting the resource perform those operations. However, the coding burden to provide the filtering and sorting expressions to the resource, and in the correct format for the resource, may be not worth the work). 9. Testing of query functionality can be done via test_obj's "query", "csv", and "ls" (or "list") commands. To test for nested querying of objects returned from QueryFetch, a SUBTREE select can be used with the "query" or "csv" commands. diff --git a/centrallix-sysdoc/Prefixes.md b/centrallix-sysdoc/Prefixes.md index b0e12c6b..30e53632 100644 --- a/centrallix-sysdoc/Prefixes.md +++ b/centrallix-sysdoc/Prefixes.md @@ -4,6 +4,7 @@ |---------|--------------------------------------------------------------------- | aud | OSDriver - Linux OSS /dev/dsp audio (plays WAV files on ExecMethod) | bar | BarCode generator module (for prt mgmt) +| cluster | OSDriver - Cluster & search file | dat | OSDriver - Flat data file (CSV/etc) | ev | MTASK internal - event handling | exp | Expression compiler/parser/evaluator diff --git a/centrallix-sysdoc/string_comparison.md b/centrallix-sysdoc/string_comparison.md deleted file mode 100644 index 222e3e6d..00000000 --- a/centrallix-sysdoc/string_comparison.md +++ /dev/null @@ -1,99 +0,0 @@ -# String Comparison -The following sections discuss the two approaches to calculating similarity between two strings. Both approaches use a SQL function to calculate a similarity metric (on a scale of 0 to 1) for two string parameters. - -## Table of Contents -- [String Comparison](#string-comparison) - - [Table of Contents](#table-of-contents) - - [Levenshtein Similarity](#levenshtein-similarity) - - [Levenshtein](#levenshtein) - - [Cosine Similarity](#cosine-similarity) - - [CHAR_SET](#char_set) - - [Frequency Table](#frequency-table) - - [Relative Frequency Table](#relative-frequency-table) - - [TF-IDF](#tf-idf) - - [Dot Product](#dot-product) - - [Magnitude](#magnitude) - - [Similarity](#similarity) - - [Future Implementation](#future-implementation) - - [Inverse Document Frequency (IDF)](#inverse-document-frequency-idf) - -## Levenshtein Similarity -The [Levenshtein](https://en.wikipedia.org/wiki/Levenshtein_distance) distance is defined as the number of insertions, deletions, or substitutions required to make one string exactly like another string. - -### Levenshtein -```c -int exp_fn_levenshtein(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) -``` -Returns the levenshtein edit distance between two strings. - -```c -int exp_fn_fuzzy_compare(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) -``` -Returns a value between 0.0 (complete match) and 1.0 (complete difference) between strings a and b, based on the (levenshtein distance) / (max len of input strings). -Some alterations to the calculation are as follows: -- matching an empty string against anything returns 0.5. -- a string that only required insertions to become the other string has its (lev_dist)/(strlen) value halved before returning -The parameter max_field_width is required, but not used. - -## Cosine Similarity - -The [Cosine Similarity](https://en.wikipedia.org/wiki/Cosine_similarity) function is defined as the dot product of two vectors divided by the product of the magnitude of the two vectors. We use the relative frequency of the individual characters within each term as the vectors in the calculation. The following functions are used to calculate cosine similarity. - -### CHAR_SET -```c -const char *CHAR_SET ... -``` -`CHAR_SET` represents all of the characters that should be considered during the calculation of similarity. `CHAR_SET` can be extended to include additional characters, as necessary. - -### Frequency Table -```c -int exp_fn_i_frequency_table(double *table, char *term) -``` -Helper function for similarity(). Creates a frequency table containing indices corresponding to all characters in `CHAR_SET` (all other characters are ignored). The values in the frequency table will contain the number of times each character appers in `term`. - -The `table` parameter must be allocated prior to calling the function with `nmMalloc()` using `sizeof(x * sizeof(double))`, where `x` is the length of `CHAR_SET`. The function will initialize all `table` values to 0, before calculating the frequency values. - -### Relative Frequency Table -```c -int exp_fn_i_relative_frequency_table(double *frequency_table) -``` -Helper function for similarity(). Converts a frequency table into a relative frequency table, where each value in the `frequency_table` is converted to the percent of occurrence (i.e., frequency divided by the sum of total occurrences). - -The `frequency_table` parameter must have been created using the `exp_fn_i_frequency_table` function above. - -### TF-IDF -```c -int exp_fn_i_tf_idf_table(double *frequency_table) -``` -Helper function for similarity(). Creates a TF x IDF vector from a frequency table, where each value in the resulting table is created by multiplying the relative frequency of each letter by the corresponding coefficient in the IDF array. - -The `frequency_table` parameter must have been created using the `exp_fn_i_frequency_table` function above. - -### Dot Product -```c -int exp_fn_i_dot_product(double *dot_product, double *r_freq_table1, double *r_freq_table2) -``` -Helper function for similarity(). Calculates the dot product of two relative frequency tables (sum of the squared values from each relative frequency table). - -The `dot_product` parameter should be initialized to 0 before calling the function. The table parameters must contain relative frequency tables that are generated from the `exp_fn_i_relative_frequency_table` function. The lengths of both tables must equal the length of `CHAR_SET`. - -### Magnitude -```c -int exp_fn_i_magnitude(double *magnitude, double *r_freq_table) -``` -Helper function for similarity(). Calculates the magnitude of a relative frequency table (square root of the sum of the squared relative frequencies). - -The `magnitude` parameter should be initialized to 0 before calling the function. The table parameter must contain a relative frequency table that was generated from the `exp_fn_i_relative_frequency_table` function. The length of the frequency table must equal the length of `CHAR_SET`. - -### Similarity -```c -int exp_fn_similarity(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) -``` -Returns a value between 0.0 (completely different) and 1.0 (complete match) reflecting the similarity between the value passed in to i0 and the value passed in to i1. The first two parameters should contain strings that need to be compared. If the value 1 is passed in the third parameter, then the similarity function will rely on TF x IDF scores to determine similarity. If no third parameter is passed, then the function will rely only on relative frequency scores. - -## Future Implementation - -### Inverse Document Frequency (IDF) -In text mining, the most common metric to use in the cosine similarity function is the [TF x IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) metric. Our approach uses only TF (term frequency). Inverse document frequency calculates a weighting factor for each character. This could increase precision a small amount by weighting characters that appear on many records as less important in distinguishing matches, and weighting characters that appear on only certain records as more important. IDF could be calculated by iterating through the entire partner dataset each time. The current approach uses the relative frequency of each letter used in the English language on [Wikipedia](https://en.wikipedia.org/wiki/Letter_frequency), which may not be consistent with the data in the partner database. - - diff --git a/centrallix-sysdoc/string_similarity.md b/centrallix-sysdoc/string_similarity.md new file mode 100644 index 00000000..b6c96e8a --- /dev/null +++ b/centrallix-sysdoc/string_similarity.md @@ -0,0 +1,157 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +# String Similarity +The following sections discuss the approaches to calculating similarity between two strings using the `clusters.c` library. This library can be included using `#include "cxlib/clusters.h"` in the centrallix codebase (use `#include "clusters.h"` in other libaries in centrallix-lib). + + +## Table of Contents +- [String Comparison](#string-comparison) + - [Table of Contents](#table-of-contents) + - [Cosine Similarity](#cosine-similarity) + - [Character Sets](#character-sets) + - [Character Pair Hashing](#character-pair-hashing) + - [String Vectors](#string-vectors) + - [Sparse Vectors](#sparse-vectors) + - [Computing Similarity](#computing-similarity) + - [Levenshtein Similarity](#levenshtein-similarity) + - [Clustering](#clustering) + - [K-means Clustering](#k-means-clustering) + - [K-means++ Clustering](#k-means-clustering-1) + - [K-medoids Clustering](#k-medoids-clustering) + - [DBScan Clustering](#db-scan) + - [Sliding Clusters](#sliding-clusters) + - [Future Implementation](#future-implementation) + - [K-means Fuzzy Clustering](#k-means-fuzzy-clusterings) + - [Implement Missing Algorithms](#implement-missing-algorithms) + + +## Cosine Similarity +The [Cosine Similarity](https://en.wikipedia.org/wiki/Cosine_similarity) function is defined as the dot product of two vectors divided by the product of the magnitude of the two vectors. Conceptually, it's like finding the _angle_ between two vectors. To get these vectors, we use the relative frequency of character pairs within each string. To reduce memory cost and speed up computation, we store them in a special sparsely allocated form, described below. + +### Character Sets +Cosine compare currently uses the following character sets. These can be extended or modified later, if necessary. +```c +const char ALLOW_SET[] = " \n\v\f\r!#$%&\"'()*+,-./:;<=>?@[]^_{|}~ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; +const char CHAR_SET[] = "`abcdefghijklmnopqrstuvwxyz0123456789"; +const char SIGNIFICANT_SET[] = "`ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; +const char IGNORE_SET[] = " \n\v\f\r!#$%&\"'()*+,-./:;<=>?@[]^_{|}"; +const char BOUNDARY_CHAR = ('a' - 1); // aka. '`' +``` +- `ALLOW_SET` represents all characters which can be passed to a similarity detection algorithm. Passing other characters may cause warnings and errors, undefined or unintended behavior, and even security concerns. +- `CHAR_SET` represents all of the characters that will be uniquely considered during the calculation of similarity. Currently, this is all lowercase letters and numbers. +- `SIGNIFICANT_SET` represents all of the characters that are significant for the purposes of similarity. For example, the uppercase letters are significant because they are considered identical to lowercase letters. Thus, they are included in the `SIGNIFICANT_SET`, but not in the `CHAR_SET`. +- `IGNORE_SET` represents characters which, while allowed to be passed to a similarity algorithm, will be ignored. For example, the strings "Ya!!" and "Ya..." will be considered identical. +- The `BOUNDARY_CHAR` is a special character which is conceptually added to the start and end of any string to be checked. + - This allows for pairs that functionally include only the first and last character. + - This character appears to have been selected to be one before the first character in `CHAR_SET` (thus convention dictates that it be written `'a' - 1` to indicate this), although it's unknown if that's the main or only reason. + - If `clusters.h` is included, it can be accessed using the `CA_BOUNDARY_CHAR` macro. + +### Character Pair Hashing +Even with a small set of ASCII characters (say 36), there are still `36^2 = 1296` possible character pairs. If the number of characters in the `CHAR_SET` ever needed to be expanded - for example, to include all UTF-8 characters - this number would quickly explode exponentially to utterly infeasible proportions. Thus, a hashing algorithm is employed to hash each character pair down to a more reasonable number of dimensions (which can be accessed with the `CA_NUM_DIMS` macro). + +### String Vectors +Any string of characters in the `ALLOW_SET` can be represented by a vector. For simplicity, imagine this vector has only `5` dimensions. To find this vector, we hash each character pair in the string. As each character pair is hashed (for example, that the pair "ab" happens to hash to `3`), the corresponding dimension is increased by some amount. This amount varies to based on the characters in the pair, helping to mitigate the impact of collisions where different character pairs hash to identical numbers (a larger number of dimensions also helps to mitigate this). + +Remember that the first and last characters form a pair with the `BOUNDARY_CHAR`, so the string "ab" has three pairs: "a", "ab", and "b". If these each hash to `2`, `3`, and `0`. Thus, the vector generated by the string "ab" might be: `[7, 0, 4, 3, 0]`. Notice that dimensions #1 and #4 are both `0` because no character pairs generated a hash of `1` or `4`. In real usecases, the vast majority of elements are `0`s because the number of dimensions used is much larger than the number of character pairs in a typical string. + +### Sparse Vectors +As noted above, the vast majority of elements in a vector generated by a typical string are `0`s. This would lead to a large waste of memory and computation if every `0` was stored separately, so instead, vectors are stored sparsely. Because all hashes are positive integers, we represent `n` `0`s with a value of ` -n`. Thus, the vector `[0, 1, 0, 0, 0]` (representing an empty string in `5` dimensions) would be represented sparsely as `[-1, 1, -3]`. + +**Note**: A value of `0` in a sparse vector is undefined, so no element should be equal to `0`. + +**Note**: Sparse arrays can vary greatly in length. To find their size, one needs to traverse the array until the total number of values found adds up to `CA_NUM_DIMS`. The `ca_sparse_len()` function can be used to do this. Also, the `ca_build_vector()` and `ca_free_vector()` use the `nmSys` functions from `newmalloc.h` to avoid conflicts over the size of the allocated data. + +### Computing Similarity +Finally, to find the cosine similarity between two strings, we can simply take the [dot product](https://en.wikipedia.org/wiki/Dot_product) of their coresponding vectors. Then, we normalize the dot product by dividing by the magnitudes of both vectors multiplied together. Two strings can be compared this way using the `ca_cos_compare()` function. + + +## Levenshtein Similarity +The [Levenshtein](https://en.wikipedia.org/wiki/Levenshtein_distance) distance is defined as the number of insertions, deletions, or substitutions required to make one string exactly like another string. The version implemented in `clusters.c` additionally allows a new operation called a "swap" in which two adjacent characters change places. Transpositions of larger pieces of text are, unfortunately, not handled as well, which is a potential downfall of using levenshtein edit distance. + +The levenshtein similarity of two strings can be compared using the `ca_lev_compare()` function. + + +## Clustering +When searching for similar strings in a large amount of data (for example, `1,000,000` strings), comparing every string to every other string can be very computationally expensive. To speed up this process, it is helpful to _cluster_ similar strings together, then only compare strings within similar clusters. This sacrifices some accuracy to allow large amounts of data to be searched and compared in a feasible amount of time. + +### K-means Clustering +When clustering data using the [k-means](https://en.wikipedia.org/wiki/K-means_clustering) algorithm, data is divided into a predefined number of clusters with the goal of maximizing the average similarity of data points within any given cluster. To quickly summarize the algorithm: +1. Randomly select `k` data points to be the initial centroids of each cluster. +2. For each data point, find the centroid it is most similar to, and assign it to that cluster. +3. For each cluster, find the new centroid by averaging all data points in the cluster. +4. Repeat steps 2 and 3 until the clusters stabilize (i.e. no data point changes clusters). + +The implementation used in `clusters.c` also allows the programmer to specify a maximum number of iterations (called `max_iter` in the code) to prevent this process from running forever. Additionally, successive iterations can give diminishing results or even produce clusters that are slightly worse. To improve performance, the programmer can also specify a minimum improvement threshold (called `min_improvement`). Clusters must become more similar by at least this amount each iteration, otherwise the algorithm ends, even if the maximum number of iterations has not yet been reached. + +The `ca_kmeans()` function can be invoked using [the cosine comparison string vectors](#string-vectors) (see above) to cluster them into similar clusters. + +### K-means++ Clustering +**Not yet implemented** +This method is largely identical to k-means, except that [k-means++](https://en.wikipedia.org/wiki/K-means%2B%2B) assigns the initial centroids using an approximate algorithm designed to avoid some of the poor clustering possible with random assignment. + +### K-medoids Clustering +**Not yet implemented** +This method is also very similar to k-means, except that [k-medoids](https://en.wikipedia.org/wiki/K-medoids) places an additional requirement that all centroids be points in the data. This would theoretically allow for other similarity measures (such as Levenshtein edit distance) to be used for clustering instead of only cosine compare. + +### DB-Scan +**Proposed, not yet implemented or documented** + +### Sliding Clusters +A far more basic method of "clustering" is to simply sort all data alphabetically, then, instead of comparing each string to all other strings, it can be compared to only the next `n` strings. Of course, differences near the start of a string (for example, "fox" vs. "box") will cause those strings to sort far away from each other, leading them to be completely missed. + +Sorting using a similarity measure, such as `ca_cos_compare()` or `ca_lev_compare()` would resolve this issue. However, these comparison functions do not meet the transitivity requirement for sorting, which is that `(A < B) & (B < C) -> (A < C)`. For example, "car" is similar to "boxcar", which is also similar to "box". However, "car" and "box" are not similar at all. + +Additionally, sorting by the cosine vectors (similarly to how we cluster by them when using k-means) was proposed, but further investigation showed that this was also not possible. + +For problems where a sorting algorithm exists which can mitigate the above issues, this solution may prove very promising. However, so far we have not found such a problem, so the other clustering algorithms tend to outperform Sliding Clusters. + + +## Future Implementation + +### K-means Fuzzy Clustering +One of the biggest downsides with k-means is that it creates very arbitrary boundaries between clusters. Elements on either side of these boundaries may be highly similar, but if comparisons only occur within a cluster, these similar entries will be missed. The problem becomes more extreme as a higher k value (more clusters) is used, creating more arbitrary boundaries. This drawback is probably the main reason that clustering sacrifices some accuracy over searching every element. + +Running the entire search multiple types may allow some of these to be found because the initial cluster locations are random. This approach is partially implemented for duplicate searching because the algorithm runs nightly anyway, so a simple upsert (**UP**date existing entries; in**SERT** new entries) slightly reduces this problem. However, this solution is obviously far from ideal. + +If the clustering could be expanded with an additional step that makes clusters larger, adding elements from other clusters to them, this might effectively mitigate the issue. It may also allow developers to use larger numbers of clusters, improving performance as well as accuracy. Further research is needed to verify the effectiveness of this approach before an implementation is written. + +### Implement Missing Algorithms +Several algorithms (such as [k-means++](#k-means-clustering-1), [k-medoids](#k-medoids-clustering), and [DBScan](#db-scan)) above are proposed but lack an implementation. They may be effective and useful, however, to reduce development time, they have not yet been implemented. + +### Upgrade Other Duplicate Detection Systems +When a new record is entered, a quick scan is run to check if it might be a duplicate. There is also a button in the UI for a record that lets you run a duplicate check. These systems could also be upgraded using the new algorithms and strategies developed for general duplicate detection. + +### Known Issues +- The cluster driver often fails to open the structure file if it was modifed since the last time the path was openned. Opening a different path (including the root path, even though it does not support queries) fixes this issue. This is either a bug in the st_node caching or in the cluster driver's usage of stparse. +- The cluster does not invalidate caches if the underlying data source changes. This bug exists because I wasn't sure how to do this, but I'm pretty sure it's possible. Workaround: Developers should use `exec "cache" "drop_all"` to invalidate caches when data is changed, or use a fresh object system instance. diff --git a/centrallix/Makefile.in b/centrallix/Makefile.in index 7d2b1e23..827a23b5 100644 --- a/centrallix/Makefile.in +++ b/centrallix/Makefile.in @@ -80,6 +80,7 @@ XSUPPORT=stparse.o \ endorsement_utils.o \ obfuscate.o \ json_util.o \ + double_metaphone.o \ double.o SUPPORT=$(patsubst %,utility/%,$(XSUPPORT)) @@ -115,6 +116,7 @@ XOBJDRIVERS=objdrv_ux.o \ objdrv_uxprint.o \ objdrv_qytree.o \ objdrv_qypivot.o \ + objdrv_cluster.o \ objdrv_datafile.o \ objdrv_audio.o \ objdrv_link.o \ @@ -133,6 +135,7 @@ XV3OBJDRIVERS= \ objdrv_uxprint_v3.o \ objdrv_qytree.o \ objdrv_qypivot.o \ + objdrv_cluster.o \ objdrv_query.o \ objdrv_datafile.o \ objdrv_audio.o \ diff --git a/centrallix/centrallix.c b/centrallix/centrallix.c index 6467ab2b..b8a9e4ea 100644 --- a/centrallix/centrallix.c +++ b/centrallix/centrallix.c @@ -440,6 +440,7 @@ cxDriverInit() stxInitialize(); /* Structure file driver */ qytInitialize(); /* Query Tree driver */ qypInitialize(); /* Query Pivot driver */ + clusterInitialize(); /* Cluster driver */ qyInitialize(); /* stored query (aka view) driver */ rptInitialize(); /* report writer driver */ uxpInitialize(); /* UNIX printer access driver */ @@ -694,4 +695,3 @@ cxLinkSigningSetup(pStructInf my_config) return 0; } - diff --git a/centrallix/etc/types.cfg b/centrallix/etc/types.cfg index 11ebc3e3..6cbac5ae 100644 --- a/centrallix/etc/types.cfg +++ b/centrallix/etc/types.cfg @@ -51,6 +51,7 @@ "system/symbolic-link" "Symbolic Link" lnk "" "text/plain" "text/css" "CSS File" css "" "text/plain" "system/querypivot" "Query Pivot Object" qyp "" "system/structure" +"system/cluster" "Clustering Object" cluster "" "system/structure" "application/json" "JSON data" json "" "text/plain" "text/json" "JSON data" "" "" "application/json" "text/x-json" "JSON data" "" "" "application/json" diff --git a/centrallix/expression/exp_compiler.c b/centrallix/expression/exp_compiler.c index 48f97592..3fcd83ef 100644 --- a/centrallix/expression/exp_compiler.c +++ b/centrallix/expression/exp_compiler.c @@ -1043,8 +1043,8 @@ expCompileExpression(char* text, pParamObjects objlist, int lxflags, int cmpflag /*** expBindExpression - do late binding of an expression tree to an *** object list. 'domain' specifies the requested bind domain, whether - *** runstatic (EXP_F_RUNSTATIC), runserver (EXP_F_RUNSERVER), or runclient - *** (EXP_F_RUNCLIENT). 'domain' can also be -0-, in which case we rebind + *** runstatic (EXPR_F_RUNSTATIC), runserver (EXPR_F_RUNSERVER), or runclient + *** (EXPR_F_RUNCLIENT). 'domain' can also be -0-, in which case we rebind *** a domainless expression. ***/ int @@ -1072,20 +1072,11 @@ expBindExpression(pExpression exp, pParamObjects objlist, int flags) break; } } - if (exp->ObjID == -1) - { - cm |= EXPR_MASK_EXTREF; - } - } - else if (exp->ObjID == -2 || exp->ObjID == -3) - { - if (exp->ObjID == -2) cm |= (1<<(objlist->CurrentID)); - if (exp->ObjID == -3) cm |= (1<<(objlist->ParentID)); - } - else if (exp->ObjID >= 0) - { - cm |= (1<<(exp->ObjID)); + cm |= EXPR_MASK_EXTREF; } + else if (exp->ObjID == EXPR_OBJID_CURRENT) cm |= (1<<(objlist->CurrentID)); + else if (exp->ObjID == EXPR_OBJID_PARENT) cm |= (1<<(objlist->ParentID)); + else if (exp->ObjID >= 0) cm |= (1<<(exp->ObjID)); } /** Check for absolute references in functions **/ @@ -1105,4 +1096,3 @@ expBindExpression(pExpression exp, pParamObjects objlist, int flags) return cm; } - diff --git a/centrallix/expression/exp_functions.c b/centrallix/expression/exp_functions.c index 6425114d..3ee97016 100644 --- a/centrallix/expression/exp_functions.c +++ b/centrallix/expression/exp_functions.c @@ -1,27 +1,3 @@ -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include "obj.h" -#include "cxlib/mtask.h" -#include "cxlib/xarray.h" -#include "cxlib/xhash.h" -#include "cxlib/mtlexer.h" -#include "expression.h" -#include "cxlib/mtsession.h" -#include "cxss/cxss.h" -#include -#include -#include -#include -#include - - /************************************************************************/ /* Centrallix Application Server System */ /* Centrallix Core */ @@ -65,6 +41,514 @@ /* that issue in exp_evaluate.c */ /************************************************************************/ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cxlib/clusters.h" +#include "cxlib/mtask.h" +#include "cxlib/mtlexer.h" +#include "cxlib/mtsession.h" +#include "cxlib/newmalloc.h" +#include "cxlib/util.h" +#include "cxlib/xarray.h" +#include "cxlib/xhash.h" +#include "cxss/cxss.h" +#include "double_metaphone.h" +#include "expression.h" +#include "obj.h" + + +/*** Specifies expectations about an argument. + *** + *** @param Datatypes An array of datatypes (terminated with a -1). Set to NULL + *** to accept any datatype as valid for this argument. + *** @param Flags Flags to require other properties about an argument. If the + *** flag a required behavior for specific types, the requirement will be + *** skipped for other types. + *** + *** Valid Flags: + *** - `EXP_ARG_OPTIONAL`: The arg is optional. It is not valid a required + *** argument after an optional one. + *** - `EXP_ARG_NOT_NULL`: Expect the arg to not be null. + *** - `EXP_ARG_FORCE_TYPE`: Run type check on null args (not recommended). + *** - `EXP_ARG_NON_EMPTY`: Expect string to be non-empty. Expect a + *** stringvec or intvec to have elements (does not check them). + *** - `EXP_ARG_POSITIVE`: Expect a positive or zero value for int, double, + *** money, or datetime. (Includes NON_NAN: NAN is not positive). + *** - `EXP_ARG_NEGATIVE`: Expect a negative or zero value for int, double, + *** money, or datetime. (Includes NON_NAN: NAN is not negative). + *** - `EXP_ARG_NON_NAN`: Expect a double to be a number, not NAN. + *** + *** @attention - Checks like `EXP_ARG_NON_EMPTY`, `EXP_ARG_NON_NAN`, etc. also + *** succeed for `NULL` values. To avoid this, specify `EXP_ARG_NOT_NULL`. + ***/ +typedef struct + { + int* Datatypes; + int Flags; + } + ArgExpect, *pArgExpect; + +#define EXP_ARG_END (ArgExpect){NULL, -1} +#define EXP_ARG_NO_FLAGS (0) +#define EXP_ARG_OPTIONAL (1 << 0) +#define EXP_ARG_NOT_NULL (1 << 1) +#define EXP_ARG_FORCE_TYPE (1 << 2) +#define EXP_ARG_NON_EMPTY (1 << 3) +#define EXP_ARG_NEGATIVE (1 << 4) +#define EXP_ARG_POSITIVE (1 << 5) +#define EXP_ARG_NON_NAN (1 << 6) + +/*** An internal function used by the schema verifier (below) to verify each + *** argument of the provided schema. + *** + *** @param fn_name The name of the expression function to be verified. + *** @param arg The argument to be verified. + *** @param arg_expect The expectation struct which specifies the requirements + *** for this argument. + *** @returns 0 if the expectations are successfully met, + *** -1 if an expectation is violated (and mssError() is called). + ***/ +static int +exp_fn_i_verify_arg(const char* fn_name, pExpression arg, const ArgExpect* arg_expect) + { + /** The expectation struct cannot be NULL. **/ + if (arg_expect == NULL) + { + mssErrorf(1, "EXP", + "%s(...): Expectation struct cannot be NULL", + fn_name + ); + return -1; + } + + /** Extract values. **/ + ASSERTMAGIC(arg, MGK_EXPRESSION); + int actual_datatype = arg->DataType; + + /** Check for a provided NULL value. **/ + if (arg->Flags & EXPR_F_NULL) + { + if (arg_expect->Flags & EXP_ARG_NOT_NULL) + { + mssErrorf(1, "EXP", + "%s(...): Expects a non-null value, but got NULL : %s (%d).", + fn_name, objTypeToStr(actual_datatype), actual_datatype + ); + return -1; + } + + /** Skip type checks for NULL values (unless they are forced). **/ + if (!(arg_expect->Flags & EXP_ARG_FORCE_TYPE)) goto skip_type_checks; + } + + /** Skip type checks if none are requested. **/ + if (arg_expect->Datatypes == NULL) goto skip_type_checks; + + /** Type checks requested, but no valid types given: Likely a mistake. **/ + if (arg_expect->Datatypes[0] == -1) + { + mssErrorf(1, "EXP", "%s(...): Invalid Schema! Empty array of allowed datatypes.", fn_name); + fprintf(stderr, "Hint: To skip type checks, pass NULL for the array of data types.\n"); + return -1; + } + + /** Verify datatypes. **/ + bool found = false; + for (int j = 0; arg_expect->Datatypes[j] != -1; j++) + { + const int expected_datatype = arg_expect->Datatypes[j]; + if (expected_datatype == actual_datatype) + { + found = true; + break; + } + } + + /** Handle failure. **/ + if (!found) + { + /** Accumulate additional valid types. **/ + char buf[256] = {'\0'}; + int cur = 0, j = 1; + while (true) + { + int datatype = arg_expect->Datatypes[j++]; + if (datatype == -1) break; + + cur += snprintf( + buf + cur, 256 - cur, + " or %s (%d)", + objTypeToStr(datatype), datatype + ); + } + + /** Print error. **/ + int first_datatype = arg_expect->Datatypes[0]; + mssErrorf(1, "EXP", + "%s(...): Expects type %s (%d)%s but got type %s (%d).", + fn_name, objTypeToStr(first_datatype), first_datatype, buf, objTypeToStr(actual_datatype), actual_datatype + ); + return -1; + } + + skip_type_checks: + /** All flag checks not implemented above should pass on NULL values. **/ + if (arg->Flags & EXPR_F_NULL) return 0; + + /** Verify other Flags by type, if specified. **/ + switch (actual_datatype) + { + case DATA_T_INTEGER: + { + int value = arg->Integer; + if (arg_expect->Flags & EXP_ARG_POSITIVE && value < 0) + { + mssErrorf(1, "EXP", + "%s(...): Expects positive int but got %d.", + fn_name, value + ); + return -1; + } + if (arg_expect->Flags & EXP_ARG_NEGATIVE && value > 0) + { + mssErrorf(1, "EXP", + "%s(...): Expects negative int but got %d.", + fn_name, value + ); + return -1; + } + break; + } + + case DATA_T_DOUBLE: + { + double value = arg->Types.Double; + if (arg_expect->Flags & EXP_ARG_NON_NAN && isnan(value)) + { + mssErrorf(1, "EXP", + "%s(...): Expects non-nan double but got %g.", + fn_name, value + ); + return -1; + } + if (arg_expect->Flags & EXP_ARG_POSITIVE && value < 0) + { + mssErrorf(1, "EXP", + "%s(...): Expects positive double but got %g.", + fn_name, value + ); + return -1; + } + if (arg_expect->Flags & EXP_ARG_NEGATIVE && value > 0) + { + mssErrorf(1, "EXP", + "%s(...): Expects negative double but got %g.", + fn_name, value + ); + return -1; + } + break; + } + + case DATA_T_STRING: + { + char* str = arg->String; + if (arg_expect->Flags & EXP_ARG_NON_EMPTY && str[0] == '\0') + { + mssErrorf(1, "EXP", + "%s(...): Expects string to contain characters, but got \"\".", + fn_name + ); + return -1; + } + break; + } + + case DATA_T_DATETIME: + { + pDateTime value = &arg->Types.Date; + if (arg_expect->Flags & EXP_ARG_POSITIVE && value->Value < 0) + { + mssErrorf(1, "EXP", + "%s(...): Expects positive date offset but got %llu.", + fn_name, value->Value + ); + return -1; + } + if (arg_expect->Flags & EXP_ARG_NEGATIVE && value->Value > 0) + { + mssErrorf(1, "EXP", + "%s(...): Expects negative date offset but got %llu.", + fn_name, value->Value + ); + return -1; + } + break; + } + + case DATA_T_MONEY: + { + pMoneyType value = &arg->Types.Money; + if (arg_expect->Flags & EXP_ARG_POSITIVE && value->WholePart < 0) + { + mssErrorf(1, "EXP", + "%s(...): Expects positive money value but got $%d.%g.", + fn_name, value->WholePart, (double)value->FractionPart / 100.0 + ); + return -1; + } + if (arg_expect->Flags & EXP_ARG_NEGATIVE && value->WholePart > 0) + { + mssErrorf(1, "EXP", + "%s(...): Expects negative money value but got $%d.%d.", + fn_name, value->WholePart, (double)value->FractionPart / 100.0 + ); + return -1; + } + } + + case DATA_T_STRINGVEC: + { + pStringVec str_vec = &arg->Types.StrVec; + if (arg_expect->Flags & EXP_ARG_NON_EMPTY && str_vec->nStrings == 0) + { + mssErrorf(1, "EXP", + "%s(...): Expects StringVec to contain strings, but got [].", + fn_name + ); + return -1; + } + break; + } + + case DATA_T_INTVEC: + { + pIntVec int_vec = &arg->Types.IntVec; + if (arg_expect->Flags & EXP_ARG_NON_EMPTY && int_vec->nIntegers == 0) + { + mssErrorf(1, "EXP", + "%s(...): Expects IntVec to contain strings, but got [].", + fn_name + ); + return -1; + } + break; + } + } + + return 0; + } + +/*** Verify that arguments passed to a function match some expected values. + *** + *** @param arg_expects A pointer to an array of ArgExpect structs, each + *** representing expectations for a single argument, in the order they + *** are passed to the function. + *** @param num_args The number of arguments to expect to be passed to the + *** function (and the length of arg_expects). + *** @param tree The tree containing the actual arguments passed. + *** @param obj_list The object list scope which was passed to the function. + *** @returns 0 if verification passes, or + *** -1 if an error occurs or arguments are incorrect. + *** + *** @attention - Promises that an error message will be printed with a call + *** to mssError() if an error occurs. + *** + *** Example: + *** ```c + *** if (exp_fn_i_verify_schema( + *** (ArgExpect[]){ + *** {(int[]){DATA_T_INTEGER, DATA_T_DOUBLE, DATA_T_DATETIME, -1}, EXP_ARG_NOT_NULL}, + *** {(int[]){DATA_T_STRING, -1}, EXP_ARG_NO_FLAGS}, + *** {(int[]){DATA_T_STRING, -1}, EXP_ARG_OPTIONAL}, + *** EXP_ARG_END + *** }, tree + *** ) != 0) + *** { + *** mssErrorf(0, "EXP", "%s(?): Call does not match function schema.", tree->Name); + *** return -1; + *** } + *** ``` + ***/ +static int +exp_fn_i_verify_schema(const ArgExpect* arg_expects, pExpression tree) + { + /** Verify expression tree. **/ + ASSERTMAGIC(tree, MGK_EXPRESSION); + + /** Count arguments. **/ + unsigned int req_args = 0u, opt_args = 0u; + for (unsigned int i = 0; arg_expects[i].Flags != EXP_ARG_END.Flags; i++) + { + if (arg_expects[i].Flags & EXP_ARG_OPTIONAL) + opt_args++; + else if (opt_args > 0) + { + /** Required argument follows optional argument (not allowed). **/ + mssErrorf(1, "EXP", "%s(?): Invalid Schema! Required argument #%u after optional argument.", tree->Name, i); + return -1; + } + else + req_args++; + } + const unsigned int total_args = req_args + opt_args; + + /** Verify argument count. **/ + const int actual_args = tree->Children.nItems; + if (opt_args == 0) + { + if (actual_args != req_args) + { + mssErrorf(1, "EXP", + "%s(?): Expects %u argument%s, got %d argument%s.", + tree->Name, req_args, (req_args == 1) ? "" : "s", actual_args, (actual_args == 1) ? "" : "s" + ); + return -1; + } + } + else + { + if (actual_args < req_args || total_args < actual_args) + { + mssErrorf(1, "EXP", + "%s(?): Expects between %u and %u arguments, got %d argument%s.", + tree->Name, req_args, total_args, actual_args, (actual_args == 1) ? "" : "s" + ); + return -1; + } + } + + /** Verify arguments. **/ + for (int i = 0; i < actual_args; i++) + { + if (exp_fn_i_verify_arg(tree->Name, tree->Children.Items[i], &arg_expects[i]) != 0) + { + mssErrorf(0, "EXP", + "%s(...): Error while reading arg #%d/%d.", + tree->Name, i + 1, max(i + 1, req_args) + ); + return -1; + } + } + + /** Pass. **/ + return 0; + } + +/*** Extract a number from a numeric expression. + *** + *** @param numeric_expr The numeric expression to be extracted. + *** @param result_ptr A pointer to a double where the result is stored. + *** @returns 0 on success, + *** -1 on failure, + *** 1 if the expression is NULL. + ***/ +static int +exp_fn_i_get_number(pExpression numeric_expr, double* result_ptr) + { + /** Check for null values. **/ + if (numeric_expr == NULL || numeric_expr->Flags & EXPR_F_NULL) return 1; + + /** Check for null destination. **/ + if (result_ptr == NULL) + { + mssError(1, "EXP", "Null location provided to store numeric result."); + return -1; + } + + /** Get the numeric value. **/ + double n; + switch(numeric_expr->DataType) + { + case DATA_T_INTEGER: n = numeric_expr->Integer; break; + case DATA_T_DOUBLE: n = numeric_expr->Types.Double; break; + case DATA_T_MONEY: n = objDataToDouble(DATA_T_MONEY, &(numeric_expr->Types.Money)); break; + default: + mssError(1, "EXP", + "%s (%d) is not a numeric type.", + objTypeToStr(numeric_expr->DataType), numeric_expr->DataType + ); + return -1; + } + + /** Store the result. **/ + *result_ptr = n; + + return 0; + } + +/*** Free the given tree's result string, if it has one. + *** + *** @param tree The affected tree. + ***/ +static void +exp_fn_i_free_result_string(pExpression tree) + { + /** If no string is allocated, no work is needed. **/ + if (tree->Alloc == 0) return; + + /** Free the string, if it exists. **/ + if (tree->String != NULL) nmSysFree(tree->String); + + /** No string is allocated anymore. */ + tree->Alloc = 0; + + return; + } + +/*** Ensure that the allocated result string is long enough to store a given + *** amount of required data. This function promises that `tree->String` will + *** point to at least `required_space` bytes after it returns 0. + *** + *** @param tree The affected tree. + *** @param required_space The number of bytes required. + *** @returns 0 if successful, or + *** -1 if an error occurs. + ***/ +static int +exp_fn_i_alloc_result_string(pExpression tree, const size_t required_space) + { + /** Free the previous string (if needed) so we can store a new one. **/ + exp_fn_i_free_result_string(tree); + + /** Decide how to allocate space. **/ + if (required_space <= 64) + { + /** We can use the preallocated buffer. **/ + tree->String = tree->Types.StringBuf; + tree->Alloc = 0; + } + else + { + /** We need to allocate new memory. **/ + char* result = check_ptr(nmSysMalloc(required_space * sizeof(char*))); + if (result == NULL) return -1; + tree->String = result; + tree->Alloc = 1; + } + + return 0; + } + /****** Evaluator functions follow for expEvalFunction ******/ @@ -1111,111 +1595,162 @@ int exp_fn_reverse(pExpression tree, pParamObjects objlist, pExpression i0, pExp return 0; } - -int exp_fn_lztrim(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) +/** Leading zero trim. */ +int +exp_fn_lztrim(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) { - char* ptr; - - if (!i0 || i0->Flags & EXPR_F_NULL) - { - tree->Flags |= EXPR_F_NULL; + /** Expect one nullable string parameter. **/ + if (exp_fn_i_verify_schema((ArgExpect[]){ + {(int[]){DATA_T_STRING, -1}, EXP_ARG_NO_FLAGS}, + EXP_ARG_END, + }, tree) != 0) + { + mssErrorf(0, "EXP", "%s(?): Call does not match function schema.", tree->Name); + return -1; + } + + /** Extract the arg string. **/ + pExpression maybe_str = check_ptr(tree->Children.Items[0]); + if (maybe_str->Flags & EXPR_F_NULL) + { + /** Propegate null values. **/ + tree->Flags |= EXPR_F_NULL; + tree->DataType = DATA_T_STRING; + return 0; + } + char* str = maybe_str->String; + + /*** We don't need to allocate new memory or copy anything because we + *** can simply point to the first character in the previous string + *** that isn't trimmed. + ***/ + + /** Iterate over all the characters that need to be removed. **/ + while (*str == '0' && (str[1] >= '0' && str[1] <= '9')) str++; + + /** Return the results using the tree. **/ + exp_fn_i_free_result_string(tree); tree->DataType = DATA_T_STRING; - return 0; - } - if (i0->DataType != DATA_T_STRING) - { - mssError(1,"EXP","lztrim() only works on STRING data types"); - return -1; - } - if (tree->Alloc && tree->String) - { - nmSysFree(tree->String); - } - tree->DataType = DATA_T_STRING; - ptr = i0->String; - while(*ptr == '0' && (ptr[1] >= '0' && ptr[1] <= '9')) ptr++; - tree->String = ptr; - tree->Alloc = 0; + tree->String = str; + return 0; } -int exp_fn_ltrim(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) +/** Left trim spaces. **/ +int +exp_fn_ltrim(pExpression tree) { - char* ptr; - - if (!i0 || i0->Flags & EXPR_F_NULL) - { - tree->Flags |= EXPR_F_NULL; - tree->DataType = DATA_T_STRING; - return 0; - } - if (i0->DataType != DATA_T_STRING) - { - mssError(1,"EXP","ltrim() only works on STRING data types"); - return -1; - } - if (tree->Alloc && tree->String) - { - nmSysFree(tree->String); - } - tree->DataType = DATA_T_STRING; - ptr = i0->String; - while(*ptr == ' ') ptr++; - tree->String = ptr; - tree->Alloc = 0; + /** Expect one nullable string parameter. **/ + if (exp_fn_i_verify_schema((ArgExpect[]){ + {(int[]){DATA_T_STRING, -1}, EXP_ARG_NO_FLAGS}, + EXP_ARG_END, + }, tree) != 0) + { + mssErrorf(0, "EXP", "%s(?): Call does not match function schema.", tree->Name); + return -1; + } + + /** Extract the arg string. **/ + pExpression maybe_str = check_ptr(tree->Children.Items[0]); + if (maybe_str->Flags & EXPR_F_NULL) + { + /** Propegate null values. **/ + tree->Flags |= EXPR_F_NULL; + tree->DataType = DATA_T_STRING; + return 0; + } + char* str = maybe_str->String; + + /*** We don't need to allocate new memory or copy anything because we + *** can simply point to the first character in the previous string + *** that isn't trimmed. + ***/ + + /** Iterate until we find the a charater that isn't a space. **/ + /** Note: Only spaces are trimmed, as with similar trim functions in most SQL languages. **/ + while (*str == ' ') str++; + + /** Return the results using the tree. **/ + exp_fn_i_free_result_string(tree); + tree->DataType = DATA_T_STRING; + tree->String = str; + return 0; } -int exp_fn_rtrim(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) +/** Right trim spaces. **/ +int +exp_fn_rtrim(pExpression tree) { - char* ptr; - int n; - - if (!i0 || i0->Flags & EXPR_F_NULL) - { - tree->Flags |= EXPR_F_NULL; - tree->DataType = DATA_T_STRING; - return 0; - } - if (i0->DataType != DATA_T_STRING) - { - mssError(1,"EXP","rtrim() only works on STRING data types"); - return -1; - } - if (tree->Alloc && tree->String) - { - nmSysFree(tree->String); - } - tree->Alloc = 0; - tree->DataType = DATA_T_STRING; - ptr = i0->String + strlen(i0->String); - while(ptr > i0->String && ptr[-1] == ' ') ptr--; - if (ptr == i0->String + strlen(i0->String)) - { - /** optimization for strings are still the same **/ - tree->String = i0->String; - } - else - { - /** have to copy because we removed spaces **/ - n = ptr - i0->String; - if (n < 63) + /** Expect one nullable string parameter. **/ + if (exp_fn_i_verify_schema((ArgExpect[]){ + {(int[]){DATA_T_STRING, -1}, EXP_ARG_NO_FLAGS}, + EXP_ARG_END, + }, tree) != 0) { - tree->String = tree->Types.StringBuf; - memcpy(tree->String, i0->String, n); - tree->String[n] = '\0'; - tree->Alloc = 0; + mssErrorf(0, "EXP", "%s(?): Call does not match function schema.", tree->Name); + return -1; } - else + + /** Extract the arg string. **/ + pExpression maybe_str = check_ptr(tree->Children.Items[0]); + if (maybe_str->Flags & EXPR_F_NULL) { - tree->String = (char*)nmSysMalloc(n+1); - memcpy(tree->String, i0->String, n); - tree->String[n] = '\0'; - tree->Alloc = 1; + /** Propegate null values. **/ + tree->Flags |= EXPR_F_NULL; + tree->DataType = DATA_T_STRING; + return 0; } - } + char* str = maybe_str->String; + + /** Trim spaces from the end of the string. **/ + /** Note: Only spaces are trimmed, as with similar trim functions in most SQL languages. **/ + const int len = strlen(str); + int n = len; + while (n > 0 && str[n - 1] == ' ') n--; + + /** Optimization for strings that are still the same. **/ + if (n == len) + { + tree->String = str; + goto end; + } + + /** We need to copy to remove spaces. **/ + if (!check(exp_fn_i_alloc_result_string(tree, n + 1))) return -1; + memcpy(tree->String, str, n); + tree->String[n] = '\0'; + + end: + /** Return the results in the tree. **/ + tree->DataType = DATA_T_STRING; + tree->Alloc = 0; + + return 0; + } + + +/** Left and right trim spaces. **/ +int +exp_fn_trim(pExpression tree) + { + /** Left trim the expression. **/ + exp_fn_ltrim(tree); + + /** Temporarily override the arg1 str pointer with the result from ltrim(). **/ + pExpression arg1 = tree->Children.Items[0]; + char* arg1_str = arg1->String; + arg1->String = tree->String; + tree->Alloc = 0; + + /** Right trim the expression, which will use the overriden string above. **/ + exp_fn_rtrim(tree); + + /** Restore the arg1 tree. **/ + arg1->String = arg1_str; + return 0; } @@ -2356,17 +2891,37 @@ int exp_fn_truncate(pExpression tree, pParamObjects objlist, pExpression i0, pEx /*** constrain(value, min, max) ***/ int exp_fn_constrain(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) { - if (!i0 || !i1 || !i2 || (i0->DataType != i1->DataType) || i0->DataType != i2->DataType || !(i0->DataType == DATA_T_INTEGER || i0->DataType == DATA_T_MONEY || i0->DataType == DATA_T_DOUBLE)) - { - mssError(1,"EXP","constrain() requires three numeric parameters of the same data type"); - return -1; - } + /** Skip null value. **/ tree->DataType = i0->DataType; if ((i0->Flags & EXPR_F_NULL)) { tree->Flags |= EXPR_F_NULL; return 0; } + + /** Verify parameters. **/ + if (i0 == NULL || i1 == NULL || i2 == NULL) + { + mssError(1, "EXP", "constrain() expects three parameters."); + return -1; + } + if (i0->DataType != DATA_T_INTEGER && i0->DataType != DATA_T_DOUBLE && i0->DataType != DATA_T_MONEY) + { + mssError(1, "EXP", + "constrain() expects three numeric parameters: %s is not numeric.", + objTypeToStr(i0->DataType) + ); + if (i0->DataType == DATA_T_STRING) printf("Value: '%s'\n", i0->String); + return -1; + } + if (i0->DataType != i1->DataType || i1->DataType != i2->DataType) + { + mssError(1, "EXP", + "constrain() expects three numeric parameters of the same data type but got types %s, %s, and %s.", + objTypeToStr(i0->DataType), objTypeToStr(i1->DataType), objTypeToStr(i2->DataType) + ); + return -1; + } /* check min */ if (!(i1->Flags & EXPR_F_NULL)) @@ -3224,99 +3779,111 @@ int exp_fn_from_base64(pExpression tree, pParamObjects objlist, pExpression i0, return -1; } - -int exp_fn_log10(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) +static int +exp_fn_i_do_math(pExpression tree, double (*math)(), int arg_num) { - double n; - - if (!i0) + /** Verify function schema: expect arg_num numeric values. **/ + ArgExpect expects[arg_num + 1]; + for (int i = 0; i < arg_num; i++) + expects[i] = (ArgExpect){(int[]){DATA_T_INTEGER, DATA_T_DOUBLE, DATA_T_MONEY, -1}, EXP_ARG_NO_FLAGS}; + expects[arg_num] = EXP_ARG_END; + if (exp_fn_i_verify_schema(expects, tree) != 0) { - mssError(1, "EXP", "log10() requires a number as its first parameter"); - goto error; + mssErrorf(0, "EXP", "%s(?): Call does not match function schema.", tree->Name); + return -1; } - if (i0->Flags & EXPR_F_NULL) + + /** Null checks. **/ + for (int i = 0; i < arg_num; i++) { - tree->DataType = DATA_T_DOUBLE; - tree->Flags |= EXPR_F_NULL; - return 0; + pExpression arg = tree->Children.Items[i]; + if (arg->Flags & EXPR_F_NULL) + { + tree->DataType = DATA_T_DOUBLE; + tree->Flags |= EXPR_F_NULL; + return 0; + } } - switch(i0->DataType) + + /** Maximum supported args. **/ + if (arg_num > 4) { - case DATA_T_INTEGER: - n = i0->Integer; - break; - case DATA_T_DOUBLE: - n = i0->Types.Double; - break; - case DATA_T_MONEY: - n = objDataToDouble(DATA_T_MONEY, &(i0->Types.Money)); - break; - default: - mssError(1, "EXP", "log10() requires a number as its first parameter"); - goto error; + mssErrorf(1, "EXP", "%s(...): exp_fn_i_do_math() does not support functions with more than 4 arguments. If this is an issue, please increase the number of arguments here: %s:%d", tree->Name, __FILE__, __LINE__); + return -1; } - if (n < 0) + + /** Get the numbers for the args. **/ + double n[4]; + for (int i = 0; i < arg_num; i++) { - mssError(1, "EXP", "log10(): cannot compute the logarithm of a negative number"); - goto error; + if (!check(exp_fn_i_get_number(tree->Children.Items[i], &(n[i])))) + { + mssErrorf(0, "EXP", "%s(...): Failed to get arg%d.", tree->Name, i); + return -1; + } } + + /** Return results. **/ tree->DataType = DATA_T_DOUBLE; - tree->Types.Double = log10(n); - return 0; + tree->Types.Double = math(n[0], n[1], n[2], n[3]); /* Call function with all supported args. */ + + return 0; + } - error: - return -1; +int +exp_fn_power(pExpression tree) + { + return exp_fn_i_do_math(tree, pow, 2); } +int +exp_fn_ln(pExpression tree) + { + return exp_fn_i_do_math(tree, log, 1); + } -int exp_fn_power(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) +int +exp_fn_log10(pExpression tree) { - double n, p; + return exp_fn_i_do_math(tree, log10, 1); + } - if (!i0 || !i1) - { - mssError(1, "EXP", "power() requires numbers as its first and second parameters"); - goto error; - } - if ((i0->Flags & EXPR_F_NULL) || (i1->Flags & EXPR_F_NULL)) +int +exp_fn_log(pExpression tree) + { + /** Verify function schema: A number and an optional base. **/ + if (exp_fn_i_verify_schema((ArgExpect[]){ + {(int[]){DATA_T_INTEGER, DATA_T_DOUBLE, DATA_T_MONEY, -1}, EXP_ARG_NO_FLAGS}, + {(int[]){DATA_T_INTEGER, DATA_T_DOUBLE, DATA_T_MONEY, -1}, EXP_ARG_OPTIONAL}, + EXP_ARG_END, + }, tree) != 0) { - tree->DataType = DATA_T_DOUBLE; - tree->Flags |= EXPR_F_NULL; - return 0; + mssErrorf(0, "EXP", "%s(?): Call does not match function schema.", tree->Name); + return -1; } - switch(i0->DataType) + + /** Extract args. **/ + double number, base; + if (!check(exp_fn_i_get_number(check_ptr(tree->Children.Items[0]), &number))) { - case DATA_T_INTEGER: - n = i0->Integer; - break; - case DATA_T_DOUBLE: - n = i0->Types.Double; - break; - case DATA_T_MONEY: - n = objDataToDouble(DATA_T_MONEY, &(i0->Types.Money)); - break; - default: - mssError(1, "EXP", "power() requires a number as its first parameter"); - goto error; + mssErrorf(0, "EXP", "%s(...): Failed to get arg1 (number).", tree->Name); + return -1; } - switch(i1->DataType) + if (tree->Children.nItems > 1) { - case DATA_T_INTEGER: - p = i1->Integer; - break; - case DATA_T_DOUBLE: - p = i1->Types.Double; - break; - default: - mssError(1, "EXP", "power() requires an integer or double as its second parameter"); - goto error; + if (!check(exp_fn_i_get_number(check_ptr(tree->Children.Items[1]), &base))) + { + mssErrorf(0, "EXP", "%s(...): Failed to get arg2 (base).", tree->Name); + return -1; + } } + else base = M_E; + + /** Return the results in the tree. **/ tree->DataType = DATA_T_DOUBLE; - tree->Types.Double = pow(n, p); - return 0; + tree->Types.Double = log(number) / log(base); - error: - return -1; + return 0; } @@ -3978,368 +4545,188 @@ int exp_fn_nth(pExpression tree, pParamObjects objlist, pExpression i0, pExpress return 0; } -/* See centrallix-sysdoc/string_comparison.md for more information. */ -int exp_fn_levenshtein(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) - { - if (!i0 || !i1) - { - mssError(1,"EXP","levenshtein() requires two parameters"); - return -1; - } - - if ((i0->Flags & EXPR_F_NULL) || (i1->Flags & EXPR_F_NULL)) - { - tree->DataType = DATA_T_INTEGER; - tree->Flags |= EXPR_F_NULL; - return 0; - } - - if ((i0->DataType != DATA_T_STRING) || (i1->DataType != DATA_T_STRING)) - { - mssError(1,"EXP","levenshtein() requires two string parameters"); - return -1; - } - - // for all i and j, d[i,j] will hold the Levenshtein distance between - // the first i characters of s and the first j characters of t - int length1 = strlen(i0->String); - int length2 = strlen(i1->String); - //int levMatrix[length1+1][length2+1]; - int (*levMatrix)[length1+1][length2+1] = nmSysMalloc(sizeof(*levMatrix)); - int i; - int j; - //set each element in d to zero - for (i = 0; i < length1; i++) +int +exp_fn_metaphone(pExpression tree, pParamObjects obj_list) { - for (j = 0; j < length2; j++) - { - (*levMatrix)[i][j] = 0; - } - } - - // source prefixes can be transformed into empty string by - // dropping all characters - for (i = 0; i <= length1; i++) - { - (*levMatrix)[i][0] = i; - } - - // target prefixes can be reached from empty source prefix - // by inserting every character - for (j = 0; j <= length2; j++) - { - (*levMatrix)[0][j] = j; - } - - for (i = 1; i <= length1; i++) - { - for (j = 1; j <= length2; j++) - { - if (i0->String[i-1] == i1->String[j-1]) - { - (*levMatrix)[i][j] = (*levMatrix)[i-1][j-1]; - } - else - { - int value1 = (*levMatrix)[i - 1][j] + 1; - int value2 = (*levMatrix)[i][j-1] + 1; - int value3 = (*levMatrix)[i-1][j-1] + 1; - (*levMatrix)[i][j] = (value1 < value2) ? - ((value1 < value3) ? value1 : value3) : - (value2 < value3) ? value2 : value3; - } - } - } - tree->DataType = DATA_T_INTEGER; - tree->Integer = (*levMatrix)[length1][length2]; - nmSysFree(levMatrix); - return 0; - } - -/* See centrallix-sysdoc/string_comparison.md for more information. */ -int exp_fn_lev_compare(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) - { - - if (!i0 || !i1) - { - mssError(1,"EXP","lev_compare() requires two or three parameters"); - return -1; - } - - if ((i0->Flags & EXPR_F_NULL) || (i1->Flags & EXPR_F_NULL) || (i2 && (i2->Flags & EXPR_F_NULL))) - { - tree->DataType = DATA_T_DOUBLE; - tree->Flags |= EXPR_F_NULL; - return 0; - } - - if ((i0->DataType != DATA_T_STRING) || (i1->DataType != DATA_T_STRING) || (i2 && i2->DataType != DATA_T_INTEGER)) - { - mssError(1,"EXP","lev_compare() requires two string and one optional integer parameters"); - return -1; - } - - exp_fn_levenshtein(tree, objlist, i0, i1, i2); - //!!! I am not checking for errors here, because IN THEORY we have two strings... if we don't, big uh-oh. - int lev_dist = tree->Integer; + /** Verify function schema. **/ + if (exp_fn_i_verify_schema((ArgExpect[]){ + {(int[]){DATA_T_STRING, -1}, EXP_ARG_NO_FLAGS}, + EXP_ARG_END, + }, tree) != 0) + { + mssErrorf(0, "EXP", "%s(?): Call does not match function schema.", tree->Name); + return -1; + } - int length1 = strlen(i0->String); - int length2 = strlen(i1->String); - - double clamped_dist = 1.0; - - if (length1 == 0 || length2 == 0) //empty string - { - clamped_dist = 0.5; - } - else //normal case - { - int max_len = (length1 > length2) ? length1 : length2; - clamped_dist = ((double) lev_dist) / max_len; + /** Allocate space to store metaphone pointers. **/ + char* primary = NULL; + char* secondary = NULL; - if (abs(length1-length2) == lev_dist) //only inserts. Maybe substring. - { - clamped_dist /= 2; - } - - //use max_field_width if it was provided as a sensible value. If not, don't use it. - double max_field_width = i2?(i2->Integer):0; - if (max_field_width && max_field_width >= max_len) { - double mod = (lev_dist + max_field_width * 3/4) / max_field_width; - if (mod < 1) { //don't make clamped_dist bigger - clamped_dist *= mod; - } - } - } + /** Extract string param. **/ + pExpression maybe_str = check_ptr(tree->Children.Items[0]); + if (maybe_str->Flags & EXPR_F_NULL) + { + tree->Flags |= EXPR_F_NULL; + tree->DataType = DATA_T_STRING; + return 0; + } + const char* str = check_ptr(maybe_str->String); + const size_t str_len = strlen(str); + if (str_len == 0u) + { + primary = ""; + secondary = ""; + goto store_data; + } + /** Compute DoubleMetaphone. **/ + meta_double_metaphone(str, &primary, &secondary); - tree->DataType = DATA_T_DOUBLE; - tree->Types.Double = 1.0 - clamped_dist; - return 0; -} - -// This is the size of the vector table. It is also used in calculating the table indices. -const int EXP_VECTOR_TABLE_SIZE = 251; - -/* - * hash_char_pair - * This method creates an vector table index based a given character pair. The characters are represented - * as their ASCII code points. - * - * Parameters: - * num1 : first ASCII code point (double) - * num2 : second ASCII code point (double) - * - * Returns: - * vector table index (integer) - */ -int exp_fn_i_hash_char_pair(double num1, double num2) - { - int func_result = round(((num1 * num1 * num1) + (num2 * num2 * num2)) * ((num1+1)/(num2+1))) -1; - return func_result % EXP_VECTOR_TABLE_SIZE; + /** Store the results. **/ + store_data:; + const size_t length = strlen(primary) + 1lu + strlen(secondary) + 1lu; + if (!check(exp_fn_i_alloc_result_string(tree, length))) return -1; + sprintf(tree->String, "%s%c%s", primary, CA_BOUNDARY_CHAR, secondary); + tree->DataType = DATA_T_STRING; + + return 0; } -/* - * exp_fn_i_frequency_table - * This method creates a vector frequency table based on a string of characters. - * - * Parameters: - * table : integer pointer to vector frequency table (unsigned short) - * term : the string of characters (char*) - * - * Returns: - * 0 - */ -int exp_fn_i_frequency_table(unsigned short *table, char *term) +/*** Computes cosine or Levenshtein similarity between two strings. These two + *** tasks have a large amount of overlapping logic (mostly error checking), + *** so doing them with one function greatly reduces code duplocation. + *** + *** @param tree The tree resulting from this function. + *** @param obj_list The evaluation "scope", including available variables. + *** @param fn_name Either `cos_compare()` or `lev_compare()`. + *** @returns 0 for success, -1 for failure. + ***/ +static int +exp_fn_compare(pExpression tree, pParamObjects obj_list, const char* fn_name) { - int i; - // Initialize hash table with 0 values - for (i = 0; i < EXP_VECTOR_TABLE_SIZE; i++) - { - table[i] = 0; - } - - int j = -1; - for(i = 0; i < strlen(term) + 1; i++) - { - // If latter character is punctuation or whitespace, skip it - if (ispunct(term[i]) || isspace(term[i])) + /** Verify function schema. **/ + if (exp_fn_i_verify_schema((ArgExpect[]){ + {(int[]){DATA_T_STRING, -1}, EXP_ARG_NO_FLAGS}, + {(int[]){DATA_T_STRING, -1}, EXP_ARG_NO_FLAGS}, + EXP_ARG_END, + }, tree) != 0) { - continue; - } - - double temp1 = 0.0; - double temp2 = 0.0; - - // If previous character is null - if (j == -1) - { - temp1 = 96; + mssErrorf(0, "EXP", "%s(?): Call does not match function schema.", fn_name); + return -1; } - - // Else character is not null - else + + /** Extract strings. **/ + pExpression maybe_str1 = check_ptr(tree->Children.Items[0]); + pExpression maybe_str2 = check_ptr(tree->Children.Items[1]); + if (maybe_str1->Flags & EXPR_F_NULL || maybe_str2->Flags & EXPR_F_NULL) { - temp1 = (int)tolower(term[j]); + tree->Flags |= EXPR_F_NULL; + tree->DataType = DATA_T_DOUBLE; + return 0; } - - // If latter character is null - if (i == strlen(term)) - { - temp2 = 96; + char* str1 = check_ptr(maybe_str1->String); + char* str2 = check_ptr(maybe_str2->String); + + /** Handle either cos_compare() or lev_compare(). **/ + if (fn_name[0] == 'c') + { /* cos_compare() */ + int ret; + + /** Build vectors. **/ + const pVector v1 = check_ptr(ca_build_vector(str1)); + const pVector v2 = check_ptr(ca_build_vector(str2)); + if (v1 == NULL || v2 == NULL) + { + mssErrorf(1, "EXP", + "%s(\"%s\", \"%s\"): Failed to build vectors.", + fn_name, str1, str2 + ); + ret = -1; + } + else + { + /** Compute the similarity. **/ + tree->Types.Double = ca_cos_compare(v1, v2); + tree->DataType = DATA_T_DOUBLE; + ret = 0; + } + + /** Clean up. **/ + if (v1 != NULL) ca_free_vector(v1); + if (v2 != NULL) ca_free_vector(v2); + return ret; } - - // Else character is not null else - { - temp2 = (int)tolower(term[i]); - } - - // Else character is not null // If either character is a number, reassign the code point - if (temp1 >= 48 && temp1 <= 57) - { - temp1 += 75; - } - - if (temp2 >= 48 && temp2 <= 57) - { - temp2 += 75; + { /* lev_compare() */ + double lev_sim = check_double(ca_lev_compare(str1, str2)); + if (isnan(lev_sim)) + { + mssErrorf(1, "EXP", "%s(\"%s\", \"%s\"): Failed to compute levenstein edit distance."); + return -1; + } + + /** Return the computed result. **/ + tree->Types.Double = lev_sim; + tree->DataType = DATA_T_DOUBLE; + return 0; } - - // Hash the character pair into an index - int index = exp_fn_i_hash_char_pair(temp1, temp2); - - // Increment Frequency Table value by number from 0 to 13 - table[index] += ((unsigned short)temp1 + (unsigned short)temp2) % 13 + 1; - - // Move j up to latter character before incrementing i - j = i; - - } - - return 0; - + + return -1; /* Unreachable. */ } -/* - * exp_fn_i_dot_product - * This method calculautes the dot product of two vectors. - * - * Parameters: - * dot_product : the place where the result is stored (double) - * r_freq_table1 : the first vector (unsigned short) - * r_freq_table2 : the second vector (unsigned short) - * - * Returns: - * 0 - */ -int exp_fn_i_dot_product(double *dot_product, unsigned short *r_freq_table1, unsigned short *r_freq_table2) +int +exp_fn_cos_compare(pExpression tree, pParamObjects obj_list) { - int i; - for (i = 0; i < EXP_VECTOR_TABLE_SIZE; i++) - { - *dot_product = *dot_product + ((double)r_freq_table1[i] * (double)r_freq_table2[i]); - } - return 0; + return exp_fn_compare(tree, obj_list, "cos_compare"); } -/* - * exp_fn_i_magnitude - * This method calculates the magnitude of a vector - * - * Parameters: - * magnitude : the place where the result is stored (double) - * r_freq_table : the vector (unsigned short) - */ -int exp_fn_i_magnitude(double *magnitude, unsigned short *r_freq_table) +int +exp_fn_lev_compare(pExpression tree, pParamObjects obj_list) { - int i; - for (i = 0; i < EXP_VECTOR_TABLE_SIZE; i++) - { - *magnitude = *magnitude + ((double)r_freq_table[i] * (double)r_freq_table[i]); - } - *magnitude = sqrt(*magnitude); - return 0; + return exp_fn_compare(tree, obj_list, "lev_compare"); } -/* - * exp_fn_cos_compare - * This method calculates the cosine similarity of two vector frequency tables - * See centrallix-sysdoc/string_comparison.md for more information. - * - * Parameters: - * tree : structure where output is stored - * objlist: - * i0 : first data entry (pExpression) - * i1 : second data entry (pExpression) - * i2 : - * - * Returns: - * 0 - */ -int exp_fn_cos_compare(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) +int +exp_fn_levenshtein(pExpression tree, pParamObjects obj_list) { - // Ensure function receives two non-null parameters - if (!i0 || !i1) - { - mssError(1,"EXP","cos_compare() requires two parameter."); - return -1; - } - - // Ensure value passed in both parameters is not null - if ((i0->Flags & EXPR_F_NULL) || (i1->Flags & EXPR_F_NULL)) - { - tree->DataType = DATA_T_DOUBLE; - tree->Flags |= EXPR_F_NULL; - return 0; - } - - // Ensure both parameters contain string values - if ((i0->DataType != DATA_T_STRING) || (i1->DataType != DATA_T_STRING)) - { - mssError(1,"EXP","cos_compare() requires two string parameters."); - return -1; - } - - //If the two strings are identical, don't bother running cosine compare - if (strcmp(i0->String, i1->String) == 0) - { - tree->DataType = DATA_T_DOUBLE; - tree->Types.Double = 1.0; - return 0; - } - - // Allocate frequency tables (arrays of integers) for each term - unsigned short *table1 = nmMalloc(EXP_VECTOR_TABLE_SIZE * sizeof(unsigned short)); - unsigned short *table2 = nmMalloc(EXP_VECTOR_TABLE_SIZE * sizeof(unsigned short)); - - if (table1 == NULL || table2 == NULL) - { - mssError(1,"EXP","Memory allocation failed."); - return -1; - } - - // Calculate frequency tables for each term - exp_fn_i_frequency_table(table1, i0->String); - exp_fn_i_frequency_table(table2, i1->String); + /** Verify function schema. **/ + if (exp_fn_i_verify_schema((ArgExpect[]){ + {(int[]){DATA_T_STRING, -1}, EXP_ARG_NO_FLAGS}, + {(int[]){DATA_T_STRING, -1}, EXP_ARG_NO_FLAGS}, + EXP_ARG_END, + }, tree) != 0) + { + mssErrorf(0, "EXP", "%s(?): Call does not match function schema.", tree->Name); + return -1; + } + + /** Extract strings. **/ + pExpression maybe_str1 = check_ptr(tree->Children.Items[0]); + pExpression maybe_str2 = check_ptr(tree->Children.Items[1]); + if (maybe_str1->Flags & EXPR_F_NULL || maybe_str2->Flags & EXPR_F_NULL) + { + tree->Flags |= EXPR_F_NULL; + tree->DataType = DATA_T_INTEGER; + return 0; + } + char* str1 = check_ptr(maybe_str1->String); + char* str2 = check_ptr(maybe_str2->String); + + /** Compute edit distance. **/ + /** Length 0 is provided for both strings so that the function will compute it for us. **/ + int edit_dist = ca_edit_dist(str1, str2, 0lu, 0lu); + if (!check_neg(edit_dist)) + { + mssErrorf(1, "EXP", "%s(\"%s\", \"%s\"): Failed to compute edit distance.\n", tree->Name, str1, str2); + return -1; + } - // Calculate dot product - double dot_product = 0; - exp_fn_i_dot_product(&dot_product, table1, table2); - - // Calculate magnitudes of each relative frequency vector - double magnitude1 = 0; - double magnitude2 = 0; - exp_fn_i_magnitude(&magnitude1, table1); - exp_fn_i_magnitude(&magnitude2, table2); + /** Return the computed distance. **/ + tree->Integer = edit_dist; + tree->DataType = DATA_T_INTEGER; - tree->DataType = DATA_T_DOUBLE; - tree->Types.Double = dot_product / (magnitude1 * magnitude2); - nmFree(table1, EXP_VECTOR_TABLE_SIZE * sizeof(unsigned short)); - nmFree(table2, EXP_VECTOR_TABLE_SIZE * sizeof(unsigned short)); - return 0; } @@ -4351,7 +4738,7 @@ int exp_fn_cos_compare(pExpression tree, pParamObjects objlist, pExpression i0, * Parameters: * pExpression tree: * pParamObjects: - * pExpression passowrd: The password, passed as a pExpression + * pExpression password: The password, passed as a pExpression * pExpression salt: The salt, passed as a pExpression * * returns: @@ -4466,7 +4853,9 @@ int exp_fn_argon2id(pExpression tree, pParamObjects objlist, pExpression passwor int exp_internal_DefineFunctions() { - + /** Initialize clustering library. **/ + ca_init(); + /** Function list for EXPR_N_FUNCTION nodes **/ xhAdd(&EXP.Functions, "getdate", (char*)exp_fn_getdate); xhAdd(&EXP.Functions, "user_name", (char*)exp_fn_user_name); @@ -4485,6 +4874,7 @@ int exp_internal_DefineFunctions() xhAdd(&EXP.Functions, "ltrim", (char*)exp_fn_ltrim); xhAdd(&EXP.Functions, "lztrim", (char*)exp_fn_lztrim); xhAdd(&EXP.Functions, "rtrim", (char*)exp_fn_rtrim); + xhAdd(&EXP.Functions, "trim", (char*)exp_fn_trim); xhAdd(&EXP.Functions, "substring", (char*)exp_fn_substring); xhAdd(&EXP.Functions, "right", (char*)exp_fn_right); xhAdd(&EXP.Functions, "ralign", (char*)exp_fn_ralign); @@ -4519,8 +4909,11 @@ int exp_internal_DefineFunctions() xhAdd(&EXP.Functions, "hash", (char*)exp_fn_hash); xhAdd(&EXP.Functions, "hmac", (char*)exp_fn_hmac); xhAdd(&EXP.Functions, "log10", (char*)exp_fn_log10); + xhAdd(&EXP.Functions, "ln", (char*)exp_fn_ln); + xhAdd(&EXP.Functions, "log", (char*)exp_fn_log); xhAdd(&EXP.Functions, "power", (char*)exp_fn_power); xhAdd(&EXP.Functions, "pbkdf2", (char*)exp_fn_pbkdf2); + xhAdd(&EXP.Functions, "metaphone", (char*)exp_fn_metaphone); xhAdd(&EXP.Functions, "levenshtein", (char*)exp_fn_levenshtein); xhAdd(&EXP.Functions, "lev_compare", (char*)exp_fn_lev_compare); xhAdd(&EXP.Functions, "cos_compare", (char*)exp_fn_cos_compare); @@ -4535,7 +4928,7 @@ int exp_internal_DefineFunctions() xhAdd(&EXP.Functions, "row_number", (char*)exp_fn_row_number); xhAdd(&EXP.Functions, "dense_rank", (char*)exp_fn_dense_rank); xhAdd(&EXP.Functions, "lag", (char*)exp_fn_lag); - + /** Aggregate **/ xhAdd(&EXP.Functions, "count", (char*)exp_fn_count); xhAdd(&EXP.Functions, "avg", (char*)exp_fn_avg); @@ -4545,9 +4938,9 @@ int exp_internal_DefineFunctions() xhAdd(&EXP.Functions, "first", (char*)exp_fn_first); xhAdd(&EXP.Functions, "last", (char*)exp_fn_last); xhAdd(&EXP.Functions, "nth", (char*)exp_fn_nth); - + /** Reverse functions **/ xhAdd(&EXP.ReverseFunctions, "isnull", (char*)exp_fn_reverse_isnull); - + return 0; } diff --git a/centrallix/include/cxss/policy.h b/centrallix/include/cxss/policy.h index aeee11ce..6f9ca7d8 100644 --- a/centrallix/include/cxss/policy.h +++ b/centrallix/include/cxss/policy.h @@ -2,6 +2,7 @@ #define _CXSS_POLICY_H #include "cxss/cxss.h" +#include "obj.h" /************************************************************************/ /* Centrallix Application Server System */ @@ -89,4 +90,3 @@ typedef struct _CXSSPOL CxssPolicy, *pCxssPolicy; #endif /* defined _CXSS_POLICY_H */ - diff --git a/centrallix/include/double_metaphone.h b/centrallix/include/double_metaphone.h new file mode 100644 index 00000000..75719586 --- /dev/null +++ b/centrallix/include/double_metaphone.h @@ -0,0 +1,83 @@ +#ifndef DOUBLE_METAPHONE_H +#define DOUBLE_METAPHONE_H + +/************************************************************************/ +/* Text-DoubleMetaphone */ +/* Centrallix Base Library */ +/* */ +/* Copyright 2000, Maurice Aubrey . */ +/* All rights reserved. */ +/* */ +/* This code is copied for redistribution with modification, from the */ +/* gitpan/Text-DoubleMetaphone implementation on GitHub (1), which is */ +/* under the following license. */ +/* */ +/* This code is based heavily on the C++ implementation by Lawrence */ +/* Philips and incorporates several bug fixes courtesy of Kevin */ +/* Atkinson . */ +/* */ +/* This module is free software; you may redistribute it and/or */ +/* modify it under the same terms as Perl itself. */ +/* */ +/* A summary of the relevant content from https://dev.perl.org/licenses */ +/* has been included below for the convenience of the reader. This */ +/* information was collected and saved on September 5th, 2025 and may */ +/* differ from current information. For the most up to date copy of */ +/* this information, please use the link provided above. */ +/* */ +/* Perl5 is Copyright © 1993 and later, by Larry Wall and others. */ +/* */ +/* It is free software; you can redistribute it and/or modify it */ +/* under the terms of either: */ +/* */ +/* a) the GNU General Public License (2) as published by the Free */ +/* Software Foundation (3); either version 1 (2), or (at your */ +/* option) any later version (4), or */ +/* */ +/* b) the "Artistic License" (5). */ +/* */ +/* Citations: */ +/* 1: https://github.com/gitpan/Text-meta_double_metaphone */ +/* 2: https://dev.perl.org/licenses/gpl1.html */ +/* 3: http://www.fsf.org */ +/* 4: http://www.fsf.org/licenses/licenses.html#GNUGPL */ +/* 5: https://dev.perl.org/licenses/artistic.html */ +/* */ +/* Centrallix is published under the GNU General Public License, */ +/* satisfying the above requirement. A summary of this is included */ +/* below for the convenience of the reader. */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA */ +/* 02111-1307 USA */ +/* */ +/* A copy of the GNU General Public License has been included in this */ +/* distribution in the file "COPYING". */ +/* */ +/* Module: double_metaphone.c, double_metaphone.h */ +/* Author: Maurice Aubrey and Israel Fuller */ +/* Description: This module implements a "sounds like" algorithm by */ +/* Lawrence Philips which he published in the June, 2000 */ +/* issue of C/C++ Users Journal. Double Metaphone is an */ +/* improved version of the original Metaphone algorithm */ +/* written by Philips'. This implementaton was written by */ +/* Maurice Aubrey for C/C++ with bug fixes provided by */ +/* Kevin Atkinson. It was revised by Israel Fuller to */ +/* better align with the Centrallix coding style and */ +/* standards so that it could be included here. */ +/************************************************************************/ + +void meta_double_metaphone(const char* str, char** primary_code, char** secondary_code); + +#endif /* End of .h file. */ diff --git a/centrallix/include/obj.h b/centrallix/include/obj.h index 0db1ea3e..7015b05e 100644 --- a/centrallix/include/obj.h +++ b/centrallix/include/obj.h @@ -307,7 +307,7 @@ typedef struct _OA #define OBJ_INFO_F_NO_CONTENT (1<<13) /* object does not have content, objRead() would fail */ #define OBJ_INFO_F_SUPPORTS_INHERITANCE (1<<14) /* object can support inheritance attr cx__inherit, etc. */ #define OBJ_INFO_F_FORCED_LEAF (1<<15) /* object is forced to be a 'leaf' unless ls__type used. */ -#define OBJ_INFO_F_TEMPORARY (1<<16) /* this is a temporary object without a vaoid pathname. */ +#define OBJ_INFO_F_TEMPORARY (1<<16) /* this is a temporary object without a valid pathname. */ /** object virtual attribute - these are attributes which persist only while @@ -735,6 +735,8 @@ void obj_internal_OpenCtlToString(pPathname pathinfo, int pathstart, int pathend int obj_internal_PathToText(pPathname pathinfo, int pathend, pXString str); /** objectsystem datatype functions **/ +int objTypeFromStr(const char* str); +char* objTypeToStr(const int type); int objDataToString(pXString dest, int data_type, void* data_ptr, int flags); double objDataToDouble(int data_type, void* data_ptr); int objDataToInteger(int data_type, void* data_ptr, char* format); diff --git a/centrallix/include/stparse.h b/centrallix/include/stparse.h index b19a77cc..2f7e4163 100644 --- a/centrallix/include/stparse.h +++ b/centrallix/include/stparse.h @@ -46,7 +46,7 @@ typedef struct _SI int Magic; int LinkCnt; char* Name; /* name of attrib or group */ - char* UsrType; /* type of group, null if attrib */ + char* UsrType; /* type of group (e.g. "system/object"), null if attrib */ pExpression Value; /* value; EXPR_N_LIST if several listed */ struct _SI* Parent; /* Parent inf, null if toplevel */ struct _SI** SubInf; /* List of attrs/groups included */ diff --git a/centrallix/multiquery/multiquery.c b/centrallix/multiquery/multiquery.c index 89736275..438c011f 100644 --- a/centrallix/multiquery/multiquery.c +++ b/centrallix/multiquery/multiquery.c @@ -1046,11 +1046,14 @@ mq_internal_ParseSelectItem(pQueryStructure item_qs, pLxSession lxs) n_tok = 0; while(1) { + /** Get the next token. **/ t = mlxNextToken(lxs); if (t == MLX_TOK_ERROR || t == MLX_TOK_EOF) break; n_tok++; - if ((t == MLX_TOK_RESERVEDWD || t == MLX_TOK_COMMA || t == MLX_TOK_SEMICOLON) && parenlevel <= 0) + + /** Special handling for certain token types. **/ + if ((t == MLX_TOK_COMMA || t == MLX_TOK_SEMICOLON) && parenlevel <= 0) break; if (t == MLX_TOK_OPENPAREN) parenlevel++; @@ -1061,9 +1064,19 @@ mq_internal_ParseSelectItem(pQueryStructure item_qs, pLxSession lxs) break; } - /** Copy it to the raw data **/ + /** Get the token string. **/ ptr = mlxStringVal(lxs,NULL); if (!ptr) break; + + /** Skip all reserved words except log(). **/ + if (t == MLX_TOK_RESERVEDWD && parenlevel <= 0) + { + /** Treat "log" as a keyword to allow the log function to be handled properly. **/ + if (strcmp(ptr, "log") == 0) t = MLX_TOK_KEYWORD; + else break; + }; + + /** Copy the token string into item_qs->RawData. **/ if (t == MLX_TOK_STRING) xsConcatQPrintf(&item_qs->RawData, "%STR&DQUOT", ptr); else @@ -2086,6 +2099,7 @@ mq_internal_SyntaxParse(pLxSession lxs, pQueryStatement stmt, int allow_empty, p mssError(1,"MQ","Expected equals after EXEC parameter"); mlxNoteError(lxs); xsFree(xs); + xs = NULL; break; } @@ -2098,6 +2112,7 @@ mq_internal_SyntaxParse(pLxSession lxs, pQueryStatement stmt, int allow_empty, p mssError(1,"MQ","Error in EXEC parameter"); mlxNoteError(lxs); xsFree(xs); + xs = NULL; xsFree(param); break; } @@ -2108,6 +2123,7 @@ mq_internal_SyntaxParse(pLxSession lxs, pQueryStatement stmt, int allow_empty, p mssError(1,"MQ","Could not evaluate EXEC parameter"); mlxNoteError(lxs); xsFree(xs); + xs = NULL; xsFree(param); break; } @@ -2120,8 +2136,11 @@ mq_internal_SyntaxParse(pLxSession lxs, pQueryStatement stmt, int allow_empty, p } } - strtcpy(new_qs->Source, xs->String, sizeof(new_qs->Source)); - next_state = LookForClause; + if (xs != NULL) + { + strtcpy(new_qs->Source, xs->String, sizeof(new_qs->Source)); + next_state = LookForClause; + } } else { @@ -4774,5 +4793,3 @@ mqInitialize() return 0; } - - diff --git a/centrallix/objectsystem/obj_datatypes.c b/centrallix/objectsystem/obj_datatypes.c index c674fe3d..388a75e6 100644 --- a/centrallix/objectsystem/obj_datatypes.c +++ b/centrallix/objectsystem/obj_datatypes.c @@ -127,6 +127,93 @@ char* obj_default_money_fmt = "$0.00"; char* obj_default_null_fmt = "NULL"; +/** Should maybe replace current type parsing in the presentation hints. **/ +/*** Parse the given string into a datatype. The case of the first character + *** is ignored, but all other characters must be capitalized correctly. + *** + *** @attention - This function is optimized to prevent performance hits + *** situations where it may need to be called many thousands of times. + *** + *** @param str The string to be parsed to a datatype. + *** @returns The datatype. + *** + *** LINK ../../centrallix-lib/include/datatypes.h:72 + ***/ +int +objTypeFromStr(const char* str) + { + /** All valid types are non-null strings, at least 2 characters long. **/ + if (str == NULL || str[0] == '\0' || str[1] == '\0') return -1; + + /** Check type. **/ + switch (str[0]) + { + case 'A': case 'a': + if (strcmp(str+1, "Array"+1) == 0) return DATA_T_ARRAY; + if (strcmp(str+1, "Any"+1) == 0) return DATA_T_ANY; + break; + + case 'B': case 'b': + if (strcmp(str+1, "Binary"+1) == 0) return DATA_T_BINARY; + break; + + case 'C': case 'c': + if (strcmp(str+1, "Code"+1) == 0) return DATA_T_CODE; + break; + + case 'D': case 'd': + if (strcmp(str+1, "Double"+1) == 0) return DATA_T_DOUBLE; + if (strcmp(str+1, "DateTime"+1) == 0) return DATA_T_DATETIME; + break; + + case 'I': case 'i': + if (strcmp(str+1, "Integer"+1) == 0) return DATA_T_INTEGER; + if (strcmp(str+1, "IntVector"+1) == 0) return DATA_T_INTVEC; + break; + + case 'M': case 'm': + if (strcmp(str+1, "Money"+1) == 0) return DATA_T_MONEY; + break; + + case 'S': case 's': + if (strcmp(str+1, "String"+1) == 0) return DATA_T_STRING; + if (strcmp(str+1, "StringVector"+1) == 0) return DATA_T_STRINGVEC; + break; + + case 'U': case 'u': + if (strcmp(str+1, "Unknown"+1) == 0) return DATA_T_UNAVAILABLE; + if (strcmp(str+1, "Unavailable"+1) == 0) return DATA_T_UNAVAILABLE; + break; + } + + /** Invalid type. **/ + return -1; + } + + +/*** Convert a type to its string name. + *** + *** @param type The type to be converted. + *** @returns A char* to the type name, or + *** "(unknown)" if the type is unknown, or + *** "invalid" if the type number cannot even be a valid type. + ***/ +char* +objTypeToStr(const int type) + { + /** Guard out of bounds reads. **/ + if (type < 0 || OBJ_TYPE_NAMES_CNT <= type) + { + /** Invalid type. **/ + mssErrorf(1, "Cluster", "Invalid type %d.\n", type); + + return "invalid"; /* Shall not parse to a valid type in ci_TypeFromStr(). */ + } + + return obj_type_names[type]; + } + + /*** obj_internal_ParseDateLang - looks up a list of language internationalization *** strings inside the date format. WARNING - modifies the "srcptr" data in *** place. diff --git a/centrallix/objectsystem/obj_query.c b/centrallix/objectsystem/obj_query.c index 9b64f241..c4dff40a 100644 --- a/centrallix/objectsystem/obj_query.c +++ b/centrallix/objectsystem/obj_query.c @@ -414,7 +414,6 @@ objQueryFetch(pObjQuery this, int mode) { pObject obj = NULL; void* obj_data; - char* name; char buf[OBJSYS_MAX_PATH + 32]; pObjQuerySortItem sort_item; int rval; @@ -529,14 +528,6 @@ objQueryFetch(pObjQuery this, int mode) goto error; } obj->Data = obj_data; - - this->Obj->Driver->GetAttrValue(obj_data, "name", DATA_T_STRING, &name, NULL); - if (strlen(name) + strlen(this->Obj->Pathname->Pathbuf) + 2 > OBJSYS_MAX_PATH) - { - mssError(1,"OSML","Filename in query result exceeded internal limits"); - OSMLDEBUG(OBJ_DEBUG_F_APITRACE, " null\n"); - goto error; - } /** If we need to check it, do so now. **/ if (!(this->Flags & OBJ_QY_F_FULLQUERY) && this->Tree) @@ -778,4 +769,3 @@ objGetQueryIdentityPath(pObjQuery this, char* pathbuf, int maxlen) return 0; } - diff --git a/centrallix/osdrivers/objdrv_cluster.c b/centrallix/osdrivers/objdrv_cluster.c new file mode 100644 index 00000000..b525dd9a --- /dev/null +++ b/centrallix/osdrivers/objdrv_cluster.c @@ -0,0 +1,4742 @@ +/************************************************************************/ +/* Centrallix Application Server System */ +/* Centrallix Core */ +/* */ +/* Copyright (C) 1998-2012 LightSys Technology Services, Inc. */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA */ +/* 02111-1307 USA */ +/* */ +/* A copy of the GNU General Public License has been included in this */ +/* distribution in the file "COPYING". */ +/* */ +/* Module: objdrv_cluster.c */ +/* Author: Israel Fuller */ +/* Creation: September 17, 2025 */ +/* Description: Cluster object driver. */ +/************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cxlib/clusters.h" +#include "cxlib/mtsession.h" +#include "cxlib/newmalloc.h" +#include "cxlib/util.h" +#include "cxlib/xarray.h" +#include "cxlib/xhash.h" +#include "expression.h" +#include "obj.h" +#include "param.h" +#include "st_node.h" +#include "stparse.h" + +/*** File notes: + *** This file uses comment anchors, provided by the Comment Anchors VSCode + *** extension from Starlane Studios. This allows developers with the extension + *** to control click the "LINK " comments to navigate to the coresponding + *** "ANCHOR[id=]" comment. (Note: Invalid or broken links will default to + *** the first line of the file.) + *** + *** For example, this link should take you to the function signatures: + *** LINK #functions + *** + *** Any developers without this extension can safely ignore these comments, + *** although please try not to break them. :) + *** + *** Comment Anchors VSCode Extension: + *** https://marketplace.visualstudio.com/items?itemName=ExodiusStudios.comment-anchors + ***/ + +/** Defaults for unspecified optional attributes. **/ +#define DEFAULT_MIN_IMPROVEMENT 0.0001 +#define DEFAULT_MAX_ITERATIONS 64u + +/** ================ Stuff That Should Be Somewhere Else ================ **/ +/** ANCHOR[id=temp] **/ + +/** TODO: Greg - I think this should be moved to mtsession. **/ +/*** I caused at least 10 bugs early in the project trying to pass format + *** specifiers to mssError() without realizing that it didn't support them. + *** Eventually, I got fed up enough having to write errors to a sting buffer + *** and passing that buffer to mssError(), so I wrote this wrapper that does + *** it for me. Adding this behavior to mssError() would be better, though. + ***/ +/*** Displays error text to the user. Does not print a stack trace. Does not + *** exit the program, allowing for the calling function to fail, generating + *** an error cascade which may be useful to the user since a stack trace is + *** not readily available. + *** + *** @param clr Whether to clear the current error stack. As a rule of thumb, + *** if you are the first one to detect the error, clear the stack so that + *** other unrelated messages are not shown. If you are detecting an error + *** from another function that may also call an mssError() function, do + *** not clear the stack. + *** @param module The name or abbreviation of the module in which this + *** function is being called, to help developers narrow down the location + *** of the error. + *** @param format The format text for the error, which accepts any format + *** specifier that would be accepted by printf(). + *** @param ... Variables matching format specifiers in the format. + *** @returns Nothing, always succeeds. + ***/ +void +mssErrorf(int clr, char* module, const char* format, ...) + { + /** Prevent interlacing with stdout flushing at a weird time. **/ + check(fflush(stdout)); /* Failure ignored. */ + + /** Insert convenient newline before error stack begins. **/ + if (clr == 1) fprintf(stderr, "\n"); + + /** Process the format with all the same rules as printf(). **/ + char buf[BUFSIZ]; + va_list args; + va_start(args, format); + const int num_chars = vsnprintf(buf, sizeof(buf), format, args); + va_end(args); + + /** Error check vsnprintf, just to be safe. **/ + if (num_chars < 0) + { + perror("vsnprintf() failed"); + fprintf(stderr, "FAIL: mssErrorf(%d, \"%s\", \"%s\", ...)\n", clr, module, format); + return; + } + if (num_chars > BUFSIZ) + fprintf(stderr, "Warning: Error truncated (length %d > buffer size %d).\n", num_chars, BUFSIZ); + + /** Print the error. **/ + const int ret = mssError(clr, module, "%s", buf); + + /** Not sure why you have to error check the error function... **/ + if (ret != 0) fprintf(stderr, "FAIL %d: mssError(%d, \"%s\", \"%%s\", \"%s\")\n", ret, clr, module, buf); + + return; + } + + +/** TODO: Greg - I think this should be moved to xarray. **/ +/*** Trims an xArray, returning a new array (with nmSysMalloc). + *** + *** @param arr The array to be trimmed. + *** @param cleanup 0: No clean up. + *** 1: DeInit arr. + *** 2: Free arr. + *** *: Any other value prints a warning and does nothing. + *** @returns The new array, or null if and only if the passed pXArray has 0 items. + ***/ +static void** +ci_xaToTrimmedArray(pXArray arr, int array_handling) + { + const size_t arr_size = arr->nItems * sizeof(void*); + void** result = check_ptr(nmSysMalloc(arr_size)); + if (result == NULL) return NULL; + memcpy(result, arr->Items, arr_size); + + /** Handle the array. **/ + switch (array_handling) + { + case 0: break; + case 1: check(xaDeInit(arr)); arr->nAlloc = 0; break; /* Failure ignored. */ + case 2: check(xaFree(arr)); break; /* Failure ignored. */ + default: + /** Uh oh, there might be a memory leak... **/ + fprintf(stderr, + "Warning: ci_xaToTrimmedArray(%p, %d) - Unknown value (%d) for array_handling.\n", + arr, array_handling, array_handling + ); + break; + } + + return result; + } + +/** I got tired of forgetting how to do these. **/ +#define ci_file_name(obj) \ + ({ \ + __typeof__ (obj) _obj = (obj); \ + obj_internal_PathPart(_obj->Pathname, _obj->SubPtr - 1, 1); \ + }) +#define ci_file_path(obj) \ + ({ \ + __typeof__ (obj) _obj = (obj); \ + obj_internal_PathPart(_obj->Pathname, 0, _obj->SubPtr); \ + }) + + +/** ================ Enum Declarations ================ **/ +/** ANCHOR[id=enums] **/ + +/** Enum representing a clustering algorithm. **/ +typedef unsigned char ClusterAlgorithm; +#define ALGORITHM_NULL (ClusterAlgorithm)0u +#define ALGORITHM_NONE (ClusterAlgorithm)1u +#define ALGORITHM_SLIDING_WINDOW (ClusterAlgorithm)2u +#define ALGORITHM_KMEANS (ClusterAlgorithm)3u +#define ALGORITHM_KMEANS_PLUS_PLUS (ClusterAlgorithm)4u +#define ALGORITHM_KMEDOIDS (ClusterAlgorithm)5u +#define ALGORITHM_DB_SCAN (ClusterAlgorithm)6u + +#define nClusteringAlgorithms 7u +ClusterAlgorithm ALL_CLUSTERING_ALGORITHMS[nClusteringAlgorithms] = + { + ALGORITHM_NULL, + ALGORITHM_NONE, + ALGORITHM_SLIDING_WINDOW, + ALGORITHM_KMEANS, + ALGORITHM_KMEANS_PLUS_PLUS, + ALGORITHM_KMEDOIDS, + ALGORITHM_DB_SCAN, + }; + +/** Converts a clustering algorithm to its string name. **/ +char* +ci_ClusteringAlgorithmToString(ClusterAlgorithm clustering_algorithm) + { + switch (clustering_algorithm) + { + case ALGORITHM_NULL: return "NULL algorithm"; + case ALGORITHM_NONE: return "none"; + case ALGORITHM_SLIDING_WINDOW: return "sliding-window"; + case ALGORITHM_KMEANS: return "k-means"; + case ALGORITHM_KMEANS_PLUS_PLUS: return "k-means++"; + case ALGORITHM_KMEDOIDS: return "k-medoids"; + case ALGORITHM_DB_SCAN: return "db-scan"; + default: return "Unknown algorithm"; + } + + return; /** Unreachable. **/ + } + +/** Enum representing a similarity measurement algorithm. **/ +typedef unsigned char SimilarityMeasure; +#define SIMILARITY_NULL (SimilarityMeasure)0u +#define SIMILARITY_COSINE (SimilarityMeasure)1u +#define SIMILARITY_LEVENSHTEIN (SimilarityMeasure)2u + +#define nSimilarityMeasures 3u +SimilarityMeasure ALL_SIMILARITY_MEASURES[nSimilarityMeasures] = + { + SIMILARITY_NULL, + SIMILARITY_COSINE, + SIMILARITY_LEVENSHTEIN, + }; + +/** Converts a similarity measure to its string name. **/ +char* +ci_SimilarityMeasureToString(SimilarityMeasure similarity_measure) + { + switch (similarity_measure) + { + case SIMILARITY_NULL: return "NULL similarity measure"; + case SIMILARITY_COSINE: return "cosine"; + case SIMILARITY_LEVENSHTEIN: return "levenshtein"; + default: return "Unknown similarity measure"; + } + + return; /** Unreachable. **/ + } + +/*** Enum representing the type of data targetted by the driver, + *** set based on the path given when the driver is used to open + *** a cluster file. + *** + *** `0u` is reserved for a possible `NULL` value in the future. + *** However, there is currently no allowed `NULL` TargetType. + ***/ +typedef unsigned char TargetType; +#define TARGET_NODE (TargetType)1u +#define TARGET_CLUSTER (TargetType)2u +#define TARGET_SEARCH (TargetType)3u +#define TARGET_CLUSTER_ENTRY (TargetType)4u +#define TARGET_SEARCH_ENTRY (TargetType)5u + +/** Attribute name lists by TargetType. **/ +#define END_OF_ARRAY NULL +char* const ATTR_ROOT[] = + { + "source", + "attr_name", + "date_created", + "date_computed", + END_OF_ARRAY, + }; +char* const ATTR_CLUSTER[] = + { + "algorithm", + "similarity_measure", + "num_clusters", + "min_improvement", + "max_iterations", + "date_created", + "date_computed", + END_OF_ARRAY, + }; +char* const ATTR_SEARCH[] = + { + "source", + "threshold", + "similarity_measure", + END_OF_ARRAY, + }; +char* const ATTR_CLUSTER_ENTRY[] = + { + "items", + "date_created", + "date_computed", + END_OF_ARRAY, + }; +char* const ATTR_SEARCH_ENTRY[] = + { + "key1", + "key2", + "sim", + END_OF_ARRAY, + }; + +/** Method name list. **/ +char* const METHOD_NAMES[] = + { + "cache", + "stat", + END_OF_ARRAY, + }; + + +/** ================ Struct Declarations ================ **/ +/** ANCHOR[id=structs] **/ + +/*** Represents the data source which may have data already fetched. + *** + *** Memory Stats: + *** - Padding: 4 bytes + *** - Total size: 80 bytes + *** + *** @skip --> Attribute Data. + *** @param Name The source name, specified in the .cluster file. + *** @param Key The key associated with this object in the SourceDataCache. + *** @param SourcePath The path to the data source from which to retrieve data. + *** @param KeyAttr The name of the attribute to use when getting keys from + *** the SourcePath. + *** @param NameAttr The name of the attribute to use when getting data from + *** the SourcePath. + *** + *** @skip --> Computed data. + *** @param Strings The keys for each data string strings received from the + *** database, allowing them to be lined up again when queried. + *** @param Strings The data strings to be clustered and searched, or NULL if + *** they have not been fetched from the source. + *** @param Vectors The cosine comparison vectors from the fetched data, or + *** NULL if they haven't been computed. Note that vectors are no longer + *** needed once all clusters and searches have been computed, so they are + *** automatically freed in that case to save memory. + *** @param nVectors The number of vectors and data strings. + *** + *** @skip --> Time. + *** @param DateCreated The date and time that this object was created and initialized. + *** @param DateComputed The date and time that the computed attributes were computed. + ***/ +typedef struct _SOURCE + { + char* Name; + char* Key; + char* SourcePath; + char* KeyAttr; + char* NameAttr; + char** Keys; + char** Strings; + pVector* Vectors; + unsigned int nVectors; + DateTime DateCreated; + DateTime DateComputed; + } + SourceData, *pSourceData; + + +/*** Computed data for a single cluster. + *** + *** Memory Stats: + *** - Padding: 4 bytes + *** - Total size: 24 bytes + *** + *** @param Size The number of items in the cluster. + *** @param Strings The string values of each item. + *** @param Vectors The cosine vectors for each item. + ***/ +typedef struct + { + unsigned int Size; + char** Strings; + pVector* Vectors; + } + Cluster, *pCluster; + + +/*** Data for each cluster. Only attribute data is checked for caching. + *** + *** Memory Stats: + *** - Padding: 2 bytes + *** - Total size: 96 bytes + *** + *** @skip --> Attribute Data. + *** @param Name The cluster name, specified in the .cluster file. + *** @param Key The key associated with this object in the ClusterDataCache. + *** @param ClusterAlgorithm The clustering algorithm to be used. + *** @param SimilarityMeasure The similarity measure used to compare items. + *** @param nClusters The number of clusters. 1 if algorithm = none. + *** @param MinImprovement The minimum amount of improvement that must be met + *** each clustering iteration. If there is less improvement, the algorithm + *** will stop. The "max" in a .cluster file is represented by -inf. + *** @param MaxIterations The maximum number of iterations that a clustering + *** algorithm can run for. Note: Sliding window uses this attribute to store + *** the window_size. + *** + *** @skip --> Relationship Data. + *** @param nSubClusters The number of subclusters of this cluster. + *** @param SubClusters A pClusterData array, NULL if nSubClusters == 0. + *** @param Parent This cluster's parent. NULL if it is not a subcluster. + *** @param SourceData Pointer to the source data that this cluster uses. + *** + *** @skip --> Computed data. + *** @param Clusters An array of length num_clusters, NULL if the clusters + *** have not yet been computed. + *** @param Sims An array of num_vectors elements, where index i stores the + *** similarity of vector i to its assigned cluster. This attribute is NULL + *** if the clusters have not yet been computed. + *** + *** @skip --> Time. + *** @param DateCreated The date and time that this object was created and initialized. + *** @param DateComputed The date and time that the computed attributes were computed. + ***/ +typedef struct _CLUSTER + { + char* Name; + char* Key; + ClusterAlgorithm ClusterAlgorithm; + SimilarityMeasure SimilarityMeasure; + unsigned int nClusters; + double MinImprovement; + unsigned int MaxIterations; + unsigned int nSubClusters; + struct _CLUSTER** SubClusters; + struct _CLUSTER* Parent; + pSourceData SourceData; + Cluster* Clusters; + double* Sims; + DateTime DateCreated; + DateTime DateComputed; + } + ClusterData, *pClusterData; + + +/*** Data for each search. + *** + *** Memory Stats: + *** - Padding: 3 bytes + *** - Total size: 64 bytes + *** + *** @skip --> Attribute Data. + *** @param Name The search name, specified in the .cluster file. + *** @param Key The key associated with this object in the SearchDataCache. + *** @param Source The cluster from which this search is to be derived. + *** @param SimilarityMeasure The similarity measure used to compare items. + *** @param Threshold The minimum similarity threshold for elements to be + *** included in the results of the search. + *** + *** @skip --> Computed data. + *** @param Dups An array holding the dups found by the search, or NULL if the + *** search has not been computed. + *** @param nDups The number of dups found. + *** + *** @skip --> Time. + *** @param DateCreated The date and time that this object was created and initialized. + *** @param DateComputed The date and time that the computed attributes were computed. + ***/ +typedef struct _SEARCH + { + char* Name; + char* Key; + pClusterData SourceCluster; + double Threshold; + pDup* Dups; + unsigned int nDups; + SimilarityMeasure SimilarityMeasure; + DateTime DateCreated; + DateTime DateComputed; + } + SearchData, *pSearchData; + + +/*** Node instance data. + *** + *** Memory Stats: + *** - Padding: 0 bytes + *** - Total size: 64 bytes + *** + *** @note When a .cluster file is openned, there will be only one node for that + *** file. However, in the course of the query, many driver instance structs + *** may be created by functions like clusterQueryFetch(), and closed by the + *** object system using clusterClose(). + *** + *** @param SourceData Data from the provided source. + *** @param Params A pParam array storing the params in the .cluster file. + *** @param nParams The number of specified params. + *** @param ParamList A "scope" for resolving parameter values during parsing. + *** @param ClusterDatas A pCluster array for the clusters in the .cluster file. + *** Will be NULL if `nClusters = 0`. + *** @param nClusterDatas The number of specified clusters. + *** @param SearchDatas A SearchData array for the searches in the .cluster file. + *** @param nSearches The number of specified searches. + *** @param nSearchDatas The parent object used to open this NodeData instance. + *** @param OpenCount The number of open driver instances that are using the + *** NodeData struct. When this reaches 0, the struct should be freed. + ***/ +typedef struct _NODE + { + pObject Parent; + pParam* Params; + pParamObjects ParamList; + pSourceData SourceData; + pClusterData* ClusterDatas; + pSearchData* SearchDatas; + unsigned int OpenCount; + unsigned int nParams; + unsigned int nClusterDatas; + unsigned int nSearchDatas; + } + NodeData, *pNodeData; + +/*** Driver instance data. + *** + *** Memory Stats: + *** - Padding: 1 bytes + *** - Total size: 24 bytes + *** + *** This struct can be thought of like a "pointer" to specific data accessible + *** through the stored pNodeData struct. This struct also communicates whether + *** that data is guaranteed to have been computed. + *** + *** For example, if target type is the root, a cluster, or a search, no data + *** is guaranteed to be computed. These three types can be returned from + *** clusterOpen(), based on the provided path. + *** + *** Alternatively, a cluster entry or search entry can be targetted by calling + *** fetch on a query pointing to a driver instance that targets a cluster or + *** search (respectively). These two entry target types ensure that the data + *** they indicate has been computed, so the GetAttrType() and GetAttrValue() + *** functions do not need to check this repeatedly each time they are called. + *** + *** @param NodeData The associated node data struct. There can be many driver + *** instances pointing to one NodeData at a time, but each driver instance + *** always points to singular NodeData struct. + *** @param TargetType The type of data targetted (see above). + *** @param TargetData If target type is: + *** ```csv + *** Node: A pointer to the SourceData struct. + *** Cluster or ClusterEntry: A pointer to the targetted cluster. + *** Search or SearchEntry: A pointer to the targetted search. + *** ``` + *** @param TargetAttrIndex An index into an attribute list (for GetNextAttr()). + *** @param TargetMethodIndex An index into an method list (for GetNextMethod()). + ***/ +typedef struct _DRIVER + { + pNodeData NodeData; + void* TargetData; + unsigned int TargetIndex; + unsigned char TargetAttrIndex; + unsigned char TargetMethodIndex; + TargetType TargetType; + } + DriverData, *pDriverData; + +/*** Query instance data. + *** + *** Memory Stats: + *** - Padding: 4 bytes + *** - Total size: 16 bytes + *** + *** @param DriverData The associated driver instance being queried. + *** @param RowIndex The selected row of the data targetted by the driver. + ***/ +typedef struct + { + pDriverData DriverData; + unsigned int RowIndex; + } + ClusterQuery, *pClusterQuery; + + +/** Global storage for caches. **/ +struct + { + XHashTable SourceDataCache; + XHashTable ClusterDataCache; + XHashTable SearchDataCache; + } + ClusterDriverCaches = {0}; + +struct + { + unsigned long long OpenCalls; + unsigned long long OpenQueryCalls; + unsigned long long FetchCalls; + unsigned long long CloseCalls; + unsigned long long GetTypeCalls; + unsigned long long GetValCalls; + unsigned long long GetValCalls_name; + unsigned long long GetValCalls_key1; + unsigned long long GetValCalls_key2; + unsigned long long GetValCalls_sim; + } ClusterStatistics; + + +/** ================ Function Declarations ================ **/ +/** ANCHOR[id=functions] **/ + +/** Note: ci stands for "cluster_internal". **/ + +/** Parsing Functions. **/ +// LINK #parsing +static void ci_GiveHint(const char* hint); +static bool ci_TryHint(char* value, char** valid_values, const unsigned int n_valid_values); +static int ci_ParseAttribute(pStructInf inf, char* attr_name, int datatype, pObjData data, pParamObjects param_list, bool required, bool print_type_error); +static ClusterAlgorithm ci_ParseClusteringAlgorithm(pStructInf cluster_inf, pParamObjects param_list); +static SimilarityMeasure ci_ParseSimilarityMeasure(pStructInf cluster_inf, pParamObjects param_list); +static pSourceData ci_ParseSourceData(pStructInf inf, pParamObjects param_list, char* path); +static pClusterData ci_ParseClusterData(pStructInf inf, pNodeData node_data); +static pSearchData ci_ParseSearchData(pStructInf inf, pNodeData node_data); +static pNodeData ci_ParseNodeData(pStructInf inf, pObject obj); + +/** Freeing Functions. **/ +// LINK #freeing +static void ci_FreeSourceData(pSourceData source_data); +static void ci_FreeClusterData(pClusterData cluster_data, bool recursive); +static void ci_FreeSearchData(pSearchData search_data); +static void ci_FreeNodeData(pNodeData node_data); +static void ci_ClearCaches(void); + +/** Deep Size Computation Functions. **/ +// LINK #sizing +static unsigned int ci_SizeOfSourceData(pSourceData source_data); +static unsigned int ci_SizeOfClusterData(pClusterData cluster_data, bool recursive); +static unsigned int ci_SizeOfSearchData(pSearchData search_data); + +/** Computation Functions. (Ensure data is computed.) **/ +// LINK #computation +static int ci_ComputeSourceData(pSourceData source_data, pObjSession session); +static int ci_ComputeClusterData(pClusterData cluster_data, pNodeData node_data); +static int ci_ComputeSearchData(pSearchData search_data, pNodeData node_data); + +/** Parameter Functions. **/ +// LINK #params +static int ci_GetParamType(void* inf_v, const char* attr_name); +static int ci_GetParamValue(void* inf_v, char* attr_name, int datatype, pObjData val); +static int ci_SetParamValue(void* inf_v, char* attr_name, int datatype, pObjData val); + +/** Driver Functions. **/ +// LINK #driver +void* clusterOpen(pObject parent, int mask, pContentType systype, char* usr_type, pObjTrxTree* oxt); +int clusterClose(void* inf_v, pObjTrxTree* oxt); +void* clusterOpenQuery(void* inf_v, pObjQuery query, pObjTrxTree* oxt); +void* clusterQueryFetch(void* qy_v, pObject obj, int mode, pObjTrxTree* oxt); +int clusterQueryClose(void* qy_v, pObjTrxTree* oxt); +int clusterGetAttrType(void* inf_v, char* attr_name, pObjTrxTree* oxt); +int clusterGetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val, pObjTrxTree* oxt); +pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pObjTrxTree* oxt); +char* clusterGetFirstAttr(void* inf_v, pObjTrxTree* oxt); +char* clusterGetNextAttr(void* inf_v, pObjTrxTree* oxt); +int clusterInfo(void* inf_v, pObjectInfo info); + +/** Method Execution Functions. **/ +// LINK #method +char* clusterGetFirstMethod(void* inf_v, pObjTrxTree* oxt); +char* clusterGetNextMethod(void* inf_v, pObjTrxTree* oxt); +static int ci_PrintEntry(pXHashEntry entry, void* arg); +static void ci_CacheFreeSourceData(pXHashEntry entry, void* path); +static void ci_CacheFreeCluster(pXHashEntry entry, void* path); +static void ci_CacheFreeSearch(pXHashEntry entry, void* path); +int clusterExecuteMethod(void* inf_v, char* method_name, pObjData param, pObjTrxTree* oxt); + +/** Unimplemented DriverFunctions. **/ +// LINK #unimplemented +int clusterCreate(pObject obj, int mask, pContentType systype, char* usrtype, pObjTrxTree* oxt); +int clusterDelete(pObject obj, pObjTrxTree* oxt); +int clusterDeleteObj(void* inf_v, pObjTrxTree* oxt); +int clusterRead(void* inf_v, char* buffer, int max_cnt, int offset, int flags, pObjTrxTree* oxt); +int clusterWrite(void* inf_v, char* buffer, int cnt, int offset, int flags, pObjTrxTree* oxt); +int clusterSetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val, pObjTrxTree* oxt); +int clusterAddAttr(void* inf_v, char* attr_name, int type, pObjData val, pObjTrxTree* oxt); +void* clusterOpenAttr(void* inf_v, char* attr_name, int mode, pObjTrxTree* oxt); +int clusterCommit(void* inf_v, pObjTrxTree *oxt); + +/** ================ Parsing Functions ================ **/ +/** ANCHOR[id=parsing] **/ +// LINK #functions + +/** Format a hint to give to the user. **/ +static void ci_GiveHint(const char* hint) + { + fprintf(stderr, " > Hint: Did you mean \"%s\"?\n", hint); + + return; + } + + +/*** Given the user a hint when they specify an invalid string for an attribute + *** where we know the list of valid strings. The hint is only displayed if + *** their string is close enough to a valid string. + *** + *** @param value The value the user gave. + *** @param valid_values The valid values that could be what they meant. + *** @param n_valid_values The number of valid values. Specify 0 to detect + *** length on a null terminated array of values. + *** @returns Whether a hint was given. + ***/ +static bool +ci_TryHint(char* value, char** valid_values, const unsigned int n_valid_values) + { + char* guess = ca_most_similar(value, (void**)valid_values, n_valid_values, ca_lev_compare, 0.25); + if (guess == NULL) return false; /* No hint. */ + + /** Issue hint. **/ + ci_GiveHint(guess); + + return true; + } + + +// LINK #functions +/*** Returns 0 for success and -1 on failure. Promises that mssError() will be + *** invoked on failure, so the caller need not specify their own error message. + *** Returns 1 if attribute is available, printing an error if the attribute was + *** marked as required. + *** + *** @attention - Promises that a failure invokes mssError() at least once. + *** + *** TODO: Greg - Review carefully. I think this code is the reason that runserver() + *** is NOT REQUIRED for dynamic attributes in the cluster driver. I had to debug + *** and rewrite this for ages and it uses several functions I don't 100% understand. + ***/ +static int +ci_ParseAttribute( + pStructInf inf, + char* attr_name, + int datatype, + pObjData data, + pParamObjects param_list, + bool required, + bool print_type_error) + { + int ret; + + /** Get attribute inf. **/ + pStructInf attr_info = stLookup(inf, attr_name); + if (attr_info == NULL) + { + if (required) mssErrorf(1, "Cluster", "'%s' must be specified for clustering.", attr_name); + return 1; + } + ASSERTMAGIC(attr_info, MGK_STRUCTINF); + + /** Allocate expression. **/ + pExpression exp = check_ptr(stGetExpression(attr_info, 0)); + if (exp == NULL) goto err; + + /** Bind parameters. **/ + /** TODO: Greg - What does this return? How do I know if it fails? **/ + expBindExpression(exp, param_list, EXPR_F_RUNSERVER); + + /** Evaluate expression. **/ + ret = expEvalTree(exp, param_list); + if (ret != 0) + { + mssErrorf(0, "Cluster", "Expression evaluation failed (error code %d).", ret); + goto err; + } + + /** Check for data type mismatch. **/ + if (datatype != exp->DataType) + { + mssErrorf(1, "Cluster", + "Expected ['%s' : %s], but got type %s.", + attr_name, objTypeToStr(datatype), objTypeToStr(exp->DataType) + ); + goto err; + } + + /** Get the data out of the expression. **/ + ret = expExpressionToPod(exp, datatype, data); + if (ret != 0) + { + mssErrorf(1, "Cluster", + "Failed to get ['%s' : %s] using expression \"%s\" (error code %d).", + attr_name, objTypeToStr(datatype), exp->Name, ret + ); + goto err; + } + + /** Success. **/ + return 0; + + err: + mssErrorf(0, "Cluster", + "Failed to parse attribute \"%s\" from group \"%s\"", + attr_name, inf->Name + ); + + /** Return error. **/ + return -1; + } + + +// LINK #functions +/*** Parses a ClusteringAlgorithm from the algorithm attribute in the pStructInf + *** representing some structure with that attribute in a parsed structure file. + *** + *** @attention - Promises that a failure invokes mssError() at least once. + *** + *** @param inf A parsed pStructInf. + *** @param param_list The param objects that function as a kind of "scope" for + *** evaluating parameter variables in the structure file. + *** @returns The data algorithm, or ALGORITHM_NULL on failure. + ***/ +static ClusterAlgorithm +ci_ParseClusteringAlgorithm(pStructInf inf, pParamObjects param_list) + { + /** Get the algorithm attribute. **/ + char* algorithm; + if (ci_ParseAttribute(inf, "algorithm", DATA_T_STRING, POD(&algorithm), param_list, true, true) != 0) + { + mssErrorf(0, "Cluster", "Failed to parse attribute 'algorithm' in group \"%s\".", inf->Name); + return ALGORITHM_NULL; + } + + /** Parse known clustering algorithms. **/ + if (!strcasecmp(algorithm, "none")) return ALGORITHM_NONE; + if (!strcasecmp(algorithm, "sliding-window")) return ALGORITHM_SLIDING_WINDOW; + if (!strcasecmp(algorithm, "k-means")) return ALGORITHM_KMEANS; + if (!strcasecmp(algorithm, "k-means++")) return ALGORITHM_KMEANS_PLUS_PLUS; + if (!strcasecmp(algorithm, "k-medoids")) return ALGORITHM_KMEDOIDS; + if (!strcasecmp(algorithm, "db-scan")) return ALGORITHM_DB_SCAN; + + /** Unknown value for clustering algorithm. **/ + mssErrorf(1, "Cluster", "Unknown \"clustering algorithm\": %s", algorithm); + + /** Attempt to give a hint. **/ + char* all_names[nClusteringAlgorithms] = {NULL}; + for (unsigned int i = 0u; i < nClusteringAlgorithms; i++) + all_names[i] = ci_ClusteringAlgorithmToString(ALL_CLUSTERING_ALGORITHMS[i]); + if (ci_TryHint(algorithm, all_names, nClusteringAlgorithms)); + else if (strcasecmp(algorithm, "sliding") == 0) ci_GiveHint(ci_ClusteringAlgorithmToString(ALGORITHM_SLIDING_WINDOW)); + else if (strcasecmp(algorithm, "window") == 0) ci_GiveHint(ci_ClusteringAlgorithmToString(ALGORITHM_SLIDING_WINDOW)); + else if (strcasecmp(algorithm, "null") == 0) ci_GiveHint(ci_ClusteringAlgorithmToString(ALGORITHM_NONE)); + else if (strcasecmp(algorithm, "nothing") == 0) ci_GiveHint(ci_ClusteringAlgorithmToString(ALGORITHM_NONE)); + + /** Fail. **/ + return ALGORITHM_NULL; + } + + +// LINK #functions +/*** Parses a SimilarityMeasure from the similarity_measure attribute in the given + *** pStructInf parameter, which represents some structure with that attribute + *** in a parsed structure file. + *** + *** @attention - Promises that a failure invokes mssError() at least once. + *** + *** @param inf A parsed pStructInf. + *** @param param_list The param objects that function as a kind of "scope" for + *** evaluating parameter variables in the structure file. + *** @returns The similarity measure, or SIMILARITY_NULL on failure. + ***/ +static SimilarityMeasure +ci_ParseSimilarityMeasure(pStructInf inf, pParamObjects param_list) + { + /** Get the similarity_measure attribute. **/ + char* measure; + if (ci_ParseAttribute(inf, "similarity_measure", DATA_T_STRING, POD(&measure), param_list, true, true) != 0) + { + mssErrorf(0, "Cluster", "Failed to parse attribute 'similarity_measure' in group \"%s\".", inf->Name); + return SIMILARITY_NULL; + } + + /** Parse known clustering algorithms. **/ + if (!strcasecmp(measure, "cosine")) return SIMILARITY_COSINE; + if (!strcasecmp(measure, "levenshtein")) return SIMILARITY_LEVENSHTEIN; + + /** Unknown similarity measure. **/ + mssErrorf(1, "Cluster", "Unknown \"similarity measure\": %s", measure); + + /** Attempt to give a hint. **/ + char* all_names[nSimilarityMeasures] = {NULL}; + for (unsigned int i = 0u; i < nSimilarityMeasures; i++) + all_names[i] = ci_SimilarityMeasureToString(ALL_SIMILARITY_MEASURES[i]); + if (ci_TryHint(measure, all_names, nSimilarityMeasures)); + else if (strcasecmp(measure, "cos") == 0) ci_GiveHint(ci_SimilarityMeasureToString(SIMILARITY_COSINE)); + else if (strcasecmp(measure, "lev") == 0) ci_GiveHint(ci_SimilarityMeasureToString(SIMILARITY_LEVENSHTEIN)); + else if (strcasecmp(measure, "edit-dist") == 0) ci_GiveHint(ci_SimilarityMeasureToString(SIMILARITY_LEVENSHTEIN)); + else if (strcasecmp(measure, "edit-distance") == 0) ci_GiveHint(ci_SimilarityMeasureToString(SIMILARITY_LEVENSHTEIN)); + + /** Fail. **/ + return SIMILARITY_NULL; + } + + +// LINK #functions +/*** Allocates a new pSourceData struct from a parsed pStructInf representing + *** a .cluster structure file. + *** + *** @attention - Warning: Caching in use. + *** @attention - Promises that a failure invokes mssError() at least once. + *** + *** @param inf A parsed pStructInf for a .cluster structure file. + *** @param param_list The param objects that function as a kind of "scope" for + *** evaluating parameter variables in the structure file. + *** @param path The file path to the parsed structure file, used to generate + *** cache entry keys. + *** @returns A new pSourceData struct on success, or NULL on failure. + ***/ +static pSourceData +ci_ParseSourceData(pStructInf inf, pParamObjects param_list, char* path) + { + char* buf = NULL; + pSourceData source_data = NULL; + + /** Allocate SourceData. **/ + source_data = check_ptr(nmMalloc(sizeof(SourceData))); + if (source_data == NULL) goto err_free; + memset(source_data, 0, sizeof(SourceData)); + + /** Initialize obvious values for SourceData. **/ + source_data->Name = check_ptr(nmSysStrdup(inf->Name)); + if (source_data->Name == NULL) goto err_free; + if (!check(objCurrentDate(&source_data->DateCreated))) goto err_free; + + /** Get source. **/ + if (ci_ParseAttribute(inf, "source", DATA_T_STRING, POD(&buf), param_list, true, true) != 0) goto err_free; + source_data->SourcePath = check_ptr(nmSysStrdup(buf)); + if (source_data->SourcePath == NULL) goto err_free; + + /** Get the attribute name to use when querying keys from the source. **/ + if (ci_ParseAttribute(inf, "key_attr", DATA_T_STRING, POD(&buf), param_list, true, true) != 0) goto err_free; + source_data->KeyAttr = check_ptr(nmSysStrdup(buf)); + if (source_data->KeyAttr == NULL) goto err_free; + + /** Get the attribute name to use for querying data from the source. **/ + if (ci_ParseAttribute(inf, "data_attr", DATA_T_STRING, POD(&buf), param_list, true, true) != 0) goto err_free; + source_data->NameAttr = check_ptr(nmSysStrdup(buf)); + if (source_data->NameAttr == NULL) goto err_free; + + /** Create cache entry key. **/ + const size_t len = strlen(path) + + strlen(source_data->SourcePath) + + strlen(source_data->KeyAttr) + + strlen(source_data->NameAttr) + 5lu; + source_data->Key = check_ptr(nmSysMalloc(len * sizeof(char))); + if (source_data->Key == NULL) goto err_free; + snprintf(source_data->Key, len, + "%s?%s->%s:%s", + path, source_data->SourcePath, source_data->KeyAttr, source_data->NameAttr + ); + + /** Check for a cached version. **/ + pSourceData source_maybe = (pSourceData)xhLookup(&ClusterDriverCaches.SourceDataCache, source_data->Key); + if (source_maybe != NULL) + { /* Cache hit. */ + /** Free data we don't need. **/ + nmSysFree(source_data->Key); + ci_FreeSourceData(source_data); + + /** Return the cached source data. **/ + return source_maybe; + } + + /** Cache miss: Add the new object to the cache for next time. **/ + if (!check(xhAdd(&ClusterDriverCaches.SourceDataCache, source_data->Key, (void*)source_data))) + goto err_free; + + /** Success. **/ + return source_data; + + /** Error handling. **/ + err_free: + if (source_data != NULL) + { + if (source_data->Key != NULL) nmSysFree(source_data->Key); + ci_FreeSourceData(source_data); + } + + mssErrorf(0, "Cluster", + "Failed to parse source data from group \"%s\" in file: %s", + inf->Name, path + ); + + return NULL; + } + + +// LINK #functions +/*** Allocates a new pClusterData struct from a parsed pStructInf. + *** + *** @attention - Warning: Caching in use. + *** @attention - Promises that mssError() will be invoked on failure, so the + *** caller is not required to specify their own error message. + *** + *** @param inf A parsed pStructInf for a cluster group in a structure file. + *** @param param_list The param objects that function as a kind of "scope" for + *** evaluating parameter variables in the structure file. + *** @param source_data The pSourceData that clusters are to be built from, also + *** used to generate cache entry keys. + *** @returns A new pClusterData struct on success, or NULL on failure. + ***/ +static pClusterData +ci_ParseClusterData(pStructInf inf, pNodeData node_data) + { + int result; + pClusterData cluster_data = NULL; + XArray sub_clusters = {0}; + char* key = NULL; + + /** Extract values. **/ + pParamObjects param_list = node_data->ParamList; + pSourceData source_data = node_data->SourceData; + + /** Allocate space for data struct. **/ + cluster_data = check_ptr(nmMalloc(sizeof(ClusterData))); + if (cluster_data == NULL) goto err_free; + memset(cluster_data, 0, sizeof(ClusterData)); + + /** Basic Properties. **/ + cluster_data->Name = check_ptr(nmSysStrdup(inf->Name)); + if (cluster_data->Name == NULL) goto err_free; + cluster_data->SourceData = check_ptr(source_data); + if (cluster_data->SourceData == NULL) goto err_free; + if (!check(objCurrentDate(&cluster_data->DateCreated))) goto err_free; + + /** Get algorithm. **/ + cluster_data->ClusterAlgorithm = ci_ParseClusteringAlgorithm(inf, param_list); + if (cluster_data->ClusterAlgorithm == ALGORITHM_NULL) goto err_free; + + /** Handle no clustering case. **/ + if (cluster_data->ClusterAlgorithm == ALGORITHM_NONE) + { + cluster_data->nClusters = 1u; + goto parsing_done; + } + + /** Get similarity_measure. **/ + cluster_data->SimilarityMeasure = ci_ParseSimilarityMeasure(inf, param_list); + if (cluster_data->SimilarityMeasure == SIMILARITY_NULL) goto err_free; + + /** Handle sliding window case. **/ + if (cluster_data->ClusterAlgorithm == ALGORITHM_SLIDING_WINDOW) + { + /** Sliding window doesn't allocate any clusters. **/ + cluster_data->nClusters = 0u; + + /** Get window_size. **/ + int window_size; + if (ci_ParseAttribute(inf, "window_size", DATA_T_INTEGER, POD(&window_size), param_list, true, true) != 0) + goto err_free; + if (window_size < 1) + { + mssErrorf(1, "Cluster", "Invalid value for [window_size : uint > 0]: %d", window_size); + goto err_free; + } + + /** Store value. **/ + cluster_data->MaxIterations = (unsigned int)window_size; + goto parsing_done; + } + + /** Get num_clusters. **/ + int num_clusters; + if (ci_ParseAttribute(inf, "num_clusters", DATA_T_INTEGER, POD(&num_clusters), param_list, true, true) != 0) + goto err_free; + if (num_clusters < 2) + { + mssErrorf(1, "Cluster", "Invalid value for [num_clusters : uint > 1]: %d", num_clusters); + if (num_clusters == 1) fprintf(stderr, "HINT: Use algorithm=\"none\" to disable clustering.\n"); + goto err_free; + } + cluster_data->nClusters = (unsigned int)num_clusters; + + /** Get min_improvement. **/ + double improvement; + result = ci_ParseAttribute(inf, "min_improvement", DATA_T_DOUBLE, POD(&improvement), param_list, false, false); + if (result == 1) cluster_data->MinImprovement = DEFAULT_MIN_IMPROVEMENT; + else if (result == 0) + { + if (improvement <= 0.0 || 1.0 <= improvement) + { + mssErrorf(1, "Cluster", "Invalid value for [min_improvement : 0.0 < x < 1.0 | \"none\"]: %g", improvement); + goto err_free; + } + + /** Successfully got value. **/ + cluster_data->MinImprovement = improvement; + } + else if (result == -1) + { + char* str; + result = ci_ParseAttribute(inf, "min_improvement", DATA_T_STRING, POD(&str), param_list, false, true); + if (result != 0) goto err_free; + if (strcasecmp(str, "none") != 0) + { + mssErrorf(1, "Cluster", "Invalid value for [min_improvement : 0.0 < x < 1.0 | \"none\"]: %s", str); + goto err_free; + } + + /** Successfully got none. **/ + cluster_data->MinImprovement = -INFINITY; + } + + /** Get max_iterations. **/ + int max_iterations; + result = ci_ParseAttribute(inf, "max_iterations", DATA_T_INTEGER, POD(&max_iterations), param_list, false, true); + if (result == -1) goto err_free; + if (result == 0) + { + if (max_iterations < 1) + { + mssErrorf(1, "Cluster", "Invalid value for [max_iterations : uint]: %d", max_iterations); + goto err_free; + } + cluster_data->MaxIterations = (unsigned int)max_iterations; + } + else cluster_data->MaxIterations = DEFAULT_MAX_ITERATIONS; + + /** Search for sub-clusters. **/ + if (!check(xaInit(&sub_clusters, 4u))) goto err_free; + for (unsigned int i = 0u; i < inf->nSubInf; i++) + { + pStructInf sub_inf = check_ptr(inf->SubInf[i]); + ASSERTMAGIC(sub_inf, MGK_STRUCTINF); + char* name = sub_inf->Name; + + /** Handle various struct types. **/ + const int struct_type = stStructType(sub_inf); + switch (struct_type) + { + case ST_T_ATTRIB: + { + /** Valid attribute names. **/ + char* attrs[] = { + "algorithm", + "similarity_measure", + "num_clusters", + "min_improvement", + "max_iterations", + "window_size", + }; + const unsigned int nattrs = sizeof(attrs) / sizeof(char*); + + /** Ignore valid attribute names. **/ + bool is_valid = false; + for (unsigned int i = 0u; i < nattrs; i++) + { + if (strcmp(name, attrs[i]) == 0) + { + is_valid = true; + break; + } + } + if (is_valid) continue; /* Next inf. */ + + /** Give the user a warning, and attempt to give them a hint. **/ + fprintf(stderr, "Warning: Unknown attribute '%s' in cluster \"%s\".\n", name, inf->Name); + if (ci_TryHint(name, attrs, nattrs)); + else if (strcasecmp(name, "k") == 0) ci_GiveHint("num_clusters"); + else if (strcasecmp(name, "threshold") == 0) ci_GiveHint("min_improvement"); + + break; + } + + case ST_T_SUBGROUP: + { + /** Select array by group type. **/ + char* group_type = check_ptr(sub_inf->UsrType); + if (group_type == NULL) goto err_free; + if (strcmp(group_type, "cluster/cluster") != 0) + { + mssErrorf(1, "Cluster", + "Warning: Unknown group [\"%s\" : \"%s\"] in cluster \"%s\".\n", + name, group_type, inf->Name + ); + continue; + } + + /** Subcluster found. **/ + pClusterData sub_cluster = ci_ParseClusterData(sub_inf, node_data); + if (sub_cluster == NULL) goto err_free; + sub_cluster->Parent = cluster_data; + if (!check_neg(xaAddItem(&sub_clusters, sub_cluster))) goto err_free; + + break; + } + + default: + { + mssErrorf(1, "Cluster", + "Warning: Unknown struct type %d in cluster \"%s\".", + struct_type, inf->Name + ); + goto err_free; + } + } + } + cluster_data->nSubClusters = sub_clusters.nItems; + cluster_data->SubClusters = (pClusterData*)ci_xaToTrimmedArray(&sub_clusters, 1); + sub_clusters.nAlloc = 0; + + /** Create the cache key. **/ + parsing_done:; + switch (cluster_data->ClusterAlgorithm) + { + case ALGORITHM_NONE: + { + const size_t len = strlen(source_data->Key) + strlen(cluster_data->Name) + 8lu; + key = nmSysMalloc(len * sizeof(char)); + snprintf(key, len, "%s/%s?%u", + source_data->Key, + cluster_data->Name, + ALGORITHM_NONE + ); + break; + } + + case ALGORITHM_SLIDING_WINDOW: + { + const size_t len = strlen(source_data->Key) + strlen(cluster_data->Name) + 16lu; + key = nmSysMalloc(len * sizeof(char)); + snprintf(key, len, "%s/%s?%u&%u&%u", + source_data->Key, + cluster_data->Name, + ALGORITHM_SLIDING_WINDOW, + cluster_data->SimilarityMeasure, + cluster_data->MaxIterations + ); + break; + } + + default: + { + const size_t len = strlen(source_data->Key) + strlen(cluster_data->Name) + 32lu; + key = nmSysMalloc(len * sizeof(char)); + snprintf(key, len, "%s/%s?%u&%u&%u&%g&%u", + source_data->Key, + cluster_data->Name, + cluster_data->ClusterAlgorithm, + cluster_data->SimilarityMeasure, + cluster_data->nClusters, + cluster_data->MinImprovement, + cluster_data->MaxIterations + ); + break; + } + } + cluster_data->Key = key; + + /** Check for a cached version. **/ + pClusterData cluster_maybe = (pClusterData)xhLookup(&ClusterDriverCaches.ClusterDataCache, key); + if (cluster_maybe != NULL) + { /* Cache hit. */ + /** Free the parsed cluster that we no longer need. */ + ci_FreeClusterData(cluster_data, false); + nmSysFree(key); + + /** Return the cached cluster. **/ + return cluster_maybe; + } + + /** Cache miss. **/ + if (!check(xhAdd(&ClusterDriverCaches.ClusterDataCache, key, (void*)cluster_data))) goto err_free; + return cluster_data; + + /** Error cleanup. **/ + err_free: + if (key != NULL) nmSysFree(key); + + if (sub_clusters.nAlloc != 0) + { + for (unsigned int i = 0u; i < sub_clusters.nItems; i++) + { + pClusterData cur = sub_clusters.Items[i]; + if (cur == NULL) break; + ci_FreeClusterData(cur, true); + } + check(xaDeInit(&sub_clusters)); /* Failure ignored. */ + } + + if (cluster_data != NULL) ci_FreeClusterData(cluster_data, false); + + mssErrorf(0, "Cluster", "Failed to parse cluster from group \"%s\".", inf->Name); + return NULL; + } + + +// LINK #functions +/*** Allocates a new pSearchData struct from a parsed pStructInf. + *** + *** @attention - Warning: Caching in use. + *** @attention - Promises that mssError() will be invoked on failure, so the + *** caller is not required to specify their own error message. + *** + *** @param inf A parsed pStructInf for a search group in a structure file. + *** @param param_list The param objects that function as a kind of "scope" for + *** evaluating parameter variables in the structure file. + *** @param node_data The pNodeData, used to get the param list and to look up + *** the cluster pointed to by the source attribute. + *** @returns A new pSearchData struct on success, or NULL on failure. + ***/ +static pSearchData +ci_ParseSearchData(pStructInf inf, pNodeData node_data) + { + pSearchData search_data = NULL; + char* key = NULL; + + /** Allocate space for search struct. **/ + search_data = check_ptr(nmMalloc(sizeof(SearchData))); + if (search_data == NULL) goto err_free; + memset(search_data, 0, sizeof(SearchData)); + + /** Get basic information. **/ + search_data->Name = check_ptr(nmSysStrdup(inf->Name)); + if (search_data->Name == NULL) goto err_free; + if (!check(objCurrentDate(&search_data->DateCreated))) goto err_free; + + /** Get source cluster. **/ + char* source_cluster_name; + if (ci_ParseAttribute(inf, "source", DATA_T_STRING, POD(&source_cluster_name), node_data->ParamList, true, true) != 0) return NULL; + for (unsigned int i = 0; i < node_data->nClusterDatas; i++) + { + pClusterData cluster_data = node_data->ClusterDatas[i]; + if (strcmp(source_cluster_name, cluster_data->Name) == 0) + { + /** SourceCluster found. **/ + search_data->SourceCluster = cluster_data; + break; + } + + /** Note: Subclusters should probably be parsed here, if they were implemented. **/ + } + + /** Did we find the requested source? **/ + if (search_data->SourceCluster == NULL) + { + /** Print error. **/ + mssErrorf(1, "Cluster", "Could not find cluster \"%s\" for search \"%s\".", source_cluster_name, search_data->Name); + + /** Attempt to give a hint. **/ + char* cluster_names[node_data->nClusterDatas]; + for (unsigned int i = 0; i < node_data->nClusterDatas; i++) + cluster_names[i] = node_data->ClusterDatas[i]->Name; + ci_TryHint(source_cluster_name, cluster_names, node_data->nClusterDatas); + + /** Fail. **/ + goto err_free; + } + + /** Get threshold attribute. **/ + if (ci_ParseAttribute(inf, "threshold", DATA_T_DOUBLE, POD(&search_data->Threshold), node_data->ParamList, true, true) != 0) goto err_free; + if (search_data->Threshold <= 0.0 || 1.0 <= search_data->Threshold) + { + mssErrorf(1, "Cluster", + "Invalid value for [threshold : 0.0 < x < 1.0 | \"none\"]: %g", + search_data->Threshold + ); + goto err_free; + } + + /** Get similarity measure. **/ + search_data->SimilarityMeasure = ci_ParseSimilarityMeasure(inf, node_data->ParamList); + if (search_data->SimilarityMeasure == SIMILARITY_NULL) goto err_free; + + /** Check for additional data to warn the user about. **/ + for (unsigned int i = 0u; i < inf->nSubInf; i++) + { + pStructInf sub_inf = check_ptr(inf->SubInf[i]); + ASSERTMAGIC(sub_inf, MGK_STRUCTINF); + char* name = sub_inf->Name; + + /** Handle various struct types. **/ + const int struct_type = stStructType(sub_inf); + switch (struct_type) + { + case ST_T_ATTRIB: + { + /** Valid attribute names. **/ + char* attrs[] = { + "source", + "threshold", + "similarity_measure", + }; + const unsigned int nattrs = sizeof(attrs) / sizeof(char*); + + /** Ignore valid attribute names. **/ + bool is_valid = false; + for (unsigned int i = 0u; i < nattrs; i++) + { + if (strcmp(name, attrs[i]) == 0) + { + is_valid = true; + break; + } + } + if (is_valid) continue; /* Next inf. */ + + /** Give the user a warning, and attempt to give them a hint. **/ + fprintf(stderr, "Warning: Unknown attribute '%s' in search \"%s\".\n", name, inf->Name); + ci_TryHint(name, attrs, nattrs); + + break; + } + + case ST_T_SUBGROUP: + { + /** The spec does not specify any valid sub-groups for searches. **/ + char* group_type = check_ptr(sub_inf->UsrType); + if (group_type == NULL) goto err_free; + fprintf(stderr, + "Warning: Unknown group [\"%s\" : \"%s\"] in search \"%s\".\n", + name, group_type, inf->Name + ); + break; + } + + default: + { + mssErrorf(1, "Cluster", + "Warning: Unknown struct type %d in search \"%s\".", + struct_type, inf->Name + ); + goto err_free; + } + } + } + + /** Create cache entry key. **/ + char* source_key = search_data->SourceCluster->Key; + const size_t len = strlen(source_key) + strlen(search_data->Name) + 16lu; + key = check_ptr(nmSysMalloc(len * sizeof(char))); + if (key == NULL) goto err_free; + snprintf(key, len, "%s/%s?%g&%u", + source_key, + search_data->Name, + search_data->Threshold, + search_data->SimilarityMeasure + ); + pXHashTable search_cache = &ClusterDriverCaches.SearchDataCache; + + /** Check for a cached version. **/ + pSearchData search_maybe = (pSearchData)xhLookup(search_cache, key); + if (search_maybe != NULL) + { /* Cache hit. */ + /** Free the parsed search that we no longer need. **/ + if (search_data != NULL) ci_FreeSearchData(search_data); + if (key != NULL) nmSysFree(key); + + /** Return the cached search. **/ + return search_maybe; + } + + /** Cache miss. **/ + check(xhAdd(search_cache, key, (void*)search_data)); + return search_data; + + /** Error cleanup. **/ + err_free: + if (search_data != NULL) ci_FreeSearchData(search_data); + + mssErrorf(0, "Cluster", "Failed to parse SearchData from group \"%s\".", inf->Name); + + return NULL; + } + + +// LINK #functions +/*** Allocates a new pNodeData struct from a parsed pStructInf. + *** + *** @attention - Does not use caching directly, but uses subfunctions to + *** handle caching of substructures. + *** @attention - Promises that mssError() will be invoked on failure, so the + *** caller is not required to specify their own error message. + *** + *** @param inf A parsed pStructInf for the top level group in a .cluster + *** structure file. + *** @param parent The parent object struct. + *** @returns A new pNodeData struct on success, or NULL on failure. + ***/ +static pNodeData +ci_ParseNodeData(pStructInf inf, pObject parent) + { + int ret = -1; + pNodeData node_data = NULL; + XArray param_infs = {0}; + XArray cluster_infs = {0}; + XArray search_infs = {0}; + + /** Get file path. **/ + char* path = check_ptr(ci_file_path(parent)); + if (path == NULL) goto err_free; + + /** Allocate node struct data. **/ + node_data = check_ptr(nmMalloc(sizeof(NodeData))); + if (node_data == NULL) goto err_free; + memset(node_data, 0, sizeof(NodeData)); + node_data->Parent = parent; + + /** Set up param list. **/ + node_data->ParamList = check_ptr(expCreateParamList()); + if (node_data->ParamList == NULL) goto err_free; + node_data->ParamList->Session = check_ptr(parent->Session); + if (node_data->ParamList->Session == NULL) goto err_free; + ret = expAddParamToList(node_data->ParamList, "parameters", (void*)node_data, 0); + if (ret != 0) + { + mssErrorf(0, "Cluster", "Failed to add parameters to the param list scope (error code %d).", ret); + goto err_free; + } + + /** Set the param functions, defined later in the file. **/ + ret = expSetParamFunctions( + node_data->ParamList, + "parameters", + ci_GetParamType, + ci_GetParamValue, + ci_SetParamValue + ); + if (ret != 0) + { + mssErrorf(0, "Cluster", "Failed to set param functions (error code %d).", ret); + goto err_free; + } + + /** Detect relevant groups. **/ + if (!check(xaInit(¶m_infs, 8))) goto err_free; + if (!check(xaInit(&cluster_infs, 8))) goto err_free; + if (!check(xaInit(&search_infs, 8))) goto err_free; + for (unsigned int i = 0u; i < inf->nSubInf; i++) + { + pStructInf sub_inf = check_ptr(inf->SubInf[i]); + ASSERTMAGIC(sub_inf, MGK_STRUCTINF); + char* name = sub_inf->Name; + + /** Handle various struct types. **/ + const int struct_type = stStructType(sub_inf); + switch (struct_type) + { + case ST_T_ATTRIB: + { + /** Valid attribute names. **/ + char* attrs[] = { + "source", + "key_attr", + "data_attr", + }; + const unsigned int nattrs = sizeof(attrs) / sizeof(char*); + + /** Ignore valid attribute names. **/ + bool is_valid = false; + for (unsigned int i = 0u; i < nattrs; i++) + { + if (strcmp(name, attrs[i]) == 0) + { + is_valid = true; + break; + } + } + if (is_valid) continue; /* Next inf. */ + + /** Give the user a warning, and attempt to give them a hint. **/ + fprintf(stderr, "Warning: Unknown attribute '%s' in cluster node \"%s\".\n", name, inf->Name); + ci_TryHint(name, attrs, nattrs); + + break; + } + + case ST_T_SUBGROUP: + { + /** The spec does not specify any valid sub-groups for searches. **/ + char* group_type = check_ptr(sub_inf->UsrType); + if (group_type == NULL) goto err_free; + if (strcmp(group_type, "cluster/parameter") == 0) + { + if (!check_neg(xaAddItem(¶m_infs, sub_inf))) + goto err_free; + } + else if (strcmp(group_type, "cluster/cluster") == 0) + { + if (!check_neg(xaAddItem(&cluster_infs, sub_inf))) + goto err_free; + } + else if (strcmp(group_type, "cluster/search") == 0) + { + if (!check_neg(xaAddItem(&search_infs, sub_inf))) + goto err_free; + } + else + { + /** Give the user a warning, and attempt to give them a hint. **/ + fprintf(stderr, + "Warning: Unknown group type \"%s\" on group \"%s\".\n", + group_type, sub_inf->Name + ); + ci_TryHint(group_type, (char*[]){ + "cluster/parameter", + "cluster/cluster", + "cluster/search", + NULL, + }, 0u); + } + break; + } + + default: + { + mssErrorf(1, "Cluster", + "Warning: Unknown struct type %d in search \"%s\".", + struct_type, inf->Name + ); + goto err_free; + } + } + } + + /** Extract OpenCtl for use below. **/ + bool has_provided_params = parent != NULL + && parent->Pathname != NULL + && parent->Pathname->OpenCtl != NULL + && parent->Pathname->OpenCtl[parent->SubPtr - 1] != NULL + && parent->Pathname->OpenCtl[parent->SubPtr - 1]->nSubInf > 0 + && parent->Pathname->OpenCtl[parent->SubPtr - 1]->SubInf != NULL; + int num_provided_params = (has_provided_params) ? parent->Pathname->OpenCtl[parent->SubPtr - 1]->nSubInf : 0; + pStruct* provided_params = (has_provided_params) ? parent->Pathname->OpenCtl[parent->SubPtr - 1]->SubInf : NULL; + + /** Iterate over each param in the structure file. **/ + node_data->nParams = param_infs.nItems; + const size_t params_size = node_data->nParams * sizeof(pParam); + node_data->Params = check_ptr(nmSysMalloc(params_size)); + if (node_data->Params == NULL) goto err_free; + memset(node_data->Params, 0, params_size); + for (unsigned int i = 0u; i < node_data->nParams; i++) + { + pParam param = paramCreateFromInf(param_infs.Items[i]); + if (param == NULL) + { + mssErrorf(0, "Cluster", + "Failed to create param from inf for param #%u: %s", + i, ((pStructInf)param_infs.Items[i])->Name + ); + goto err_free; + } + node_data->Params[i] = param; + + /** Check each provided param to see if the user provided value. **/ + for (unsigned int j = 0u; j < num_provided_params; j++) + { + pStruct provided_param = check_ptr(provided_params[j]); /* Failure ignored. */ + + /** If this provided param value isn't for the param, ignore it. **/ + if (strcmp(provided_param->Name, param->Name) != 0) continue; + + /** Matched! The user is providing a value for this param. **/ + ret = paramSetValueFromInfNe(param, provided_param, 0, node_data->ParamList, node_data->ParamList->Session); + if (ret != 0) + { + mssErrorf(0, "Cluster", + "Failed to set param value from struct info.\n" + " > Param #%u: %s\n" + " > Provided Param #%u: %n\n" + " > Error code: %d", + i, param->Name, + j, provided_param->Name, + ret + ); + goto err_free; + } + + /** Provided value successfully handled, we're done. **/ + break; + } + + /** Invoke param hints parsing. **/ + ret = paramEvalHints(param, node_data->ParamList, node_data->ParamList->Session); + if (ret != 0) + { + mssErrorf(0, "Cluster", + "Failed to evaluate parameter hints for parameter \"%s\" (error code %d).", + param->Name, ret + ); + goto err_free; + } + } + check(xaDeInit(¶m_infs)); /* Failure ignored. */ + param_infs.nAlloc = 0; + + /** Iterate over provided parameters and warn the user if they specified a parameter that does not exist. **/ + for (unsigned int i = 0u; i < num_provided_params; i++) + { + pStruct provided_param = check_ptr(provided_params[i]); /* Failure ignored. */ + char* provided_name = provided_param->Name; + + /** Look to see if this provided param actually exists for this driver instance. **/ + for (unsigned int j = 0u; j < node_data->nParams; j++) + if (strcmp(provided_name, node_data->Params[j]->Name) == 0) + goto next_provided_param; + + /** This param doesn't exist, warn the user and attempt to give them a hint. **/ + fprintf(stderr, "Warning: Unknown provided parameter '%s' for cluster file: %s.\n", provided_name, ci_file_name(parent)); + char** param_names = check_ptr(nmSysMalloc(node_data->nParams * sizeof(char*))); + for (unsigned int j = 0u; j < node_data->nParams; j++) + param_names[j] = node_data->Params[j]->Name; + ci_TryHint(provided_name, param_names, node_data->nParams); + nmSysFree(param_names); + + next_provided_param:; + } + + /** Parse source data. **/ + node_data->SourceData = ci_ParseSourceData(inf, node_data->ParamList, path); + if (node_data->SourceData == NULL) goto err_free; + + /** Parse each cluster. **/ + node_data->nClusterDatas = cluster_infs.nItems; + if (node_data->nClusterDatas > 0) + { + const size_t clusters_size = node_data->nClusterDatas * sizeof(pClusterData); + node_data->ClusterDatas = check_ptr(nmSysMalloc(clusters_size)); + if (node_data->ClusterDatas == NULL) goto err_free; + memset(node_data->ClusterDatas, 0, clusters_size); + for (unsigned int i = 0u; i < node_data->nClusterDatas; i++) + { + node_data->ClusterDatas[i] = ci_ParseClusterData(cluster_infs.Items[i], node_data); + if (node_data->ClusterDatas[i] == NULL) goto err_free; + } + } + else node_data->ClusterDatas = NULL; + check(xaDeInit(&cluster_infs)); /* Failure ignored. */ + cluster_infs.nAlloc = 0; + + /** Parse each search. **/ + node_data->nSearchDatas = search_infs.nItems; + if (node_data->nSearchDatas > 0) + { + const size_t searches_size = node_data->nSearchDatas * sizeof(pSearchData); + node_data->SearchDatas = check_ptr(nmSysMalloc(searches_size)); + if (node_data->SearchDatas == NULL) goto err_free; + memset(node_data->SearchDatas, 0, searches_size); + for (unsigned int i = 0u; i < node_data->nSearchDatas; i++) + { + node_data->SearchDatas[i] = ci_ParseSearchData(search_infs.Items[i], node_data); + if (node_data->SearchDatas[i] == NULL) goto err_free; + } + } + else node_data->SearchDatas = NULL; + check(xaDeInit(&search_infs)); /* Failure ignored. */ + search_infs.nAlloc = 0; + + /** Success. **/ + return node_data; + + err_free: + if (param_infs.nAlloc != 0) check(xaDeInit(¶m_infs)); /* Failure ignored. */ + if (cluster_infs.nAlloc != 0) check(xaDeInit(&cluster_infs)); /* Failure ignored. */ + if (search_infs.nAlloc != 0) check(xaDeInit(&search_infs)); /* Failure ignored. */ + if (node_data != NULL) ci_FreeNodeData(node_data); + mssErrorf(0, "Cluster", "Failed to parse node from group \"%s\" in file: %s", inf->Name, path); + + return NULL; + } + + +/** ================ Freeing Functions ================ **/ +/** ANCHOR[id=freeing] **/ +// LINK #functions + +/** @param source_data A pSourceData struct, freed by this function. **/ +static void +ci_FreeSourceData(pSourceData source_data) + { + /** Guard segfault. **/ + if (source_data == NULL) + { + fprintf(stderr, "Warning: Call to ci_FreeSourceData(NULL);\n"); + return; + } + + /** Free top level attributes, if they exist. **/ + if (source_data->Name != NULL) + { + nmSysFree(source_data->Name); + source_data->Name = NULL; + } + if (source_data->SourcePath != NULL) + { + nmSysFree(source_data->SourcePath); + source_data->SourcePath = NULL; + } + if (source_data->KeyAttr != NULL) + { + nmSysFree(source_data->KeyAttr); + source_data->KeyAttr = NULL; + } + if (source_data->NameAttr != NULL) + { + nmSysFree(source_data->NameAttr); + source_data->NameAttr = NULL; + } + + /** Free fetched data, if it exists. **/ + if (source_data->Strings != NULL) + { + for (unsigned int i = 0u; i < source_data->nVectors; i++) + { + if (source_data->Strings[i] != NULL) + nmSysFree(source_data->Strings[i]); + else continue; + source_data->Strings[i] = NULL; + } + nmSysFree(source_data->Strings); + source_data->Strings = NULL; + } + + /** Free computed vectors, if they exist. **/ + if (source_data->Vectors != NULL) + { + for (unsigned int i = 0u; i < source_data->nVectors; i++) + { + if (source_data->Vectors[i] != NULL) + ca_free_vector(source_data->Vectors[i]); + else continue; + source_data->Vectors[i] = NULL; + } + nmSysFree(source_data->Vectors); + source_data->Vectors = NULL; + } + + /** Free the source data struct. **/ + nmFree(source_data, sizeof(SourceData)); + source_data = NULL; + + return; + } + + +// LINK #functions +/*** Free pClusterData struct with an option to recursively free subclusters. + *** + *** @param cluster_data The cluster data struct to free. + *** @param recursive Whether to recursively free subclusters. + ***/ +static void +ci_FreeClusterData(pClusterData cluster_data, bool recursive) + { + /** Guard segfault. **/ + if (cluster_data == NULL) + { + fprintf(stderr, "Warning: Call to ci_FreeClusterData(NULL, %s);\n", (recursive) ? "true" : "false"); + return; + } + + /** Free attribute data. **/ + if (cluster_data->Name != NULL) + { + nmSysFree(cluster_data->Name); + cluster_data->Name = NULL; + } + + /** Free computed data, if it exists. **/ + if (cluster_data->Clusters != NULL) + { + for (unsigned int i = 0u; i < cluster_data->nClusters; i++) + { + pCluster cluster = &cluster_data->Clusters[i]; + if (cluster == NULL) continue; + if (cluster->Strings != NULL) nmSysFree(cluster->Strings); + if (cluster->Vectors != NULL) nmSysFree(cluster->Vectors); + cluster->Strings = NULL; + cluster->Vectors = NULL; + } + nmSysFree(cluster_data->Clusters); + nmSysFree(cluster_data->Sims); + cluster_data->Clusters = NULL; + cluster_data->Sims = NULL; + } + + /** Free subclusters recursively. **/ + if (cluster_data->SubClusters != NULL) + { + if (recursive) + { + for (unsigned int i = 0u; i < cluster_data->nSubClusters; i++) + { + if (cluster_data->SubClusters[i] != NULL) + ci_FreeClusterData(cluster_data->SubClusters[i], recursive); + else continue; + cluster_data->SubClusters[i] = NULL; + } + } + nmSysFree(cluster_data->SubClusters); + cluster_data->SubClusters = NULL; + } + + /** Free the cluster data struct. **/ + nmFree(cluster_data, sizeof(ClusterData)); + cluster_data = NULL; + + return; + } + + +// LINK #functions +/** @param search_data A pSearchData struct, freed by this function. **/ +static void +ci_FreeSearchData(pSearchData search_data) + { + /** Guard segfault. **/ + if (search_data == NULL) + { + fprintf(stderr, "Warning: Call to ci_FreeSearchData(NULL);\n"); + return; + } + + /** Free attribute data. **/ + if (search_data->Name != NULL) + { + nmSysFree(search_data->Name); + search_data->Name = NULL; + } + + /** Free computed data. **/ + if (search_data->Dups != NULL) + { + for (unsigned int i = 0; i < search_data->nDups; i++) + { + nmFree(search_data->Dups[i], sizeof(Dup)); + search_data->Dups[i] = NULL; + } + nmSysFree(search_data->Dups); + search_data->Dups = NULL; + } + + /** Free the search data struct. **/ + nmFree(search_data, sizeof(SearchData)); + search_data = NULL; + + return; + } + + +// LINK #functions +/** @param node_data A pNodeData struct, freed by this function. **/ +static void +ci_FreeNodeData(pNodeData node_data) + { + /** Guard segfault. **/ + if (node_data == NULL) + { + fprintf(stderr, "Warning: Call to ci_FreeNodeData(NULL);\n"); + return; + } + + /** Free parsed params, if they exist. **/ + if (node_data->Params != NULL) + { + for (unsigned int i = 0u; i < node_data->nParams; i++) + { + if (node_data->Params[i] == NULL) break; + paramFree(node_data->Params[i]); + node_data->Params[i] = NULL; + } + nmSysFree(node_data->Params); + node_data->Params = NULL; + } + if (node_data->ParamList != NULL) + { + expFreeParamList(node_data->ParamList); + node_data->ParamList = NULL; + } + + /** Free parsed clusters, if they exist. **/ + if (node_data->ClusterDatas != NULL) + { + /*** This data is cached, so we should NOT free it! The caching system + *** is responsible for the memory. We only need to free the array + *** holding our pointers to said cached memory. + ***/ + nmSysFree(node_data->ClusterDatas); + node_data->ClusterDatas = NULL; + } + + /** Free parsed searches, if they exist. **/ + if (node_data->SearchDatas != NULL) + { + /*** This data is cached, so we should NOT free it! The caching system + *** is responsible for the memory. We only need to free the array + *** holding our pointers to said cached memory. + ***/ + nmSysFree(node_data->SearchDatas); + node_data->SearchDatas = NULL; + } + + /** Free data source, if one exists. **/ + /*** Note: SourceData is freed last since other free functions may need to + *** access information from this structure when freeing data. + *** (For example, nVector which is used to determine the size of the + *** label struct in each cluster.) + ***/ + if (node_data->SourceData != NULL) + { + /*** This data is cached, so we should NOT free it! The caching system + *** is responsible for the memory. We only need to free the array + *** holding our pointers to said cached memory. + ***/ + node_data->SourceData = NULL; + } + + /** Free the node data. **/ + nmFree(node_data, sizeof(NodeData)); + node_data = NULL; + + return; + } + +/** Frees all data in caches for all cluster driver instances. **/ +static void +ci_ClearCaches(void) + { + /*** Free caches in reverse of the order they are created in case + *** cached data relies on its source during the freeing process. + ***/ + check(xhClearKeySafe(&ClusterDriverCaches.SearchDataCache, ci_CacheFreeSearch, NULL)); /* Failure ignored. */ + check(xhClearKeySafe(&ClusterDriverCaches.ClusterDataCache, ci_CacheFreeCluster, NULL)); /* Failure ignored. */ + check(xhClearKeySafe(&ClusterDriverCaches.SourceDataCache, ci_CacheFreeSourceData, NULL)); /* Failure ignored. */ + + return; + } + + +/** ================ Deep Size Computation Functions ================ **/ +/** ANCHOR[id=sizing] **/ +// LINK #functions + +/*** Returns the deep size of a SourceData struct, including the size of all + *** allocated substructures. As far as I can tell, this is probably only + *** useful for cache management and debugging. + *** + *** Note that Key is ignored because it is a pointer to data managed by the + *** caching systems, so it is not technically part of the struct. + *** + *** @param source_data The source data struct to be queried. + *** @returns The size in bytes of the struct and all internal allocated data. + ***/ +static unsigned int +ci_SizeOfSourceData(pSourceData source_data) + { + /** Guard segfault. **/ + if (source_data == NULL) + { + fprintf(stderr, "Warning: Call to ci_SizeOfSourceData(NULL);\n"); + return 0u; + } + + unsigned int size = 0u; + if (source_data->Name != NULL) size += strlen(source_data->Name) * sizeof(char); + if (source_data->SourcePath != NULL) size += strlen(source_data->SourcePath) * sizeof(char); + if (source_data->KeyAttr != NULL) size += strlen(source_data->KeyAttr) * sizeof(char); + if (source_data->NameAttr != NULL) size += strlen(source_data->NameAttr) * sizeof(char); + if (source_data->Strings != NULL) + { + for (unsigned int i = 0u; i < source_data->nVectors; i++) + size += strlen(source_data->Strings[i]) * sizeof(char); + size += source_data->nVectors * sizeof(char*); + } + if (source_data->Vectors != NULL) + { + for (unsigned int i = 0u; i < source_data->nVectors; i++) + size += ca_sparse_len(source_data->Vectors[i]) * sizeof(int); + size += source_data->nVectors * sizeof(pVector); + } + size += sizeof(SourceData); + + return size; + } + + +// LINK #functions +/*** Returns the deep size of a ClusterData struct, including the size of all + *** allocated substructures. As far as I can tell, this is probably only + *** useful for cache management and debugging. + *** + *** Note that Key is ignored because it is a pointer to data managed by the + *** caching systems, so it is not technically part of the struct. + *** + *** @param cluster_data The cluster data struct to be queried. + *** @param recursive Whether to recursively free subclusters. + *** @returns The size in bytes of the struct and all internal allocated data. + ***/ +static unsigned int +ci_SizeOfClusterData(pClusterData cluster_data, bool recursive) + { + /** Guard segfault. **/ + if (cluster_data == NULL) + { + fprintf(stderr, "Warning: Call to ci_SizeOfClusterData(NULL, %s);\n", (recursive) ? "true" : "false"); + return 0u; + } + + unsigned int size = 0u; + if (cluster_data->Name != NULL) size += strlen(cluster_data->Name) * sizeof(char); + if (cluster_data->Clusters != NULL) + { + const unsigned int nVectors = cluster_data->SourceData->nVectors; + for (unsigned int i = 0u; i < cluster_data->nClusters; i++) + size += cluster_data->Clusters[i].Size * (sizeof(char*) + sizeof(pVector)); + size += nVectors * (sizeof(Cluster) + sizeof(double)); + } + if (cluster_data->SubClusters != NULL) + { + if (recursive) + { + for (unsigned int i = 0u; i < cluster_data->nSubClusters; i++) + size += ci_SizeOfClusterData(cluster_data->SubClusters[i], recursive); + } + size += cluster_data->nSubClusters * sizeof(void*); + } + size += sizeof(ClusterData); + + return size; + } + + +// LINK #functions +/*** Returns the deep size of a SearchData struct, including the size of all + *** allocated substructures. As far as I can tell, this is probably only + *** useful for cache management and debugging. + *** + *** Note that Key is ignored because it is a pointer to data managed by the + *** caching systems, so it is not technically part of the struct. + *** + *** @param search_data The search data struct to be queried. + *** @returns The size in bytes of the struct and all internal allocated data. + ***/ +static unsigned int +ci_SizeOfSearchData(pSearchData search_data) + { + /** Guard segfault. **/ + if (search_data == NULL) + { + fprintf(stderr, "Warning: Call to ci_SizeOfSearchData(NULL);\n"); + return 0u; + } + + unsigned int size = 0u; + if (search_data->Name != NULL) size += strlen(search_data->Name) * sizeof(char); + if (search_data->Dups != NULL) size += search_data->nDups * (sizeof(void*) + sizeof(Dup)); + size += sizeof(SearchData); + + return size; + } + + +/** ================ Computation Functions ================ **/ +/** ANCHOR[id=computation] **/ +// LINK #functions + +/*** Ensures that the source_data->Data has been fetched from the data source + *** and that source_data->nVectors has been computed from the fetched data. + *** + *** @attention - Promises that mssError() will be invoked on failure, so the + *** caller is not required to specify their own error message. + *** + *** @param source_data The pSourceData affected by the computation. + *** @param session The current session, used to open the data source. + *** @returns 0 if successful, or + *** -1 other value on failure. + ***/ +static int +ci_ComputeSourceData(pSourceData source_data, pObjSession session) + { + bool successful = false; + int ret; + pObject obj = NULL; + pObjQuery query = NULL; + XArray key_xarray = {0}; + XArray data_xarray = {0}; + XArray vector_xarray = {0}; + + /** Guard segfault. **/ + if (source_data == NULL) return -1; + + /** If the vectors are already computed, we're done. **/ + if (source_data->Vectors != NULL) return 0; + + /** Record the date and time. **/ + if (!check(objCurrentDate(&source_data->DateComputed))) goto end_free; + + /** Open the source path specified by the .cluster file. **/ + obj = objOpen(session, source_data->SourcePath, OBJ_O_RDONLY, 0600, "system/directory"); + if (obj == NULL) + { + mssErrorf(0, "Cluster", + "Failed to open object driver:\n" + " > Attribute: ['%s':'%s' : String]\n" + " > Source Path: %s\n", + source_data->KeyAttr, source_data->NameAttr, + source_data->SourcePath + ); + goto end_free; + } + + /** Generate a "query" for retrieving data. **/ + query = objOpenQuery(obj, NULL, NULL, NULL, NULL, 0); + if (query == NULL) + { + mssErrorf(0, "Cluster", + "Failed to open query:\n" + " > Attribute: ['%s':'%s' : String]\n" + " > Source Path: %s\n" + " > Driver Used: %s\n", + source_data->KeyAttr, source_data->NameAttr, + source_data->SourcePath, + obj->Driver->Name + ); + goto end_free; + } + + /** Initialize an xarray to store the retrieved data. **/ + // memset(&key_xarray, 0, sizeof(XArray)); + // memset(&data_xarray, 0, sizeof(XArray)); + // memset(&vector_xarray, 0, sizeof(XArray)); + if (!check(xaInit(&key_xarray, 64))) goto end_free; + if (!check(xaInit(&data_xarray, 64))) goto end_free; + if (!check(xaInit(&vector_xarray, 64))) goto end_free; + + /** Fetch data and build vectors. **/ + while (true) + { + pObject entry = objQueryFetch(query, O_RDONLY); + if (entry == NULL) break; /* Done. */ + + /** Data value: Type checking. **/ + const int data_datatype = objGetAttrType(entry, source_data->NameAttr); + if (data_datatype == -1) + { + mssErrorf(0, "Cluster", + "Failed to get type for %uth entry:\n" + " > Attribute: ['%s':'%s' : String]\n" + " > Source Path: %s\n" + " > Driver Used: %s\n", + vector_xarray.nItems, + source_data->KeyAttr, source_data->NameAttr, + source_data->SourcePath, + obj->Driver->Name + ); + goto end_free; + } + if (data_datatype != DATA_T_STRING) + { + mssErrorf(1, "Cluster", + "Type for %uth entry was not a string:\n" + " > Attribute: ['%s':'%s' : %s]\n" + " > Source Path: %s\n" + " > Driver Used: %s\n", + vector_xarray.nItems, + source_data->KeyAttr, source_data->NameAttr, objTypeToStr(data_datatype), + source_data->SourcePath, + obj->Driver->Name + ); + goto end_free; + } + + /** Data value: Get value from database. **/ + char* data; + ret = objGetAttrValue(entry, source_data->NameAttr, DATA_T_STRING, POD(&data)); + if (ret != 0) + { + mssErrorf(0, "Cluster", + "Failed to value for %uth entry:\n" + " > Attribute: ['%s':'%s' : String]\n" + " > Source Path: %s\n" + " > Driver Used: %s\n" + " > Error code: %d\n", + vector_xarray.nItems, + source_data->KeyAttr, source_data->NameAttr, + source_data->SourcePath, + obj->Driver->Name, + ret + ); + goto end_free; + } + + /** Skip empty strings. **/ + if (strlen(data) == 0) + { + check(fflush(stdout)); /* Failure ignored. */ + continue; + } + + /** Convert the string to a vector. **/ + pVector vector = ca_build_vector(data); + if (vector == NULL) + { + mssErrorf(1, "Cluster", "Failed to build vectors for string \"%s\".", data); + successful = false; + goto end_free; + } + if (ca_is_empty(vector)) + { + mssErrorf(1, "Cluster", "Vector building for string \"%s\" produced no character pairs.", data); + successful = false; + goto end_free; + } + if (ca_has_no_pairs(vector)) + { + /** Skip pVector with no pairs. **/ + check(fflush(stdout)); /* Failure ignored. */ + ca_free_vector(vector); + continue; + } + + + /** Key value: Type checking. **/ + const int key_datatype = objGetAttrType(entry, source_data->KeyAttr); + if (key_datatype == -1) + { + mssErrorf(0, "Cluster", + "Failed to get type for key on %uth entry:\n" + " > Attribute: ['%s':'%s' : String]\n" + " > Source Path: %s\n" + " > Driver Used: %s\n", + vector_xarray.nItems, + source_data->KeyAttr, source_data->NameAttr, + source_data->SourcePath, + obj->Driver->Name + ); + goto end_free; + } + if (key_datatype != DATA_T_STRING) + { + mssErrorf(1, "Cluster", + "Type for key on %uth entry was not a string:\n" + " > Attribute: ['%s':'%s' : %s]\n" + " > Source Path: %s\n" + " > Driver Used: %s\n", + vector_xarray.nItems, + source_data->KeyAttr, source_data->NameAttr, objTypeToStr(key_datatype), + source_data->SourcePath, + obj->Driver->Name + ); + goto end_free; + } + + /** key value: Get value from database. **/ + char* key; + ret = objGetAttrValue(entry, source_data->KeyAttr, DATA_T_STRING, POD(&key)); + if (ret != 0) + { + mssErrorf(0, "Cluster", + "Failed to value for key on %uth entry:\n" + " > Attribute: ['%s':'%s' : String]\n" + " > Source Path: %s\n" + " > Driver Used: %s\n" + " > Error code: %d\n", + vector_xarray.nItems, + source_data->KeyAttr, source_data->NameAttr, + source_data->SourcePath, + obj->Driver->Name, + ret + ); + goto end_free; + } + + /** Store values. **/ + char* key_dup = check_ptr(nmSysStrdup(key)); + if (key_dup == NULL) goto end_free; + char* data_dup = check_ptr(nmSysStrdup(data)); + if (data_dup == NULL) goto end_free; + if (!check_neg(xaAddItem(&key_xarray, (void*)key_dup))) goto end_free; + if (!check_neg(xaAddItem(&data_xarray, (void*)data_dup))) goto end_free; + if (!check_neg(xaAddItem(&vector_xarray, (void*)vector))) goto end_free; + + /** Clean up. **/ + ret = objClose(entry); + if (ret != 0) + { + mssErrorf(0, "Cluster", "Failed to close object entry (error code %d).", ret); + // success = false; // Fall-through: Failure ignored. + } + } + + source_data->nVectors = vector_xarray.nItems; + if (source_data->nVectors == 0) + { + mssErrorf(0, "Cluster", + "Data source path did not contain any valid data:\n" + " > Attribute: ['%s':'%s' : String]\n" + " > Source Path: %s\n" + " > Driver Used: %s\n", + vector_xarray.nItems, + source_data->KeyAttr, source_data->NameAttr, + source_data->SourcePath, + obj->Driver->Name + ); + } + + /** Trim and store keys. **/ + source_data->Keys = (char**)check_ptr(ci_xaToTrimmedArray(&key_xarray, 1)); + if (source_data->Keys == NULL) goto err_free; + key_xarray.nAlloc = 0; + + /** Trim and store data strings. **/ + source_data->Strings = (char**)check_ptr(ci_xaToTrimmedArray(&data_xarray, 1)); + if (source_data->Strings == NULL) goto err_free; + data_xarray.nAlloc = 0; + + /** Trim and store vectors. **/ + source_data->Vectors = (int**)check_ptr(ci_xaToTrimmedArray(&vector_xarray, 1)); + if (source_data->Vectors == NULL) goto err_free; + vector_xarray.nAlloc = 0; + + /** Success. **/ + successful = true; + goto end_free; + + err_free: + if (source_data->Keys != NULL) nmSysFree(source_data->Keys); + if (source_data->Strings != NULL) nmSysFree(source_data->Strings); + if (source_data->Vectors != NULL) nmSysFree(source_data->Vectors); + + end_free: + /** Clean up xarrays. **/ + if (key_xarray.nAlloc != 0) + { + for (unsigned int i = 0u; i < vector_xarray.nItems; i++) + { + char* key = key_xarray.Items[i]; + if (key != NULL) nmSysFree(key); + else break; + } + check(xaDeInit(&key_xarray)); /* Failure ignored. */ + } + if (data_xarray.nAlloc != 0) + { + for (unsigned int i = 0u; i < data_xarray.nItems; i++) + { + char* str = data_xarray.Items[i]; + if (str != NULL) nmSysFree(str); + else break; + } + check(xaDeInit(&data_xarray)); /* Failure ignored. */ + } + if (vector_xarray.nAlloc != 0) + { + for (unsigned int i = 0u; i < vector_xarray.nItems; i++) + { + pVector vec = vector_xarray.Items[i]; + if (vec != NULL) ca_free_vector(vec); + else break; + } + check(xaDeInit(&vector_xarray)); /* Failure ignored. */ + } + + /** Clean up query. **/ + if (query != NULL) + { + ret = objQueryClose(query); + if (ret != 0) + { + mssErrorf(0, "Cluster", "Failed to close query (error code %d).", ret); + // success = false; // Fall-through: Failure ignored. + } + } + + /** Clean up object. **/ + if (obj != NULL) + { + ret = objClose(obj); + if (ret != 0) + { + mssErrorf(0, "Cluster", "Failed to close object driver (error code %d).", ret); + // success = false; // Fall-through: Failure ignored. + } + } + + /** Print an error if the function failed. **/ + if (!successful) mssErrorf(0, "Cluster", "SourceData computation failed."); + + /** Return the function status code. **/ + return (successful) ? 0 : -1; + } + + +// LINK #functions +/*** Ensures that the cluster_data->Labels has been computed, running the + *** specified clustering algorithm if necessary. + *** + *** @attention - Promises that mssError() will be invoked on failure, so the + *** caller is not required to specify their own error message. + *** + *** @param cluster_data The pClusterData affected by the computation. + *** @param node_data The current pNodeData, used to get vectors to cluster. + *** @returns 0 if successful, or + *** -1 other value on failure. + ***/ +static int +ci_ComputeClusterData(pClusterData cluster_data, pNodeData node_data) + { + cluster_data->Sims = NULL; + cluster_data->Clusters = NULL; + + /** Guard segfaults. **/ + if (cluster_data == NULL || node_data == NULL) return -1; + + /** If the clusters are already computed, we're done. **/ + if (cluster_data->Clusters != NULL) return 0; + + /** Make source data available. **/ + pSourceData source_data = check_ptr(node_data->SourceData); + if (source_data == NULL) + { + mssErrorf(1, "Cluster", "Failed to get source data for cluster computation."); + goto err_free; + } + + /** We need the SourceData vectors to compute clusters. **/ + if (ci_ComputeSourceData(source_data, node_data->ParamList->Session) != 0) + { + mssErrorf(0, "Cluster", "ClusterData computation failed due to missing SourceData."); + goto err_free; + } + + /** Record the date and time. **/ + if (!check(objCurrentDate(&cluster_data->DateComputed))) goto err_free; + + /** Allocate static memory for finding clusters. **/ + const size_t clusters_size = cluster_data->nClusters * sizeof(Cluster); + cluster_data->Clusters = check_ptr(nmSysMalloc(clusters_size)); + if (cluster_data->Clusters == NULL) goto err_free; + memset(cluster_data->Clusters, 0, clusters_size); + const size_t sims_size = source_data->nVectors * sizeof(double); + cluster_data->Sims = check_ptr(nmSysMalloc(sims_size)); + if (cluster_data->Sims == NULL) goto err_free; + memset(cluster_data->Sims, 0, sims_size); + + /** Execute clustering. **/ + switch (cluster_data->ClusterAlgorithm) + { + case ALGORITHM_NONE: + { + /** Put all the data into one cluster. **/ + pCluster first_cluster = &cluster_data->Clusters[0]; + first_cluster->Size = source_data->nVectors; + first_cluster->Strings = check_ptr(nmSysMalloc(source_data->nVectors * sizeof(char*))); + if (first_cluster->Strings == NULL) goto err_free; + first_cluster->Vectors = check_ptr(nmSysMalloc(source_data->nVectors * sizeof(pVector))); + if (first_cluster->Vectors == NULL) goto err_free; + memcpy(first_cluster->Strings, source_data->Strings, source_data->nVectors * sizeof(char*)); + memcpy(first_cluster->Vectors, source_data->Vectors, source_data->nVectors * sizeof(pVector)); + break; + } + + case ALGORITHM_SLIDING_WINDOW: + /** Computed in each search for efficiency. **/ + memset(cluster_data->Clusters, 0, clusters_size); + break; + + case ALGORITHM_KMEANS: + { + /** Check for unimplemented similarity measures. **/ + if (cluster_data->SimilarityMeasure != SIMILARITY_COSINE) + { + mssErrorf(1, "Cluster", + "The similarity measure \"%s\" is not implemented.", + ci_SimilarityMeasureToString(cluster_data->SimilarityMeasure) + ); + goto err_free; + } + + /** Allocate lables. Note: kmeans does not require us to initialize them. **/ + const size_t lables_size = source_data->nVectors * sizeof(unsigned int); + unsigned int* labels = check_ptr(nmSysMalloc(lables_size)); + if (labels == NULL) goto err_free; + + /** Run kmeans. **/ + const bool successful = check(ca_kmeans( + source_data->Vectors, + source_data->nVectors, + cluster_data->nClusters, + cluster_data->MaxIterations, + cluster_data->MinImprovement, + labels, + cluster_data->Sims + )); + if (!successful) goto err_free; + + /** Convert the labels into clusters. **/ + + /** Allocate space for clusters. **/ + XArray indexes_in_cluster[cluster_data->nClusters]; + for (unsigned int i = 0u; i < cluster_data->nClusters; i++) + if (!check(xaInit(&indexes_in_cluster[i], 8))) goto err_free; + + /** Iterate through each label and add the index of the specified cluster to the xArray. **/ + for (unsigned long long i = 0llu; i < source_data->nVectors; i++) + if (!check_neg(xaAddItem(&indexes_in_cluster[labels[i]], (void*)i))) goto err_free; + nmSysFree(labels); /* Free unused data. */ + + /** Iterate through each cluster, store it, and free the xArray. **/ + for (unsigned int i = 0u; i < cluster_data->nClusters; i++) + { + pXArray indexes_in_this_cluster = &indexes_in_cluster[i]; + pCluster cluster = &cluster_data->Clusters[i]; + cluster->Size = indexes_in_this_cluster->nItems; + cluster->Strings = check_ptr(nmSysMalloc(cluster->Size * sizeof(char*))); + if (cluster->Strings == NULL) goto err_free; + cluster->Vectors = check_ptr(nmSysMalloc(cluster->Size * sizeof(pVector))); + if (cluster->Vectors == NULL) goto err_free; + for (unsigned int j = 0u; j < cluster->Size; j++) + { + const unsigned long long index = (unsigned long long)indexes_in_this_cluster->Items[j]; + cluster->Strings[j] = source_data->Strings[index]; + cluster->Vectors[j] = source_data->Vectors[index]; + } + check(xaDeInit(indexes_in_this_cluster)); /* Failure ignored. */ + } + + /** k-means done. **/ + break; + } + + default: + mssErrorf(1, "Cluster", + "Clustering algorithm \"%s\" is not implemented.", + ci_ClusteringAlgorithmToString(cluster_data->ClusterAlgorithm) + ); + goto err_free; + } + + /** Success. **/ + return 0; + + err_free: + if (cluster_data->Sims != NULL) nmFree(cluster_data->Sims, sims_size); + + if (cluster_data->Clusters != NULL) + { + for (unsigned int i = 0u; i < cluster_data->nClusters; i++) + { + pCluster cluster = &cluster_data->Clusters[i]; + if (cluster->Strings != NULL) nmFree(cluster->Strings, cluster->Size * sizeof(char*)); + else break; + if (cluster->Vectors != NULL) nmFree(cluster->Vectors, cluster->Size * sizeof(pVector)); + else break; + } + nmFree(cluster_data->Clusters, clusters_size); + } + + mssErrorf(0, "Cluster", "ClusterData computation failed for \"%s\".", cluster_data->Name); + + return -1; + } + + +// LINK #functions +/*** Ensures that the search_data->Dups has been computed, running the a + *** search with the specified similarity measure if necessary. + *** + *** @attention - Promises that mssError() will be invoked on failure, so the + *** caller is not required to specify their own error message. + *** + *** @param cluster_data The pClusterData affected by the computation. + *** @param node_data The current pNodeData, used to get vectors to cluster. + *** @returns 0 if successful, or + *** -1 other value on failure. + ***/ +static int +ci_ComputeSearchData(pSearchData search_data, pNodeData node_data) + { + pXArray dups = NULL; + + /** If the clusters are already computed, we're done. **/ + if (search_data->Dups != NULL) return 0; + + /** We need the cluster data to be computed before we search it. **/ + pClusterData cluster_data = check_ptr(search_data->SourceCluster); + if (cluster_data == NULL) + { + mssErrorf(1, "Cluster", "Failed to get cluster data for search computation."); + goto err_free; + } + if (ci_ComputeClusterData(cluster_data, node_data) != 0) + { + mssErrorf(0, "Cluster", "SearchData computation failed due to missing clusters."); + goto err_free; + } + + /** Record the date and time. **/ + if (!check(objCurrentDate(&search_data->DateComputed))) goto err_free; + + /** Execute the search using the specified source and comparison function. **/ + pXArray dups_temp = NULL; + switch (search_data->SimilarityMeasure) + { + case SIMILARITY_COSINE: + { + if (cluster_data->ClusterAlgorithm == ALGORITHM_SLIDING_WINDOW) + { + dups_temp = check_ptr(ca_sliding_search( + (void**)cluster_data->SourceData->Vectors, + cluster_data->SourceData->nVectors, + cluster_data->MaxIterations, /* Window size. */ + ca_cos_compare, + search_data->Threshold, + (void**)cluster_data->SourceData->Keys, + dups + )); + if (dups_temp == NULL) + { + mssErrorf(1, "Cluster", "Failed to compute sliding search with cosine similarity measure."); + goto err_free; + } + } + else + { + for (unsigned int i = 0u; i < cluster_data->nClusters; i++) + { + dups_temp = check_ptr(ca_complete_search( + (void**)cluster_data->Clusters[i].Vectors, + cluster_data->Clusters[i].Size, + ca_cos_compare, + search_data->Threshold, + (void**)cluster_data->SourceData->Keys, + dups + )); + if (dups_temp == NULL) + { + mssErrorf(1, "Cluster", "Failed to compute complete search with cosine similarity measure."); + goto err_free; + } + else dups = dups_temp; + } + } + break; + } + + case SIMILARITY_LEVENSHTEIN: + { + if (cluster_data->ClusterAlgorithm == ALGORITHM_SLIDING_WINDOW) + { + dups_temp = check_ptr(ca_sliding_search( + (void**)cluster_data->SourceData->Vectors, + cluster_data->SourceData->nVectors, + cluster_data->MaxIterations, /* Window size. */ + ca_lev_compare, + search_data->Threshold, + (void**)cluster_data->SourceData->Keys, + dups + )); + if (dups_temp == NULL) + { + mssErrorf(1, "Cluster", "Failed to compute sliding search with Levenstein similarity measure."); + goto err_free; + } + } + else + { + for (unsigned int i = 0u; i < cluster_data->nClusters; i++) + { + dups_temp = check_ptr(ca_complete_search( + (void**)cluster_data->Clusters[i].Strings, + cluster_data->Clusters[i].Size, + ca_lev_compare, + search_data->Threshold, + (void**)cluster_data->SourceData->Keys, + dups + )); + if (dups_temp == NULL) + { + mssErrorf(1, "Cluster", "Failed to compute complete search with Levenstein similarity measure."); + goto err_free; + } + else dups = dups_temp; + } + } + break; + } + + default: + mssErrorf(1, "Cluster", + "Unknown similarity meansure \"%s\".", + ci_SimilarityMeasureToString(search_data->SimilarityMeasure) + ); + goto err_free; + } + if (dups_temp == NULL) goto err_free; + else dups = dups_temp; + // fprintf(stderr, "Done searching, found %d dups.\n", dups->nItems); + + /** Store dups. **/ + search_data->nDups = dups->nItems; + search_data->Dups = (dups->nItems == 0) + ? check_ptr(nmSysMalloc(0)) + : ci_xaToTrimmedArray(dups, 2); + if (search_data->Dups == NULL) + { + mssErrorf(1, "Cluster", "Failed to store dups after computing search data."); + goto err_free; + } + + /** Success. **/ + return 0; + + err_free: + if (search_data->Dups != NULL) nmSysFree(search_data->Dups); + if (dups != NULL) + { + for (unsigned int i = 0u; i < dups->nItems; i++) + { + if (dups->Items[i] != NULL) nmFree(dups->Items[i], sizeof(Dup)); + else break; + } + check(xaFree(dups)); /* Failure ignored. */ + } + + mssErrorf(0, "Cluster", "SearchData computation failed for \"%s\".", search_data->Name); + + return -1; + } + + +/** ================ Parameter Functions ================ **/ +/** ANCHOR[id=params] **/ +// LINK #functions + +/*** Get the type of a parameter. Intended for expSetParamFunctions(). + *** + *** @param inf_v Node data containing the list of paramenters. + *** @param attr_name The name of the requested paramenter. + *** @returns The datatype, see datatypes.h for a list of valid datatypes. + *** + *** LINK ../../centrallix-lib/include/datatypes.h:72 + ***/ +static int +ci_GetParamType(void* inf_v, const char* attr_name) + { + pNodeData node_data = (pNodeData)inf_v; + + /** Find the parameter. **/ + for (unsigned int i = 0; i < node_data->nParams; i++) + { + pParam param = node_data->Params[i]; + if (strcmp(param->Name, attr_name) != 0) continue; + + /** Parameter found. **/ + return (param->Value == NULL) ? DATA_T_UNAVAILABLE : param->Value->DataType; + } + + /** Parameter not found. **/ + return DATA_T_UNAVAILABLE; + } + + +// LINK #functions +/*** Get the value of a parameter. Intended for `expSetParamFunctions()`. + *** + *** @attention - Warning: If the retrieved value is `NULL`, the pObjectData + *** val is not updated, and the function returns 1, indicating `NULL`. + *** This is intended behavior, for consistency with other Centrallix + *** functions, so keep it in mind so you're not surprised. + *** + *** @param inf_v Node data containing the list of paramenters. + *** @param attr_name The name of the requested paramenter. + *** @param datatype The expected datatype of the parameter value. + *** See datatypes.h for a list of valid datatypes. + *** @param val A pointer to a location where a pointer to the requested + *** data should be stored. Typically, the caller creates a local variable + *** to store this pointer, then passes a pointer to that local variable + *** so that they will have a pointer to the data. + *** This buffer will not be modified unless the data is successfully + *** found. If a value other than 0 is returned, the buffer is not updated. + *** @returns 0 if successful, + *** 1 if the variable is null, + *** -1 if an error occurs. + *** + *** LINK ../../centrallix-lib/include/datatypes.h:72 + ***/ +static int +ci_GetParamValue(void* inf_v, char* attr_name, int datatype, pObjData val) + { + pNodeData node_data = (pNodeData)inf_v; + + /** Find the parameter. **/ + for (unsigned int i = 0; i < node_data->nParams; i++) + { + pParam param = (pParam)node_data->Params[i]; + if (strcmp(param->Name, attr_name) != 0) continue; + + /** Parameter found. **/ + if (param->Value == NULL) return 1; + if (param->Value->Flags & DATA_TF_NULL) return 1; + if (param->Value->DataType != datatype) + { + mssErrorf(1, "Cluster", "Type mismatch accessing parameter '%s'.", param->Name); + return -1; + } + + /** Return param value. **/ + if (!check(objCopyData(&(param->Value->Data), val, datatype))) goto err; + return 0; + } + + err: + mssErrorf(1, "Cluster", + "Failed to get parameter ['%s' : %s]", + attr_name, objTypeToStr(datatype) + ); + + return -1; + } + +// LINK #functions +/** Not implemented. **/ +static int +ci_SetParamValue(void* inf_v, char* attr_name, int datatype, pObjData val) + { + mssErrorf(1, "Cluster", "SetParamValue() is not implemented because clusters are imutable."); + + return -1; + } + + +/** ================ Driver functions ================ **/ +/** ANCHOR[id=driver] **/ +// LINK #functions + +/*** Opens a new cluster driver instance by parsing a `.cluster` file found + *** at the path provided in parent. + *** + *** @param parent The parent of the object to be openned, including useful + *** information such as the pathname, session, etc. + *** @param mask Driver permission mask (unused). + *** @param sys_type ? (unused) + *** @param usr_type The object system file type being openned. Should always + *** be "system/cluster" because this driver is only registered for that + *** type of file. + *** @param oxt The object system tree, similar to a kind of "scope" (unused). + *** + *** @returns A pDriverData struct representing a driver instance, or + *** NULL if an error occurs. + ***/ +void* +clusterOpen(pObject parent, int mask, pContentType sys_type, char* usr_type, pObjTrxTree* oxt) + { + pNodeData node_data = NULL; + pDriverData driver_data = NULL; + + /** Update statistics. **/ + ClusterStatistics.OpenCalls++; + + /** If CREAT and EXCL are specified, exclusively create it, failing if the file already exists. **/ + pSnNode node_struct = NULL; + bool can_create = (parent->Mode & O_CREAT) && (parent->SubPtr == parent->Pathname->nElements); + if (can_create && (parent->Mode & O_EXCL)) + { + node_struct = snNewNode(parent->Prev, usr_type); + if (node_struct == NULL) + { + mssErrorf(0, "Cluster", "Failed to exclusively create new node struct."); + goto err_free; + } + } + + /** Read the node if it exists. **/ + if (node_struct == NULL) + node_struct = snReadNode(parent->Prev); + + /** If we can't read it, create it (if allowed). **/ + if (node_struct == NULL && can_create) + node_struct = snNewNode(parent->Prev, usr_type); + + /** If there still isn't a node, fail early. **/ + if (node_struct == NULL) + { + mssErrorf(0, "Cluster", "Failed to create node struct."); + goto err_free; + } + + /** Magic. **/ + ASSERTMAGIC(node_struct, MGK_STNODE); + ASSERTMAGIC(node_struct->Data, MGK_STRUCTINF); + + /** Parse node data from the node_struct. **/ + node_data = ci_ParseNodeData(node_struct->Data, parent); + if (node_data == NULL) + { + mssErrorf(0, "Cluster", "Failed to parse structure file \"%s\".", ci_file_name(parent)); + goto err_free; + } + + /** Allocate driver instance data. **/ + driver_data = check_ptr(nmMalloc(sizeof(DriverData))); + if (driver_data == NULL) goto err_free; + memset(driver_data, 0, sizeof(DriverData)); + driver_data->NodeData = node_data; + driver_data->NodeData->OpenCount++; + + /** Detect target from path. **/ + char* target_name = obj_internal_PathPart(parent->Pathname, parent->SubPtr + parent->SubCnt++, 1); + if (target_name == NULL) + { + /** Target found: Root **/ + driver_data->TargetType = TARGET_NODE; + driver_data->TargetData = (void*)driver_data->NodeData->SourceData; + goto success; + } + + /** Search clusters. **/ + for (unsigned int i = 0u; i < node_data->nClusterDatas; i++) + { + pClusterData cluster = node_data->ClusterDatas[i]; + if (strcmp(cluster->Name, target_name) != 0) continue; + + /** Target found: Cluster **/ + driver_data->TargetType = TARGET_CLUSTER; + + /** Check for sub-clusters in the path. **/ + while (true) + { + /** Decend one path part deeper into the path. **/ + const char* path_part = obj_internal_PathPart(parent->Pathname, parent->SubPtr + parent->SubCnt++, 1); + + /** If the path does not go any deeper, we're done. **/ + if (path_part == NULL) + { + driver_data->TargetData = (void*)cluster; + break; + } + + /** Need to go deeper: Search for the requested sub-cluster. **/ + for (unsigned int i = 0u; i < cluster->nSubClusters; i++) + { + pClusterData sub_cluster = cluster->SubClusters[i]; + if (strcmp(sub_cluster->Name, path_part) != 0) continue; + + /** Target found: Sub-cluster **/ + cluster = sub_cluster; + goto continue_descent; + } + + /** Path names sub-cluster that does not exist. **/ + mssErrorf(1, "Cluster", "Sub-cluster \"%s\" does not exist.", path_part); + goto err_free; + + continue_descent:; + } + goto success; + } + + /** Search searches. **/ + for (unsigned int i = 0u; i < node_data->nSearchDatas; i++) + { + pSearchData search = node_data->SearchDatas[i]; + if (strcmp(search->Name, target_name) != 0) continue; + + /** Target found: Search **/ + driver_data->TargetType = TARGET_SEARCH; + driver_data->TargetData = (void*)search; + + /** Check for extra, invalid path parts. **/ + char* extra_data = obj_internal_PathPart(parent->Pathname, parent->SubPtr + parent->SubCnt++, 1); + if (extra_data != NULL) + { + mssErrorf(1, "Cluster", "Unknown path part %s.", extra_data); + goto err_free; + } + return (void*)driver_data; /* Success. */ + } + + /** We were unable to find the requested cluster or search. **/ + mssErrorf(1, "Cluster", "\"%s\" is not the name of a declared cluster or search.", target_name); + + /** Attempt to give a hint. **/ + { + const unsigned int n_targets = node_data->nClusterDatas + node_data->nSearchDatas; + char* target_names[n_targets]; + for (unsigned int i = 0u; i < node_data->nClusterDatas; i++) + target_names[i] = node_data->ClusterDatas[i]->Name; + for (unsigned int i = 0u; i < node_data->nSearchDatas; i++) + target_names[i + node_data->nClusterDatas] = node_data->SearchDatas[i]->Name; + ci_TryHint(target_name, target_names, n_targets); + } + + /** Error cleanup. **/ + err_free: + if (node_data != NULL) ci_FreeNodeData(node_data); + if (driver_data != NULL) nmFree(driver_data, sizeof(DriverData)); + + mssErrorf(0, "Cluster", + "Failed to open cluster file \"%s\" at: %s", + ci_file_name(parent), ci_file_path(parent) + ); + + return NULL; + + success: + return driver_data; + } + + +// LINK #functions +/*** Close a cluster driver instance object, releasing any necessary memory + *** and closing any necessary underlying resources. However, most of that + *** data will be cached and won't be freed unless the cache is dropped. + *** + *** @param inf_v The affected driver instance. + *** @param oxt The object system tree, similar to a kind of "scope" (unused). + *** @returns 0, success. + ***/ +int +clusterClose(void* inf_v, pObjTrxTree* oxt) + { + pDriverData driver_data = (pDriverData)inf_v; + + /** Update statistics. **/ + ClusterStatistics.CloseCalls++; + + /** No work needed. **/ + if (driver_data == NULL) return 0; + + /** Unlink the driver's node data. **/ + pNodeData node_data = driver_data->NodeData; + if (node_data != NULL && --node_data->OpenCount == 0) + ci_FreeNodeData(driver_data->NodeData); + + /** Free driver data. **/ + nmFree(driver_data, sizeof(DriverData)); + + return 0; + } + + +// LINK #functions +/*** Opens a new query pointing to the first row of the data targetted by + *** the driver instance struct. The query has an internal index counter + *** that starts at the first row and increments as data is fetched. + *** + *** @param inf_v The driver instance to be queried. + *** @param query The query to use on this struct. This is assumed to be + *** handled elsewhere, so we don't read it here (unused). + *** @param oxt The object system tree, similar to a kind of "scope" (unused). + *** @returns The cluster query, or + *** NULL if an error occurs. + ***/ +void* +clusterOpenQuery(void* inf_v, pObjQuery query, pObjTrxTree* oxt) + { + pClusterQuery cluster_query = NULL; + pDriverData driver_data = inf_v; + + if (driver_data->TargetType != TARGET_SEARCH + && driver_data->TargetType != TARGET_CLUSTER + && driver_data->TargetType != TARGET_NODE) + { + /** Queries are not supported for this target type. **/ + return NULL; + } + + /** Update statistics. **/ + ClusterStatistics.OpenQueryCalls++; + + /** Allocate memory for the query. **/ + cluster_query = check_ptr(nmMalloc(sizeof(ClusterQuery))); + if (cluster_query == NULL) return NULL; + + /** Initialize the query. **/ + cluster_query->DriverData = (pDriverData)inf_v; + cluster_query->RowIndex = 0u; + + return cluster_query; + } + + +// LINK #functions +/*** Get the next entry as an open driver instance object. + *** + *** @param qy_v A query instance, storing an internal index which is + *** incremented once that data has been fetched. + *** @param obj Unused. + *** @param mode Unused. + *** @param oxt Unused. + *** @returns pDriverData that is either a cluster entry or search entry, + *** pointing to a specific target index into the relevant data. + *** OR NULL, indicating that all data has been fetched. + ***/ +void* +clusterQueryFetch(void* qy_v, pObject obj, int mode, pObjTrxTree* oxt) + { + pClusterQuery cluster_query = (pClusterQuery)qy_v; + pDriverData driver_data = cluster_query->DriverData; + pDriverData result_data = NULL; + + /** Update statistics. **/ + ClusterStatistics.FetchCalls++; + + /** Allocate result struct. **/ + result_data = check_ptr(nmMalloc(sizeof(DriverData))); + if (result_data == NULL) goto err; + + /** Default initialization. **/ + result_data->NodeData = driver_data->NodeData; + result_data->TargetData = driver_data->TargetData; + result_data->TargetType = 0; /* Unset. */ + result_data->TargetIndex = 0; /* Reset. */ + result_data->TargetAttrIndex = 0; /* Reset. */ + result_data->TargetMethodIndex = 0; /* Reset. */ + + /** Load node data. **/ + pNodeData node_data = driver_data->NodeData; + + /** Ensure that the data being fetched exists and is computed. **/ + const TargetType target_type = driver_data->TargetType; + switch (target_type) + { + case TARGET_NODE: + { + unsigned int index = cluster_query->RowIndex++; + + /** Iterate over clusters. **/ + const unsigned int n_cluster_datas = node_data->nClusterDatas; + if (index < n_cluster_datas) + { + /** Fetch a cluster. **/ + result_data->TargetType = TARGET_CLUSTER; + result_data->TargetData = node_data->ClusterDatas[index]; + break; + } + else index -= n_cluster_datas; + + /** Iterate over searches. **/ + const unsigned int n_search_datas = node_data->nSearchDatas; + if (index < n_search_datas) + { + /** Fetch a search. **/ + result_data->TargetType = TARGET_SEARCH; + result_data->TargetData = node_data->SearchDatas[index]; + break; + } + else index -= n_search_datas; + + /** Iteration complete. **/ + goto done; + } + + case TARGET_CLUSTER: + { + /** Ensure the required data is computed. **/ + pClusterData target = (pClusterData)driver_data->TargetData; + if (ci_ComputeClusterData(target, node_data) != 0) + { + mssErrorf(0, "Cluster", "Failed to compute ClusterData for query."); + goto err; + } + + /** Stop iteration if the requested data does not exist. **/ + if (cluster_query->RowIndex >= target->nClusters) goto done; + + /** Set the data being fetched. **/ + result_data->TargetType = TARGET_CLUSTER_ENTRY; + result_data->TargetIndex = cluster_query->RowIndex++; + + break; + } + + case TARGET_SEARCH: + { + /** Ensure the required data is computed. **/ + pSearchData target = (pSearchData)driver_data->TargetData; + if (ci_ComputeSearchData(target, node_data) != 0) + { + mssErrorf(0, "Cluster", "Failed to compute SearchData for query."); + goto err; + } + + /** Stop iteration if the requested data does not exist. **/ + if (cluster_query->RowIndex >= target->nDups) goto done; + + /** Set the data being fetched. **/ + result_data->TargetType = TARGET_SEARCH_ENTRY; + result_data->TargetIndex = cluster_query->RowIndex++; + + break; + } + + case TARGET_CLUSTER_ENTRY: + case TARGET_SEARCH_ENTRY: + mssErrorf(1, "Cluster", "Querying a query result is not allowed."); + goto err; + + default: + mssErrorf(1, "Cluster", "Unknown target type %u.", target_type); + goto err; + } + + /** Add a link to the NodeData so that it isn't freed while we're using it. **/ + node_data->OpenCount++; + + /** Success. **/ + return result_data; + + err: + mssErrorf(0, "Cluster", "Failed to fetch query result."); + + done: + if (result_data != NULL) nmFree(result_data, sizeof(DriverData)); + return NULL; + } + + +// LINK #functions +/*** Close a cluster query instance, releasing any necessary memory and + *** closing any necessary underlying resources. This does not close the + *** underlying driver instance, which must be closed with clusterClose(). + *** + *** @param qy_v The affected query instance. + *** @param oxt The object system tree, similar to a kind of "scope" (unused). + *** @returns 0, success. + ***/ +int +clusterQueryClose(void* qy_v, pObjTrxTree* oxt) + { + if (qy_v != NULL) nmFree(qy_v, sizeof(ClusterQuery)); + + return 0; + } + + +// LINK #functions +/*** Get the type of a cluster driver instance attribute. + *** + *** @param inf_v The driver instance. + *** @param attr_name The name of the requested attribute. + *** @param oxt The object system tree, similar to a kind of "scope" (unused). + *** @returns The datatype, see datatypes.h for a list of valid datatypes. + *** + *** LINK ../../centrallix-lib/include/datatypes.h:72 + ***/ +int +clusterGetAttrType(void* inf_v, char* attr_name, pObjTrxTree* oxt) + { + pDriverData driver_data = (pDriverData)inf_v; + + /** Update statistics. **/ + ClusterStatistics.GetTypeCalls++; + + /** Guard possible segfault. **/ + if (attr_name == NULL) + { + fprintf(stderr, "Warning: Call to clusterGetAttrType() with NULL attribute name.\n"); + return DATA_T_UNAVAILABLE; + } + + /** Performance shortcut for frequently requested attributes: key1, key2, and sim. **/ + if (attr_name[0] == 'k' || attr_name[0] == 's') goto handle_targets; + + /** Types for general attributes. **/ + if (strcmp(attr_name, "name") == 0 + || strcmp(attr_name, "annotation") == 0 + || strcmp(attr_name,"content_type") == 0 + || strcmp(attr_name, "inner_type") == 0 + || strcmp(attr_name,"outer_type") == 0) + return DATA_T_STRING; + if (strcmp(attr_name, "last_modification") == 0) + return DATA_T_DATETIME; + if (strcmp(attr_name, "date_created") == 0 + || strcmp(attr_name, "date_computed") == 0) + { + return (driver_data->TargetType == TARGET_CLUSTER + || driver_data->TargetType == TARGET_CLUSTER_ENTRY + || driver_data->TargetType == TARGET_SEARCH + || driver_data->TargetType == TARGET_SEARCH_ENTRY) + ? DATA_T_DATETIME /* Target has date attr. */ + : DATA_T_UNAVAILABLE; /* Target does not have date attr. */ + } + + /** Types for specific data targets. **/ + handle_targets: + switch (driver_data->TargetType) + { + case TARGET_NODE: + if (strcmp(attr_name, "source") == 0 + || strcmp(attr_name, "data_attr") == 0 + || strcmp(attr_name, "key_attr") == 0) + return DATA_T_STRING; + break; + + case TARGET_CLUSTER: + if (strcmp(attr_name, "algorithm") == 0 + || strcmp(attr_name, "similarity_measure") == 0) + return DATA_T_STRING; + if (strcmp(attr_name, "num_clusters") == 0 + || strcmp(attr_name, "max_iterations") == 0) + return DATA_T_INTEGER; + if (strcmp(attr_name, "min_improvement") == 0) + return DATA_T_DOUBLE; + break; + + case TARGET_SEARCH: + if (strcmp(attr_name, "source") == 0 + || strcmp(attr_name, "similarity_measure") == 0) + return DATA_T_STRING; + if (strcmp(attr_name, "threshold") == 0) + return DATA_T_DOUBLE; + break; + + case TARGET_CLUSTER_ENTRY: + if (strcmp(attr_name, "items") == 0) + return DATA_T_STRINGVEC; + break; + + case TARGET_SEARCH_ENTRY: + if (strcmp(attr_name, "key1") == 0 + || strcmp(attr_name, "key2") == 0) + return DATA_T_STRING; + if (strcmp(attr_name, "sim") == 0) + return DATA_T_DOUBLE; + break; + + default: + mssErrorf(1, "Cluster", "Unknown target type %u.", driver_data->TargetType); + return DATA_T_UNAVAILABLE; + } + + return DATA_T_UNAVAILABLE; + } + + +// LINK #functions +/*** Get the value of a cluster driver instance attribute. + *** + *** @param inf_v The driver instance to be read. + *** @param attr_name The name of the requested attribute. + *** @param datatype The expected datatype of the attribute value. + *** See datatypes.h for a list of valid datatypes. + *** @param oxt The object system tree, similar to a kind of "scope" (unused). + *** @param val A pointer to a location where a pointer to the requested + *** data should be stored. Typically, the caller creates a local variable + *** to store this pointer, then passes a pointer to that local variable + *** so that they will have a pointer to the data. + *** This buffer will not be modified unless the data is successfully + *** found. If a value other than 0 is returned, the buffer is not updated. + *** @returns 0 if successful, + *** -1 if an error occurs. + *** + *** LINK ../../centrallix-lib/include/datatypes.h:72 + ***/ +int +clusterGetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val, pObjTrxTree* oxt) + { + pDriverData driver_data = (pDriverData)inf_v; + + /** Update statistics. **/ + ClusterStatistics.GetValCalls++; + + /** Guard possible segfault. **/ + if (attr_name == NULL) + { + fprintf(stderr, "Warning: Call to clusterGetAttrType() with NULL attribute name.\n"); + return DATA_T_UNAVAILABLE; + } + + /** Performance shortcut for frequently requested attributes: key1, key2, and sim. **/ + if ((attr_name[0] == 'k' && datatype == DATA_T_STRING) /* key1, key2 : string */ + || (attr_name[0] == 's' && datatype == DATA_T_DOUBLE) /* sim : double */ + ) goto handle_targets; + + /** Type check. **/ + const int expected_datatype = clusterGetAttrType(inf_v, attr_name, oxt); + if (datatype != expected_datatype) + { + mssErrorf(1, "Cluster", + "Type mismatch: Accessing attribute ['%s' : %s] as type %s.", + attr_name, objTypeToStr(expected_datatype), objTypeToStr(datatype) + ); + return -1; + } + + /** Handle name. **/ + if (strcmp(attr_name, "name") == 0) + { + ClusterStatistics.GetValCalls_name++; + switch (driver_data->TargetType) + { + case TARGET_NODE: + val->String = ((pSourceData)driver_data->TargetData)->Name; + break; + + case TARGET_CLUSTER: + case TARGET_CLUSTER_ENTRY: + val->String = ((pClusterData)driver_data->TargetData)->Name; + break; + + case TARGET_SEARCH: + case TARGET_SEARCH_ENTRY: + val->String = ((pSearchData)driver_data->TargetData)->Name; + break; + + default: + mssErrorf(1, "Cluster", "Unknown target type %u.", driver_data->TargetType); + return -1; + } + + return 0; + } + + /** Handle annotation. **/ + if (strcmp(attr_name, "annotation") == 0) + { + switch (driver_data->TargetType) + { + case TARGET_NODE: val->String = "Clustering driver."; break; + case TARGET_CLUSTER: val->String = "Clustering driver: Cluster."; break; + case TARGET_CLUSTER_ENTRY: val->String = "Clustering driver: Cluster Entry."; break; + case TARGET_SEARCH: val->String = "Clustering driver: Search."; break; + case TARGET_SEARCH_ENTRY: val->String = "Clustering driver: Cluster Entry."; break; + + default: + mssErrorf(1, "Cluster", "Unknown target type %u.", driver_data->TargetType); + return -1; + } + return 0; + } + + /** Handle various types. **/ + if (strcmp(attr_name, "outer_type") == 0) + { + val->String = "system/row"; + return 0; + } + if (strcmp(attr_name, "content_type") == 0 + || strcmp(attr_name, "inner_type") == 0) + { + val->String = "system/void"; + return 0; + } + if (strcmp(attr_name, "internal_type") == 0) + { + switch (driver_data->TargetType) + { + case TARGET_NODE: val->String = "system/cluster"; break; + case TARGET_CLUSTER: val->String = "cluster/cluster"; break; + case TARGET_CLUSTER_ENTRY: val->String = "cluster/entry"; break; + case TARGET_SEARCH: val->String = "cluster/search"; break; + case TARGET_SEARCH_ENTRY: val->String = "search/entry"; break; + default: + mssErrorf(1, "Cluster", "Unknown target type %u.", driver_data->TargetType); + return -1; + } + + return 0; + } + + /** Last modification is not implemented. **/ + if (strcmp(attr_name, "last_modification") == 0) + { + if (driver_data->TargetType == TARGET_CLUSTER + || driver_data->TargetType == TARGET_CLUSTER_ENTRY + || driver_data->TargetType == TARGET_SEARCH + || driver_data->TargetType == TARGET_SEARCH_ENTRY) + goto date_computed; + else return 1; /* null */ + } + + /** Handle date_created. **/ + if (strcmp(attr_name, "date_created") == 0) + { + switch (driver_data->TargetType) + { + case TARGET_NODE: + /** Attribute is not defined for this target type. **/ + return -1; + + case TARGET_CLUSTER: + case TARGET_CLUSTER_ENTRY: + val->DateTime = &((pClusterData)driver_data->TargetData)->DateCreated; + return 0; + + case TARGET_SEARCH: + case TARGET_SEARCH_ENTRY: + val->DateTime = &((pSearchData)driver_data->TargetData)->DateCreated; + return 0; + } + return -1; + } + + /** Handle date_computed. **/ + if (strcmp(attr_name, "date_computed") == 0) + { + date_computed: + switch (driver_data->TargetType) + { + case TARGET_NODE: + /** Attribute is not defined for this target type. **/ + return -1; + + case TARGET_CLUSTER: + case TARGET_CLUSTER_ENTRY: + { + pClusterData target = (pClusterData)driver_data->TargetData; + pDateTime date_time = &target->DateComputed; + if (date_time->Value == 0) return 1; /* null */ + else val->DateTime = date_time; + return 0; + } + + case TARGET_SEARCH: + case TARGET_SEARCH_ENTRY: + { + pSearchData target = (pSearchData)driver_data->TargetData; + pDateTime date_time = &target->DateComputed; + if (date_time->Value == 0) return 1; /* null */ + else val->DateTime = date_time; + return 0; + } + } + + /** Default: Unknown type. **/ + mssErrorf(1, "Cluster", "Unknown target type %u.", driver_data->TargetType); + return -1; + } + + /** Handle attributes for specific data targets. **/ + handle_targets: + switch (driver_data->TargetType) + { + case TARGET_NODE: + if (strcmp(attr_name, "source") == 0) + { + /** TODO: THAT'S NOT A SOURCE DATA STRUCT!?!?!?!?!?!?!??!?!?!? */ + val->String = ((pSourceData)driver_data->TargetData)->SourcePath; + fprintf(stderr, "Got source: \"%s\"", val->String); + return 0; + } + if (strcmp(attr_name, "key_attr") == 0) + { + val->String = ((pSourceData)driver_data->TargetData)->KeyAttr; + return 0; + } + if (strcmp(attr_name, "name_attr") == 0) + { + val->String = ((pSourceData)driver_data->TargetData)->NameAttr; + return 0; + } + break; + + case TARGET_CLUSTER: + { + pClusterData target = (pClusterData)driver_data->TargetData; + + if (strcmp(attr_name, "algorithm") == 0) + { + val->String = ci_ClusteringAlgorithmToString(target->ClusterAlgorithm); + return 0; + } + if (strcmp(attr_name, "similarity_measure") == 0) + { + val->String = ci_SimilarityMeasureToString(target->SimilarityMeasure); + return 0; + } + if (strcmp(attr_name, "num_clusters") == 0) + { + if (target->nClusters > INT_MAX) + fprintf(stderr, "Warning: 'num_clusters' value of %u exceeds INT_MAX (%d).\n", target->nClusters, INT_MAX); + val->Integer = (int)target->nClusters; + return 0; + } + if (strcmp(attr_name, "max_iterations") == 0) + { + if (target->MaxIterations > INT_MAX) + fprintf(stderr, "Warning: 'max_iterations' value of %u exceeds INT_MAX (%d).\n", target->MaxIterations, INT_MAX); + val->Integer = (int)target->MaxIterations; + return 0; + } + if (strcmp(attr_name, "min_improvement") == 0) + { + val->Double = target->MinImprovement; + return 0; + } + break; + } + + case TARGET_SEARCH: + { + pSearchData target = (pSearchData)driver_data->TargetData; + + if (strcmp(attr_name, "source") == 0) + { + val->String = target->SourceCluster->Name; + return 0; + } + if (strcmp(attr_name, "similarity_measure") == 0) + { + val->String = ci_SimilarityMeasureToString(target->SimilarityMeasure); + return 0; + } + if (strcmp(attr_name, "threshold") == 0) + { + val->Double = target->Threshold; + return 0; + } + } + + case TARGET_CLUSTER_ENTRY: + { + pClusterData target = (pClusterData)driver_data->TargetData; + pCluster target_cluster = &target->Clusters[driver_data->TargetIndex]; + + if (strcmp(attr_name, "items") == 0) + { + /** Static variable to prevent leaking StringVec from previous calls. **/ + static StringVec* vec = NULL; + if (vec != NULL) nmFree(vec, sizeof(StringVec)); + + /** Allocate and initialize the requested data. **/ + val->StringVec = vec = check_ptr(nmMalloc(sizeof(StringVec))); + if (val->StringVec == NULL) return -1; + val->StringVec->nStrings = target_cluster->Size; + val->StringVec->Strings = target_cluster->Strings; + + /** Success. **/ + return 0; + } + break; + } + + case TARGET_SEARCH_ENTRY: + { + pSearchData target = (pSearchData)driver_data->TargetData; + pDup target_dup = target->Dups[driver_data->TargetIndex]; + + if (strcmp(attr_name, "sim") == 0) + { + ClusterStatistics.GetValCalls_sim++; + val->Double = target_dup->similarity; + return 0; + } + if (strcmp(attr_name, "key1") == 0) + { + ClusterStatistics.GetValCalls_key1++; + val->String = target_dup->key1; + return 0; + } + if (strcmp(attr_name, "key2") == 0) + { + ClusterStatistics.GetValCalls_key2++; + val->String = target_dup->key2; + return 0; + } + break; + } + + default: + mssErrorf(1, "Cluster", "Unknown target type %u.", driver_data->TargetType); + return -1; + } + + /** Unknown attribute. **/ + char* name; + clusterGetAttrValue(inf_v, "name", DATA_T_STRING, POD(&name), NULL); + mssErrorf(1, "Cluster", + "Unknown attribute '%s' for cluster object %s (target type: %u, \"%s\").", + attr_name, driver_data->NodeData->SourceData->Name, driver_data->TargetType, name + ); + + return -1; + } + + +// LINK #functions +/*** Create a new presentation hints object, describing this attribute on the + *** provided cluster driver instance. + *** + *** Note: Failures from nmSysStrdup() and several others are ignored because + *** the worst case scenario is that the attributes are set to null, which + *** will cause them to be ignored. I consider that to be better than than + *** throwing an error that could unnecessarily disrupt normal usage. + *** + *** @param inf_v The driver instance to be read. + *** @param attr_name The name of the requested attribute. + *** @param oxt The object system tree, similar to a kind of "scope" (unused). + *** @returns A presentation hints object, if successful, + *** NULL if an error occurs. + ***/ +pObjPresentationHints +clusterPresentationHints(void* inf_v, char* attr_name, pObjTrxTree* oxt) + { + pDriverData driver_data = (pDriverData)inf_v; + pObjPresentationHints hints = NULL; + pParamObjects tmp_list = NULL; + + /** Malloc presentation hints struct. **/ + hints = check_ptr(nmMalloc(sizeof(ObjPresentationHints))); + if (hints == NULL) goto err_free; + memset(hints, 0, sizeof(ObjPresentationHints)); + + /** Hints that are the same for all attributes. **/ + hints->GroupID = -1; + hints->VisualLength2 = 1; + hints->Style |= OBJ_PH_STYLE_READONLY | OBJ_PH_STYLE_CREATEONLY | OBJ_PH_STYLE_NOTNULL; + hints->StyleMask |= OBJ_PH_STYLE_READONLY | OBJ_PH_STYLE_CREATEONLY | OBJ_PH_STYLE_NOTNULL; + + /** Temporary param list for compiling expressions. **/ + tmp_list = check_ptr(expCreateParamList()); + if (hints == NULL) goto err_free; + + /** Search for the requested attribute through attributes common to all instances. **/ + if (strcmp(attr_name, "name") == 0) + { + hints->Length = 32; + hints->VisualLength = 16; + goto end; + } + if (strcmp(attr_name, "annotation") == 0) + { + hints->Length = 36; + hints->VisualLength = 36; + goto end; + } + if (strcmp(attr_name, "inner_type") == 0 + || strcmp(attr_name, "inner_type") == 0 + || strcmp(attr_name, "outer_type") == 0 + || strcmp(attr_name, "content_type") == 0 + || strcmp(attr_name, "last_modification") == 0) + { + hints->VisualLength = 30; + goto end; + } + + /** Handle date created and date computed. */ + if (strcmp(attr_name, "date_created") == 0 + || strcmp(attr_name, "date_computed") == 0) + { + if (driver_data->TargetType == TARGET_CLUSTER + || driver_data->TargetType == TARGET_CLUSTER_ENTRY + || driver_data->TargetType == TARGET_SEARCH + || driver_data->TargetType == TARGET_SEARCH_ENTRY) + { + hints->Length = 24; + hints->VisualLength = 20; + hints->Format = check_ptr(nmSysStrdup("datetime")); /* Failure ignored. */ + goto end; + } + else goto unknown_attribute; + } + + /** Search by target type. **/ + switch (driver_data->TargetType) + { + case TARGET_NODE: + if (strcmp(attr_name, "source") == 0) + { + hints->Length = _PC_PATH_MAX; + hints->VisualLength = 64; + hints->FriendlyName = check_ptr(nmSysStrdup("Source Path")); /* Failure ignored. */ + goto end; + } + if (strcmp(attr_name, "key_attr") == 0) + { + hints->Length = 255; + hints->VisualLength = 32; + hints->FriendlyName = check_ptr(nmSysStrdup("Key Attribute Name")); /* Failure ignored. */ + goto end; + } + if (strcmp(attr_name, "data_attr") == 0) + { + hints->Length = 255; + hints->VisualLength = 32; + hints->FriendlyName = check_ptr(nmSysStrdup("Data Attribute Name")); /* Failure ignored. */ + goto end; + } + break; + + case TARGET_CLUSTER: + if (strcmp(attr_name, "num_clusters") == 0) + { + /** Min and max values. **/ + hints->MinValue = expCompileExpression("2", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + hints->MaxValue = expCompileExpression("2147483647", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + + /** Other hints. **/ + hints->Length = 8; + hints->VisualLength = 4; + hints->FriendlyName = check_ptr(nmSysStrdup("Number of Clusters")); /* Failure ignored. */ + goto end; + } + if (strcmp(attr_name, "min_improvement") == 0) + { + /** Min and max values. **/ + hints->DefaultExpr = expCompileExpression("0.0001", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + hints->MinValue = expCompileExpression("0.0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + hints->MaxValue = expCompileExpression("1.0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + + /** Other hints. **/ + hints->Length = 16; + hints->VisualLength = 8; + hints->FriendlyName = check_ptr(nmSysStrdup("Minimum Improvement Threshold")); /* Failure ignored. */ + goto end; + } + if (strcmp(attr_name, "max_iterations") == 0) + { + /** Min and max values. **/ + hints->DefaultExpr = expCompileExpression("64", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + hints->MinValue = expCompileExpression("0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + hints->MaxValue = expCompileExpression("2147483647", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + + /** Other hints. **/ + hints->Length = 8; + hints->VisualLength = 4; + hints->FriendlyName = check_ptr(nmSysStrdup("Maximum Iterations")); /* Failure ignored. */ + goto end; + } + if (strcmp(attr_name, "algorithm") == 0) + { + /** Enum values. **/ + check(xaInit(&(hints->EnumList), nClusteringAlgorithms)); /* Failure ignored. */ + for (unsigned int i = 0u; i < nClusteringAlgorithms; i++) + check_neg(xaAddItem(&(hints->EnumList), &ALL_CLUSTERING_ALGORITHMS[i])); /* Failure ignored. */ + + /** Min and max values. **/ + hints->MinValue = expCompileExpression("0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + char buf[8]; + snprintf(buf, sizeof(buf), "%d", nClusteringAlgorithms); + hints->MaxValue = expCompileExpression(buf, tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + + /** Display flags. **/ + hints->Style |= OBJ_PH_STYLE_BUTTONS; + hints->StyleMask |= OBJ_PH_STYLE_BUTTONS; + + /** Other hints. **/ + hints->Length = 24; + hints->VisualLength = 20; + hints->FriendlyName = check_ptr(nmSysStrdup("Clustering Algorithm")); /* Failure ignored. */ + goto end; + } + /** Fall-through: Start of overlapping region. **/ + + case TARGET_SEARCH: + if (strcmp(attr_name, "similarity_measure") == 0) + { + /** Enum values. **/ + check(xaInit(&(hints->EnumList), nSimilarityMeasures)); /* Failure ignored. */ + for (unsigned int i = 0u; i < nSimilarityMeasures; i++) + check_neg(xaAddItem(&(hints->EnumList), &ALL_SIMILARITY_MEASURES[i])); /* Failure ignored. */ + + /** Display flags. **/ + hints->Style |= OBJ_PH_STYLE_BUTTONS; + hints->StyleMask |= OBJ_PH_STYLE_BUTTONS; + + /** Min and max values. **/ + hints->MinValue = expCompileExpression("0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + char buf[8]; + snprintf(buf, sizeof(buf), "%d", nSimilarityMeasures); + hints->MaxValue = expCompileExpression(buf, tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + + /** Other hints. **/ + hints->Length = 32; + hints->VisualLength = 20; + hints->FriendlyName = check_ptr(nmSysStrdup("Similarity Measure")); /* Failure ignored. */ + goto end; + } + + /** End of overlapping region. **/ + if (driver_data->TargetType == TARGET_CLUSTER) break; + + if (strcmp(attr_name, "source") == 0) + { + hints->Length = 64; + hints->VisualLength = 32; + hints->FriendlyName = check_ptr(nmSysStrdup("Source Cluster Name")); /* Failure ignored. */ + goto end; + } + if (strcmp(attr_name, "threshold") == 0) + { + /** Min and max values. **/ + hints->MinValue = expCompileExpression("0.0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + hints->MaxValue = expCompileExpression("1.0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + + /** Other hints. **/ + hints->Length = 16; + hints->VisualLength = 8; + hints->FriendlyName = check_ptr(nmSysStrdup("Similarity Threshold")); /* Failure ignored. */ + goto end; + } + break; + + case TARGET_CLUSTER_ENTRY: + { + pClusterData target = (pClusterData)check_ptr(driver_data->TargetData); + if (target == NULL) goto err_free; + + if (strcmp(attr_name, "items") == 0) + { + /** Other hints. **/ + hints->Length = 65536; + hints->VisualLength = 256; + hints->FriendlyName = check_ptr(nmSysStrdup("Cluster Data")); /* Failure ignored. */ + goto end; + } + if (strcmp(attr_name, "sim") == 0) + { + /** Min and max values. **/ + hints->MinValue = expCompileExpression("0.0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + hints->MaxValue = expCompileExpression("1.0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + + /** Other hints. **/ + hints->Length = 16; + hints->VisualLength = 8; + hints->FriendlyName = check_ptr(nmSysStrdup("Similarity")); /* Failure ignored. */ + goto end; + } + break; + } + + case TARGET_SEARCH_ENTRY: + { + pSearchData target = (pSearchData)check_ptr(driver_data->TargetData); + if (target == NULL) goto err_free; + + if (strcmp(attr_name, "key1") == 0) + { + hints->Length = 255; + hints->VisualLength = 32; + hints->FriendlyName = check_ptr(nmSysStrdup("Key 1")); /* Failure ignored. */ + goto end; + } + if (strcmp(attr_name, "key2") == 0) + { + hints->Length = 255; + hints->VisualLength = 32; + hints->FriendlyName = check_ptr(nmSysStrdup("Key 2")); /* Failure ignored. */ + goto end; + } + if (strcmp(attr_name, "sim") == 0) + { + /** Min and max values. **/ + hints->MinValue = expCompileExpression("0.0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + hints->MaxValue = expCompileExpression("1.0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + + /** Other hints. **/ + hints->Length = 16; + hints->VisualLength = 8; + hints->FriendlyName = check_ptr(nmSysStrdup("Similarity")); /* Failure ignored. */ + goto end; + } + break; + } + + default: + mssErrorf(1, "Cluster", "Unknown target type %u.", driver_data->TargetType); + goto err_free; + } + + /** Unknown attribute. **/ + unknown_attribute:; + mssErrorf(1, "Cluster", "Unknown attribute '%s'.", attr_name); + + /** Error cleanup. **/ + err_free: + if (hints != NULL) nmFree(hints, sizeof(ObjPresentationHints)); + hints = NULL; + + /** Construct the clearest error message that we can. **/ + char* name = NULL; + char* internal_type = NULL; + check(clusterGetAttrValue(inf_v, "name", DATA_T_STRING, POD(&name), NULL)); /* Failure ignored. */ + check(clusterGetAttrValue(inf_v, "internal_type", DATA_T_STRING, POD(&internal_type), NULL)); /* Failure ignored. */ + mssErrorf(0, "Cluster", + "Failed to get presentation hints for object '%s' : \"%s\".", + name, internal_type + ); + + end: + if (tmp_list != NULL) check(expFreeParamList(tmp_list)); /* Failure ignored. */ + + return hints; + } + + +// LINK #functions +/*** Returns the name of the first attribute that one can get from + *** this driver instance (using GetAttrType() and GetAttrValue()). + *** Resets the internal variable (TargetAttrIndex) used to maintain + *** iteration state for clusterGetNextAttr(). + *** + *** @param inf_v The driver instance to be read. + *** @param oxt Unused. + *** @returns The name of the first attribute. + ***/ +char* +clusterGetFirstAttr(void* inf_v, pObjTrxTree* oxt) + { + pDriverData driver_data = (pDriverData)inf_v; + + driver_data->TargetAttrIndex = 0u; + + return clusterGetNextAttr(inf_v, oxt); + } + + +// LINK #functions +/*** Returns the name of the next attribute that one can get from + *** this driver instance (using GetAttrType() and GetAttrValue()). + *** Uses an internal variable (TargetAttrIndex) used to maintain + *** the state of this iteration over repeated calls. + *** + *** @param inf_v The driver instance to be read. + *** @param oxt Unused. + *** @returns The name of the next attribute. + ***/ +char* +clusterGetNextAttr(void* inf_v, pObjTrxTree* oxt) + { + pDriverData driver_data = (pDriverData)inf_v; + + const unsigned int i = driver_data->TargetAttrIndex++; + switch (driver_data->TargetType) + { + case TARGET_NODE: return ATTR_ROOT[i]; + case TARGET_CLUSTER: return ATTR_CLUSTER[i]; + case TARGET_SEARCH: return ATTR_SEARCH[i]; + case TARGET_CLUSTER_ENTRY: return ATTR_CLUSTER_ENTRY[i]; + case TARGET_SEARCH_ENTRY: return ATTR_SEARCH_ENTRY[i]; + default: + mssErrorf(1, "Cluster", "Unknown target type %u.", driver_data->TargetType); + return NULL; + } + + return; /* Unreachable. */ + } + + +// LINK #functions +/*** Get the capabilities of the driver instance object. + *** + *** @param inf_v The driver instance to be checked. + *** @param info The struct to be populated with driver flags. + *** @returns 0 if successful, + *** -1 if the driver is an unimplemented type (should never happen). + ***/ +int +clusterInfo(void* inf_v, pObjectInfo info) + { + pDriverData driver_data = (pDriverData)inf_v; + pNodeData node_data = (pNodeData)driver_data->NodeData; + + /** Reset flags buffer. **/ + info->Flags = 0; + + /** Disallow unsupported functionality. **/ + info->Flags |= OBJ_INFO_F_CANT_ADD_ATTR; + info->Flags |= OBJ_INFO_F_CANT_HAVE_CONTENT; + info->Flags |= OBJ_INFO_F_NO_CONTENT; + + switch (driver_data->TargetType) + { + case TARGET_NODE: + info->nSubobjects = node_data->nClusterDatas + node_data->nSearchDatas; + info->Flags |= OBJ_INFO_F_CAN_HAVE_SUBOBJ; + info->Flags |= OBJ_INFO_F_SUBOBJ_CNT_KNOWN; + info->Flags |= (info->nSubobjects > 0) ? OBJ_INFO_F_HAS_SUBOBJ : OBJ_INFO_F_NO_SUBOBJ; + break; + + case TARGET_CLUSTER: + info->Flags |= OBJ_INFO_F_CAN_HAVE_SUBOBJ; + info->Flags |= OBJ_INFO_F_HAS_SUBOBJ; /* Data must not be empty. */ + + /*** Clusters always have one label per vector. + *** If we know how many vectors are in the dataset, + *** we know how many labels this cluster will have, + *** even if it hasn't been computed yet. + ***/ + if (node_data->SourceData->Vectors != NULL) + { + info->Flags |= OBJ_INFO_F_SUBOBJ_CNT_KNOWN; + info->nSubobjects = node_data->SourceData->nVectors; + } + break; + + case TARGET_SEARCH: + { + pSearchData search_data = (pSearchData)driver_data->TargetData; + info->Flags |= OBJ_INFO_F_CAN_HAVE_SUBOBJ; + if (search_data->Dups != NULL) + { + info->nSubobjects = search_data->nDups; + info->Flags |= OBJ_INFO_F_SUBOBJ_CNT_KNOWN; + info->Flags |= (info->nSubobjects > 0) ? OBJ_INFO_F_HAS_SUBOBJ : OBJ_INFO_F_NO_SUBOBJ; + } + break; + } + + case TARGET_CLUSTER_ENTRY: + case TARGET_SEARCH_ENTRY: + /** No Subobjects. **/ + info->Flags |= OBJ_INFO_F_CANT_HAVE_SUBOBJ; + info->Flags |= OBJ_INFO_F_NO_SUBOBJ; + info->Flags |= OBJ_INFO_F_SUBOBJ_CNT_KNOWN; + info->nSubobjects = 0; + break; + + default: + mssErrorf(1, "Cluster", "Unknown target type %u.", driver_data->TargetType); + goto err; + } + + return 0; + + err: + mssErrorf(0, "Cluster", "Failed execute get info."); + return -1; + } + + +/** ================ Method Execution Functions ================ **/ +/** ANCHOR[id=method] **/ +// LINK #functions + +/*** Returns the name of the first method that one can execute from + *** this driver instance (using clusterExecuteMethod()). Resets the + *** internal variable (TargetMethodIndex) used to maintain iteration + *** state for clusterGetNextMethod(). + *** + *** @param inf_v The driver instance to be read. + *** @param oxt Unused. + *** @returns The name of the first method. + ***/ +char* +clusterGetFirstMethod(void* inf_v, pObjTrxTree* oxt) + { + pDriverData driver_data = (pDriverData)inf_v; + + driver_data->TargetMethodIndex = 0u; + + return clusterGetNextMethod(inf_v, oxt); + } + + +// LINK #functions +/*** Returns the name of the next method that one can get from + *** this driver instance (using GetAttrType() and GetAttrValue()). + *** Uses an internal variable (TargetMethodIndex) used to maintain + *** the state of this iteration over repeated calls. + *** + *** @param inf_v The driver instance to be read. + *** @param oxt Unused. + *** @returns The name of the next method. + ***/ +char* +clusterGetNextMethod(void* inf_v, pObjTrxTree* oxt) + { + pDriverData driver_data = (pDriverData)inf_v; + + return METHOD_NAMES[driver_data->TargetMethodIndex++]; + } + + +// LINK #functions +/** Intended for use in xhForEach(). **/ +static int +ci_PrintEntry(pXHashEntry entry, void* arg) + { + /** Extract entry. **/ + char* key = entry->Key; + void* data = entry->Data; + + /** Extract args. **/ + void** args = (void**)arg; + unsigned int* type_id_ptr = (unsigned int*)args[0]; + unsigned int* total_bytes_ptr = (unsigned int*)args[1]; + unsigned long long* less_ptr = (unsigned long long*)args[2]; + char* path = (char*)args[3]; + + /** If a path is provided, check that it matches the start of the key. **/ + if (path != NULL && strncmp(key, (char*)path, strlen((char*)path)) != 0) return 0; + + /** Handle type. **/ + char* type; + char* name; + unsigned int bytes; + switch (*type_id_ptr) + { + case 1u: + { + pSourceData source_data = (pSourceData)data; + + /** Compute size. **/ + bytes = ci_SizeOfSourceData(source_data); + + /** If less is specified, skip uncomputed source. **/ + if (*less_ptr > 0llu && source_data->Vectors == NULL) goto no_print; + + /** Compute printing information. **/ + type = "Source"; + name = source_data->Name; + break; + } + case 2u: + { + pClusterData cluster_data = (pClusterData)data; + + /** Compute size. **/ + bytes = ci_SizeOfClusterData(cluster_data, false); + + /** If less is specified, skip uncomputed source. **/ + if (*less_ptr > 0llu && cluster_data->Clusters == NULL) goto no_print; + + /** Compute printing information. **/ + type = "Cluster"; + name = cluster_data->Name; + break; + } + case 3u: + { + pSearchData search_data = (pSearchData)data; + + /** Compute size. **/ + bytes = ci_SizeOfSearchData(search_data); + + /** If less is specified, skip uncomputed source. **/ + if (*less_ptr > 0llu && search_data->Dups == NULL) goto no_print; + + /** Compute printing information. **/ + type = "Search"; + name = search_data->Name; + break; + } + default: + mssErrorf(0, "Cluster", "Unknown type_id %u.", *type_id_ptr); + return -1; + } + + /** Print the cache entry data. **/ + char buf[12]; + snprint_bytes(buf, sizeof(buf), bytes); + printf("%-8s %-16s %-12s \"%s\"\n", type, name, buf, key); + goto increment_total; + + no_print: + (*less_ptr)++; + + increment_total: + *total_bytes_ptr += bytes; + + return 0; + } + + +// LINK #functions +/** Intended for use in xhClearKeySafe(). **/ +static void +ci_CacheFreeSourceData(pXHashEntry entry, void* path) + { + /** Extract hash entry. **/ + char* key = entry->Key; + pSourceData source_data = (pSourceData)entry->Data; + + /** If a path is provided, check that it matches the start of the key. **/ + if (path != NULL && strncmp(key, (char*)path, strlen((char*)path)) != 0) return; + + /** Free data. **/ + ci_FreeSourceData(source_data); + nmSysFree(key); + + return; + } + + +// LINK #functions +/** Intended for use in xhClearKeySafe(). **/ +static void +ci_CacheFreeCluster(pXHashEntry entry, void* path) + { + /** Extract hash entry. **/ + char* key = entry->Key; + pClusterData cluster_data = (pClusterData)entry->Data; + + /** If a path is provided, check that it matches the start of the key. **/ + if (path != NULL && strncmp(key, (char*)path, strlen((char*)path)) != 0) return; + + /** Free data. **/ + ci_FreeClusterData(cluster_data, false); + nmSysFree(key); + + return; + } + + +// LINK #functions +/** Intended for use in xhClearKeySafe(). **/ +static void +ci_CacheFreeSearch(pXHashEntry entry, void* path) + { + /** Extract hash entry. **/ + char* key = entry->Key; + pSearchData search_data = (pSearchData)entry->Data; + + /** If a path is provided, check that it matches the start of the key. **/ + if (path != NULL && strncmp(key, (char*)path, strlen((char*)path)) != 0) return; + + /** Free data. **/ + ci_FreeSearchData(search_data); + nmSysFree(key); + + return; + } + + +// LINK #functions +/*** Executes a method with the given name. + *** + *** @param inf_v The affected driver instance. + *** @param method_name The name of the method. + *** @param param A possibly optional param passed to the method. + *** @param oxt The object system tree, similar to a kind of "scope" (unused). + ***/ +int +clusterExecuteMethod(void* inf_v, char* method_name, pObjData param, pObjTrxTree* oxt) + { + pDriverData driver_data = (pDriverData)inf_v; + + /** Cache management method. **/ + if (strcmp(method_name, "cache") == 0) + { + char* path = NULL; + + /** Second parameter is required. **/ + if (param->String == NULL) + { + mssErrorf(1, "Cluster", + "[param : \"show\" | \"show_less\" | \"show_all\" | \"drop_all\"] is required for the cache method." + ); + goto err; + } + + /** 'show' and 'show_all'. **/ + bool show = false; + unsigned long long skip_uncomputed = 0llu; + if (strcmp(param->String, "show_less") == 0) + /** Specify show_less to skip uncomputed caches. **/ + skip_uncomputed = 1ull; + if (skip_uncomputed == 1ull || strcmp(param->String, "show") == 0) + { + show = true; + path = ci_file_path(driver_data->NodeData->Parent); + } + if (strcmp(param->String, "show_all") == 0) show = true; + + if (show) + { + /** Print cache info table. **/ + int ret = 0; + unsigned int i = 1u, source_bytes = 0u, cluster_bytes = 0u, search_bytes = 0u; + bool failed = false; + printf("\nShowing cache for "); + if (path != NULL) printf("\"%s\":\n", path); + else printf("all files:\n"); + printf("%-8s %-16s %-12s %s\n", "Type", "Name", "Size", "Cache Entry Key"); + failed |= !check(xhForEach( + &ClusterDriverCaches.SourceDataCache, + ci_PrintEntry, + (void*[]){&i, &source_bytes, (void*)&skip_uncomputed, path} + )); + i++; + failed |= !check(xhForEach( + &ClusterDriverCaches.ClusterDataCache, + ci_PrintEntry, + (void*[]){&i, &cluster_bytes, (void*)&skip_uncomputed, path} + )); + i++; + failed |= !check(xhForEach( + &ClusterDriverCaches.SearchDataCache, + ci_PrintEntry, + (void*[]){&i, &search_bytes, (void*)&skip_uncomputed, path} + )); + if (failed) + { + mssErrorf(0, "Cluster", "Unexpected error occurred while showhing caches."); + ret = -1; + } + + /** Precomputations. **/ + unsigned int total_caches = 0u + + (unsigned int)ClusterDriverCaches.SourceDataCache.nItems + + (unsigned int)ClusterDriverCaches.ClusterDataCache.nItems + + (unsigned int)ClusterDriverCaches.SearchDataCache.nItems; + if (total_caches <= skip_uncomputed) printf("All caches skipped, nothing to show...\n"); + + /** Print stats. **/ + char buf[16]; + printf("\nCache Stats:\n"); + printf("%-8s %-4s %-12s\n", "", "#", "Total Size"); + printf("%-8s %-4d %-12s\n", "Source", ClusterDriverCaches.SourceDataCache.nItems, snprint_bytes(buf, sizeof(buf), source_bytes)); + printf("%-8s %-4d %-12s\n", "Cluster", ClusterDriverCaches.ClusterDataCache.nItems, snprint_bytes(buf, sizeof(buf), cluster_bytes)); + printf("%-8s %-4d %-12s\n", "Search", ClusterDriverCaches.SearchDataCache.nItems, snprint_bytes(buf, sizeof(buf), search_bytes)); + printf("%-8s %-4d %-12s\n\n", "Total", total_caches, snprint_bytes(buf, sizeof(buf), source_bytes + cluster_bytes + search_bytes)); + + /** Print skip stats (if anything was skipped.) **/ + if (skip_uncomputed > 0llu) printf("Skipped %llu uncomputed caches.\n\n", skip_uncomputed - 1llu); + + return ret; + } + + /** 'drop_all'. **/ + if (strcmp(param->String, "drop_all") == 0) + { + printf("\nDropping cache for all files:\n"); + ci_ClearCaches(); + return 0; + } + + /** Unknown parameter. **/ + mssErrorf(1, "Cluster", + "Expected [param : \"show\" | \"show_less\" | \"show_all\" | \"drop_all\"] for the cache method, but got: \"%s\"", + param->String + ); + goto err; + } + + if (strcmp(method_name, "stat") == 0) + { + char buf[12]; + printf("Cluster Driver Statistics:\n"); + printf(" Stat Name %12s\n", "Value"); + printf(" OpenCalls %12s\n", snprint_commas_llu(buf, sizeof(buf), ClusterStatistics.OpenCalls)); + printf(" OpenQueryCalls %12s\n", snprint_commas_llu(buf, sizeof(buf), ClusterStatistics.OpenQueryCalls)); + printf(" FetchCalls %12s\n", snprint_commas_llu(buf, sizeof(buf), ClusterStatistics.FetchCalls)); + printf(" CloseCalls %12s\n", snprint_commas_llu(buf, sizeof(buf), ClusterStatistics.CloseCalls)); + printf(" GetTypeCalls %12s\n", snprint_commas_llu(buf, sizeof(buf), ClusterStatistics.GetTypeCalls)); + printf(" GetValCalls %12s\n", snprint_commas_llu(buf, sizeof(buf), ClusterStatistics.GetValCalls)); + printf(" GetValCalls_name %12s\n", snprint_commas_llu(buf, sizeof(buf), ClusterStatistics.GetValCalls_name)); + printf(" GetValCalls_key1 %12s\n", snprint_commas_llu(buf, sizeof(buf), ClusterStatistics.GetValCalls_key1)); + printf(" GetValCalls_key2 %12s\n", snprint_commas_llu(buf, sizeof(buf), ClusterStatistics.GetValCalls_key2)); + printf(" GetValCalls_sim %12s\n", snprint_commas_llu(buf, sizeof(buf), ClusterStatistics.GetValCalls_sim)); + printf("\n"); + + nmStats(); + + return 0; + } + + /** Unknown parameter. **/ + mssErrorf(1, "Cluster", "Unknown command: \"%s\"", method_name); + + err: + mssErrorf(0, "Cluster", "Failed execute command."); + + return -1; + } + + +/** ================ Unimplemented Functions ================ **/ +/** ANCHOR[id=unimplemented] **/ +// LINK #functions + +/** Not implemented. **/ +int +clusterCreate(pObject obj, int mask, pContentType sys_type, char* usr_type, pObjTrxTree* oxt) + { + mssErrorf(1, "Cluster", "clusterCreate() is not implemented."); + + return -ENOSYS; + } + +/** Not implemented. **/ +int +clusterDelete(pObject obj, pObjTrxTree* oxt) + { + mssErrorf(1, "Cluster", "clusterDelete() is not implemented."); + + return -1; + } + +/** Not implemented. **/ +int +clusterDeleteObj(void* inf_v, pObjTrxTree* oxt) + { + mssErrorf(1, "Cluster", "clusterDeleteObj() is not implemented."); + + return -1; + } + +/** Not implemented. **/ +int +clusterRead(void* inf_v, char* buffer, int max_cnt, int offset, int flags, pObjTrxTree* oxt) + { + mssErrorf(1, "Cluster", "clusterRead() not implemented."); + fprintf(stderr, "HINT: Use queries instead, (e.g. clusterOpenQuery()).\n"); + + return -1; + } + +/** Not implemented. **/ +int +clusterWrite(void* inf_v, char* buffer, int cnt, int offset, int flags, pObjTrxTree* oxt) + { + mssErrorf(1, "Cluster", "clusterWrite() not implemented because clusters are imutable."); + + return -1; + } + +/** Not implemented. **/ +int +clusterSetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val, pObjTrxTree* oxt) + { + mssErrorf(1, "Cluster", "clusterSetAttrValue() not implemented because clusters are imutable."); + + return -1; + } + +/** Not implemented. **/ +int +clusterAddAttr(void* inf_v, char* attr_name, int type, pObjData val, pObjTrxTree* oxt) + { + mssErrorf(1, "Cluster", "clusterAddAttr() not implemented because clusters are imutable."); + + return -1; + } + +/** Not implemented. **/ +void* +clusterOpenAttr(void* inf_v, char* attr_name, int mode, pObjTrxTree* oxt) + { + mssErrorf(1, "Cluster", "clusterOpenAttr() not implemented."); + + return NULL; + } + +/** Not implemented. **/ +int +clusterCommit(void* inf_v, pObjTrxTree* oxt) + { + mssErrorf(1, "Cluster", "clusterCommit() not implemented because clusters are imutable."); + + return 0; + } + + +// LINK #functions +/*** Initialize the driver. This includes: + *** - Registering the driver with the objectsystem. + *** - Registering structs with newmalloc for debugging. + *** - Initializing global data needed for the driver. + *** + *** @returns 0 if successful, or + *** -1 if an error occurs. + ***/ +int +clusterInitialize(void) + { + /** Allocate the driver. **/ + pObjDriver drv = (pObjDriver)check_ptr(nmMalloc(sizeof(ObjDriver))); + if (drv == NULL) goto err_free; + memset(drv, 0, sizeof(ObjDriver)); + + /** Initialize caches. **/ + // memset(&ClusterDriverCaches, 0, sizeof(ClusterDriverCaches)); + if (!check(xhInit(&ClusterDriverCaches.SourceDataCache, 251, 0))) goto err_free; + if (!check(xhInit(&ClusterDriverCaches.ClusterDataCache, 251, 0))) goto err_free; + if (!check(xhInit(&ClusterDriverCaches.SearchDataCache, 251, 0))) goto err_free; + + /** Initialize statistics. **/ + memset(&ClusterStatistics, 0, sizeof(ClusterStatistics)); + + /** Setup the structure. **/ + if (check_ptr(strcpy(drv->Name, "cluster - Clustering Driver")) == NULL) goto err_free; + if (!check(xaInit(&drv->RootContentTypes, 1))) goto err_free; + if (!check_neg(xaAddItem(&drv->RootContentTypes, "system/cluster"))) goto err_free; + drv->Capabilities = 0; /* TODO: Greg - Should I indicate any capabilities? */ + + /** Setup the function references. **/ + drv->Open = clusterOpen; + drv->OpenChild = NULL; + drv->Close = clusterClose; + drv->Create = clusterCreate; + drv->Delete = clusterDelete; + drv->DeleteObj = clusterDeleteObj; + drv->OpenQuery = clusterOpenQuery; + drv->QueryDelete = NULL; + drv->QueryFetch = clusterQueryFetch; + drv->QueryClose = clusterQueryClose; + drv->Read = clusterRead; + drv->Write = clusterWrite; + drv->GetAttrType = clusterGetAttrType; + drv->GetAttrValue = clusterGetAttrValue; + drv->GetFirstAttr = clusterGetFirstAttr; + drv->GetNextAttr = clusterGetNextAttr; + drv->SetAttrValue = clusterSetAttrValue; + drv->AddAttr = clusterAddAttr; + drv->OpenAttr = clusterOpenAttr; + drv->GetFirstMethod = clusterGetFirstMethod; + drv->GetNextMethod = clusterGetNextMethod; + drv->ExecuteMethod = clusterExecuteMethod; + drv->PresentationHints = clusterPresentationHints; + drv->Info = clusterInfo; + drv->Commit = clusterCommit; + drv->GetQueryCoverageMask = NULL; + drv->GetQueryIdentityPath = NULL; + + /** Register the driver. **/ + if (!check(objRegisterDriver(drv))) goto err_free; + + /** Register structs used in this project with the newmalloc memory management system. **/ + nmRegister(sizeof(SourceData), "ClusterSourceData"); + nmRegister(sizeof(Cluster), "Cluster"); + nmRegister(sizeof(ClusterData), "ClusterData"); + nmRegister(sizeof(SearchData), "ClusterSearch"); + nmRegister(sizeof(NodeData), "ClusterNodeData"); + nmRegister(sizeof(DriverData), "ClusterDriverData"); + nmRegister(sizeof(ClusterQuery), "ClusterQuery"); + nmRegister(sizeof(ClusterDriverCaches), "ClusterDriverCaches"); + + /** Success. **/ + return 0; + + err_free: + /** Error cleanup. **/ + if (ClusterDriverCaches.SourceDataCache.nRows != 0) check(xhDeInit(&ClusterDriverCaches.SourceDataCache)); /* Failure ignored. **/ + if (ClusterDriverCaches.ClusterDataCache.nRows != 0) check(xhDeInit(&ClusterDriverCaches.ClusterDataCache)); /* Failure ignored. **/ + if (ClusterDriverCaches.SearchDataCache.nRows != 0) check(xhDeInit(&ClusterDriverCaches.SearchDataCache)); /* Failure ignored. **/ + if (drv != NULL) + { + if (drv->RootContentTypes.nAlloc != 0) check(xaDeInit(&drv->RootContentTypes)); /* Failure ignored. */ + nmFree(drv, sizeof(ObjDriver)); + } + + mssErrorf(1, "Cluster", "Failed to initialize cluster driver.\n"); + + return -1; + } diff --git a/centrallix/test_obj.c b/centrallix/test_obj.c index c4c64e25..355a89f4 100644 --- a/centrallix/test_obj.c +++ b/centrallix/test_obj.c @@ -767,7 +767,13 @@ testobj_do_cmd(pObjSession s, char* cmd, int batch_mode, pLxSession inp_lx) fdQPrintf(TESTOBJ.Output, "%[,%]%STR", i!=0, ptr); else { - while (strpbrk(ptr, "\r\n")) *(strpbrk(ptr, "\r\n")) = ' '; + char* cur; + while (1) + { + cur = strpbrk(ptr, "\r\n"); + if (cur == NULL) break; + else *cur = ' '; + } fdQPrintf(TESTOBJ.Output, "%[,%]\"%STR&DSYB\"", i!=0, ptr); } @@ -1443,6 +1449,7 @@ testobj_do_cmd(pObjSession s, char* cmd, int batch_mode, pLxSession inp_lx) else { printf("Unknown command '%s'\n",cmdname); + mlxCloseSession(ls); return -1; } diff --git a/centrallix/tests/test_cos_compare_00.cmp b/centrallix/tests/test_cos_compare_00.cmp index d586365f..2061443a 100644 --- a/centrallix/tests/test_cos_compare_00.cmp +++ b/centrallix/tests/test_cos_compare_00.cmp @@ -1,7 +1,11 @@ -Attribute [case1]: integer 1 -Attribute [case2]: integer 1 -Attribute [case3]: integer 1 -Attribute [case4]: integer 1 -Attribute [case5]: integer 1 -Attribute [case6]: integer 1 -Attribute [case7]: integer 1 +Attribute [case1]: string "pass" +Attribute [case2]: string "pass" +Attribute [case3]: string "pass" +Attribute [case4]: string "pass" +Attribute [cynthia]: string "pass" +Attribute [timothy]: string "pass" +Attribute [lance]: string "pass" +Attribute [gregory]: string "pass" +Attribute [nathan]: string "pass" +Attribute [identical]: string "pass" +Attribute [name]: string "pass" diff --git a/centrallix/tests/test_cos_compare_00.to b/centrallix/tests/test_cos_compare_00.to index 5bf95051..c5b0b1a5 100644 --- a/centrallix/tests/test_cos_compare_00.to +++ b/centrallix/tests/test_cos_compare_00.to @@ -1,17 +1,24 @@ ##NAME Text Mining String Similarity with Cosine Compare -# All email addresses and phone numbers are imaginary and were fabricated for the purposes of this test +# Basic tests of cosine similarity. +query select case1 = condition((cos_compare('hello', 'hello') >= 0.999) and (cos_compare('hello', 'hello') <= 1.0), "pass", "fail") +query select case2 = condition((cos_compare('hello', 'zephora') <= 0.001) and (cos_compare('hello', 'zephora') >= 0.0), "pass", "fail") +query select case3 = condition((cos_compare('hello', 'hello world') <= 0.7) and (cos_compare('hello', 'hello world') >= 0.6), "pass", "fail") +query select case4 = condition((cos_compare('hello there', 'hellow there') >= 0.9) and (cos_compare('hello', 'hellow') <= 1.0), "pass", "fail") + -query select case1 = (cos_compare("Cynthia Adams; cynthiaadams@gmail.com; 720-769-1293", "Timothy Adams; thetbear@gmail.com; 720-891-1470") >= 0.49) and (cos_compare("Cynthia Adams; cynthiaadams@gmail.com; 720-769-1293", "Timothy Adams; thetbear@gmail.com; 720-891-1470") <= 0.54) +# Tests on fabricated contact information. +# All email addresses and phone numbers are imaginary and were fabricated for the purposes of this test +query select cynthia = condition((cos_compare("Cynthia Adams; cynthiaadams@gmail.com; 720-769-1293", "Timothy Adams; thetbear@gmail.com; 720-891-1470") >= 0.49) and (cos_compare("Cynthia Adams; cynthiaadams@gmail.com; 720-769-1293", "Timothy Adams; thetbear@gmail.com; 720-891-1470") <= 0.54), "pass", "fail") -query select case2 = (cos_compare("Timothy Adams; thetbear@gmail.com; 720-891-1470", "Lance Freson; lancetheturtle@gmail.com; 720-111-8189") >= 0.425) and (cos_compare("Timothy Adams; thetbear@gmail.com; 720-891-1470", "Lance Freson; lancetheturtle@gmail.com; 720-111-8189") <= 0.475) +query select timothy = condition((cos_compare("Timothy Adams; thetbear@gmail.com; 720-891-1470", "Lance Freson; lancetheturtle@gmail.com; 720-111-8189") >= 0.45) and (cos_compare("Timothy Adams; thetbear@gmail.com; 720-891-1470", "Lance Freson; lancetheturtle@gmail.com; 720-111-8189") <= 0.50), "pass", "fail") -query select case3 = (cos_compare("Lance Freson; lancetheturtle@gmail.com; 720-111-8189", "Gregory Freson; greatgregory@gmail.com; 720-198-5791") >= 0.35) and (cos_compare("Lance Freson; lancetheturtle@gmail.com; 720-111-8189", "Gregory Freson; greatgregory@gmail.com; 720-198-5791") <= 0.40) +query select lance = condition((cos_compare("Lance Freson; lancetheturtle@gmail.com; 720-111-8189", "Gregory Freson; greatgregory@gmail.com; 720-198-5791") >= 0.425) and (cos_compare("Lance Freson; lancetheturtle@gmail.com; 720-111-8189", "Gregory Freson; greatgregory@gmail.com; 720-198-5791") <= 0.475), "pass", "fail") -query select case4 = (cos_compare("Gregory Freson; greatgregory@gmail.com; 720-198-5791", "Gregory Freson; greatgregory@gmail.co; 720-198-5791") >= 0.94) and (cos_compare("Gregory Freson; greatgregory@gmail.com; 720-198-5791", "Gregory Freson; greatgregory@gmail.co; 720-198-5791") <= 0.99) +query select gregory = condition((cos_compare("Gregory Freson; greatgregory@gmail.com; 720-198-5791", "Gregory Freson; greatgregory@gmail.co; 720-198-5791") >= 0.94) and (cos_compare("Gregory Freson; greatgregory@gmail.com; 720-198-5791", "Gregory Freson; greatgregory@gmail.co; 720-198-5791") <= 0.99), "pass", "fail") -query select case5 = (cos_compare("Nathan Mayor; nmmayor@yahoo.com; +1-800-192-9128", "Mindy Mayor; nmmayor@yahoo.com; 720-981-9149") >=0.66) and (cos_compare("Nathan Mayor; nmmayor@yahoo.com; +1-800-192-9128", "Mindy Mayor; nmmayor@yahoo.com; 720-981-9149") <= 0.71) +query select nathan = condition((cos_compare("Nathan Mayor; nmmayor@yahoo.com; +1-800-192-9128", "Mindy Mayor; nmmayor@yahoo.com; 720-981-9149") >= 0.575) and (cos_compare("Nathan Mayor; nmmayor@yahoo.com; +1-800-192-9128", "Mindy Mayor; nmmayor@yahoo.com; 720-981-9149") <= 0.625), "pass", "fail") -query select case6 = (cos_compare("This is an identical case", "This is an identical case") >=0.975) and (cos_compare("This is an identical case", "This is an identical case") <=1.00) +query select identical = condition((cos_compare("This is an identical case", "This is an identical case") >= 0.975) and (cos_compare("This is an identical case", "This is an identical case") <= 1.00), "pass", "fail") -query select case7 = (cos_compare("Samuel", "Alex") >= 0.00) and (cos_compare("Samuel", "Alex") <= 0.025) +query select name = condition((cos_compare("Samuel", "Alex") >= 0.00) and (cos_compare("Samuel", "Alex") <= 0.025), "pass", "fail") diff --git a/centrallix/tests/test_expfn_log_00.cmp b/centrallix/tests/test_expfn_log_00.cmp new file mode 100644 index 00000000..13005681 --- /dev/null +++ b/centrallix/tests/test_expfn_log_00.cmp @@ -0,0 +1,43 @@ +Attribute [ln(1)]: double 0.0 +Attribute [ln(e)]: double 1.0 +Attribute [ln(0)]: double -inf.0 +Attribute [ln(-1)]: double nan.0 +Attribute [ln(10)]: double 2.30258509 +Attribute [ln(1.5)]: double 0.40546511 +Attribute [ln(1e-10)]: integer 1 +Attribute [ln(1e10)+]: integer 1 +Attribute [ln(1e10)-]: integer 1 +Attribute [log10(1)]: double 0.0 +Attribute [log10(10)]: double 1.0 +Attribute [log10(0)]: double -inf.0 +Attribute [log10(-10)]: double nan.0 +Attribute [log10(100)]: double 2.0 +Attribute [log10(0.01)]: double -2.0 +Attribute [log10(1.234)]: double 0.09131516 +Attribute [log10(1e-10)]: double -10.0 +Attribute [log10(1e10)]: double 10.0 +Attribute [log(8, 2)]: double 3.0 +Attribute [log(1000, 10)]: double 3.0 +Attribute [log(10, 0)]: double -0.0 +Attribute [log(10, 1)]: double inf.0 +Attribute [log(8, -2)]: double nan.0 +Attribute [log(0, 2)]: double -inf.0 +Attribute [log(-8, 2)]: double nan.0 +Attribute [log(1, 2)]: double 0.0 +Attribute [log(1e10, 10)]: double 10.0 +Attribute [log(8, 0.5)]: double -3.0 +Attribute [log(1)]: integer 1 +Attribute [log(e)]: integer 1 +Attribute [log(0)]: integer 1 +Attribute [log(-1)]: integer 1 +Attribute [log(10)]: integer 1 +Attribute [log(1.5)]: integer 1 +Attribute [log(1e-10)]: integer 1 +Attribute [log(1e10)+]: integer 1 +Attribute [log(1e10)-]: integer 1 +Attribute [ln(2.718281828)]: double 1.0 +Attribute [log10(3.14159)]: double 0.49714951 +Attribute [log(10, 1.1)]: double 0.04139269 +Attribute [log(1.1, 10)]: double 24.15885793 +Attribute [log(10, 0.001)]: double -3.0 +Attribute [log(0.1, 1000)]: double -3.0 diff --git a/centrallix/tests/test_expfn_log_00.to b/centrallix/tests/test_expfn_log_00.to new file mode 100644 index 00000000..c7314023 --- /dev/null +++ b/centrallix/tests/test_expfn_log_00.to @@ -0,0 +1,55 @@ +##NAME log() functions + +# Natural Log: ln(x) +query select 'ln(1)' = ln(1) -- Expect 0. +query select 'ln(e)' = ln(2.718281828459045) -- Expect 1. +query select 'ln(0)' = ln(0) -- Expect -inf (log approaches infinity). +query select 'ln(-1)' = ln(-1) -- Expect NaN (log undefined for negative). +query select 'ln(10)' = round(ln(10), 8) -- Expect ~2.30258509. +query select 'ln(1.5)' = round(ln(1.5), 8) -- Expect ~0.40546511. +query select 'ln(1e-10)' = ln(0.0000000001) < 0.0000000001 -- Expect true (value is very small). +query select 'ln(1e10)+' = ln(10000000000.0) > 23.0 -- Expect true (value is ~23.02585). +query select 'ln(1e10)-' = ln(10000000000.0) < 23.1 -- Expect true (value is ~23.02585). + +# Log base 10: log10(x) +query select 'log10(1)' = log10(1) -- Expect 0. +query select 'log10(10)' = log10(10) -- Expect 1. +query select 'log10(0)' = log10(0) -- Expect -inf. +query select 'log10(-10)' = log10(-10) -- Expect NaN. +query select 'log10(100)' = log10(100) -- Expect 2. +query select 'log10(0.01)' = log10(0.01) -- Expect -2. +query select 'log10(1.234)' = round(log10(1.234), 8) -- Expect ~0.091315. +query select 'log10(1e-10)' = log10(0.0000000001) -- Expect ~-10. +query select 'log10(1e10)' = log10(10000000000.0) -- Expect ~10. + +# General base n of x: log(x, n) +# Edge cases: base <= 0 or base == 1 (invalid), x <= 0 (invalid) +query select 'log(8, 2)' = log(8, 2) -- Expect 3. +query select 'log(1000, 10)' = log(1000, 10) -- Expect 3. +query select 'log(10, 0)' = log(10, 0) -- Expect -0.0 (base 0 is undefined). +query select 'log(10, 1)' = log(10, 1) -- Expect inf (base 1 is undefined). +query select 'log(8, -2)' = log(8, -2) -- Expect NaN (negative base). +query select 'log(0, 2)' = log(0, 2) -- Expect -inf (x=0). +query select 'log(-8, 2)' = log(-8, 2) -- Expect NaN or error (x negative). +query select 'log(1, 2)' = log(1, 2) -- Expect 0. +query select 'log(1e10, 10)' = log(10000000000.0, 10) -- Expect 10. +query select 'log(8, 0.5)' = log(8, 0.5) -- Expect negative value. + +# log(x) = ln(x) +query select 'log(1)' = (log(1) == ln(1)) +query select 'log(e)' = (log(2.71828182845) == ln(2.71828182845)) +query select 'log(0)' = (log(0) == ln(0)) +query select 'log(-1)' = (log(-1) == ln(-1)) +query select 'log(10)' = (log(10) == ln(10)) +query select 'log(1.5)' = (log(1.5) == ln(1.5)) +query select 'log(1e-10)' = (log(0.0000000001) == ln(0.0000000001)) +query select 'log(1e10)+' = (log(10000000000.0) == ln(10000000000.0)) +query select 'log(1e10)-' = (log(10000000000.0) == ln(10000000000.0)) + +-- Additional double/int mixed cases +query select 'ln(2.718281828)' = round(ln(2.718281828), 8) -- Expect ~1 (close to e). +query select 'log10(3.14159)' = round(log10(3.14159), 8) -- Expect ~0.49715. +query select 'log(10, 1.1)' = round(log(1.1, 10), 8) -- Expect 0.04139289. +query select 'log(1.1, 10)' = round(log(10, 1.1), 8) -- Expect 24.15885793. +query select 'log(10, 0.001)' = round(log(0.001, 10), 8) -- Expect ~-0.33333333... +query select 'log(0.1, 1000)' = round(log(1000, 0.1), 8) -- Expect ~-0.33333333... diff --git a/centrallix/tests/test_expfn_metaphone_00.cmp b/centrallix/tests/test_expfn_metaphone_00.cmp new file mode 100644 index 00000000..d13cf05c --- /dev/null +++ b/centrallix/tests/test_expfn_metaphone_00.cmp @@ -0,0 +1,140 @@ +Attribute [result]: string "TST`TST" +Attribute [result]: string "PSK`PSK" +Attribute [result]: string "SNTRLKS`SNTRLKS" +Attribute [result]: string "LRNS`LRNS" +Attribute [result]: string "FLPS`FLPS" +Attribute [result]: string "AKSPTNNS`AKSPTNKNS" +Attribute [result]: string "SPRKLFRJLSTSKSPLTSS`SPRKLFRKLSTSKSPLTXS" +Attribute [result]: string "SKTLPKSSTSLKRFLKRPS`SKTLPKSSTSLKRFLKRPS" +Attribute [result]: string "SM0`XMT" +Attribute [result]: string "XMT`SMT" +Attribute [result]: string "SNTR`XNTR" +Attribute [result]: string "XNTR`SNTR" +Attribute [result]: string "ARN`ARNF" +Attribute [result]: string "ARNF`ARNF" +Attribute [result]: string "AKST`AKST" +Attribute [result]: string "AKSTNT`AKSTNT" +Attribute [result]: string "AKTL`AKTL" +Attribute [result]: string "ARX`ARK" +Attribute [result]: string "ART`ARTS" +Attribute [result]: string "PKS`PKS" +Attribute [result]: string "PX`PX" +Attribute [result]: string "PJTR`PHTR" +Attribute [result]: string "PLX`PLX" +Attribute [result]: string "PRTX`PRTX" +Attribute [result]: string "PJ`PK" +Attribute [result]: string "P`P" +Attribute [result]: string "PR`PR" +Attribute [result]: string "PRTN`PRTN" +Attribute [result]: string "KPRL`KPR" +Attribute [result]: string "SSR`SSR" +Attribute [result]: string "KKN`KKN" +Attribute [result]: string "KMPL`KMPL" +Attribute [result]: string "KRLL`KRLL" +Attribute [result]: string "KRLL`KRLL" +Attribute [result]: string "KMSTR`KMSTR" +Attribute [result]: string "KNT`KNT" +Attribute [result]: string "KRS`KRS" +Attribute [result]: string "KF`KF" +Attribute [result]: string "SRN`XRN" +Attribute [result]: string "TM`TM" +Attribute [result]: string "ATKR`ATKR" +Attribute [result]: string "AJ`AJ" +Attribute [result]: string "FLPTS`FLPFX" +Attribute [result]: string "FKX`FKX" +Attribute [result]: string "KLKS`KKS" +Attribute [result]: string "KRMNK`JRMNK" +Attribute [result]: string "JRTL`JRTL" +Attribute [result]: string "JLN`JLN" +Attribute [result]: string "KSPL`KSPL" +Attribute [result]: string "KF`KF" +Attribute [result]: string "KRK`KRK" +Attribute [result]: string "HKMR`HKMR" +Attribute [result]: string "H`H" +Attribute [result]: string "ALNT`ALNT" +Attribute [result]: string "AL`AL" +Attribute [result]: string "ATLN`ATLN" +Attribute [result]: string "JNKLTS`ANKLFX" +Attribute [result]: string "HS`HS" +Attribute [result]: string "LF`LF" +Attribute [result]: string "MKFR`MKFR" +Attribute [result]: string "MKRKR`MKRKR" +Attribute [result]: string "MNKR`MNJR" +Attribute [result]: string "MK`MK" +Attribute [result]: string "MKLFLN`MKLFLN" +Attribute [result]: string "MKL`MXL" +Attribute [result]: string "MTL`MTL" +Attribute [result]: string "ARKSTR`ARKSTR" +Attribute [result]: string "ARKT`ARKT" +Attribute [result]: string "PNN`PNN" +Attribute [result]: string "RSPR`RSPR" +Attribute [result]: string "RSN`RSNS" +Attribute [result]: string "RJ`RJR" +Attribute [result]: string "RF`RF" +Attribute [result]: string "SLFTR`SLFTR" +Attribute [result]: string "SNHSNT`SNHSNT" +Attribute [result]: string "XNKR`SKNKR" +Attribute [result]: string "XRMRRN`SKRMRRN" +Attribute [result]: string "XLSNKR`SLSNJR" +Attribute [result]: string "SKL`SKL" +Attribute [result]: string "SKNR`SKNR" +Attribute [result]: string "SKST`SKST" +Attribute [result]: string "XKR`SKR" +Attribute [result]: string "XKR`SKR" +Attribute [result]: string "TKLR`TLR" +Attribute [result]: string "TMS`TMS" +Attribute [result]: string "TMS`TMS" +Attribute [result]: string "0M`TM" +Attribute [result]: string "TXNR`TKNR" +Attribute [result]: string "TF`TF" +Attribute [result]: string "FK`FK" +Attribute [result]: string "AKTLR`FKTLR" +Attribute [result]: string "AKSLR`FKSLR" +Attribute [result]: string "ART`FRT" +Attribute [result]: string "SF`SFR" +Attribute [result]: string "ANKLFX`ANKLFK" +Attribute [result]: string "J`J" +Attribute [result]: string "MKLLN`MKLLN" +Attribute [result]: string "MRS`MRS" +Attribute [result]: string "APR`APR" +Attribute [result]: string "KMPRL`KMPR" +Attribute [result]: string "HT`HT" +Attribute [result]: string "K0RN`KTRN" +Attribute [result]: string "K0RN`KTRN" +Attribute [result]: string "RXRT`RKRT" +Attribute [result]: string "PP`PP" +Attribute [result]: string "ARK`ARK" +Attribute [result]: string "JF`KF" +Attribute [result]: string "TF`TF" +Attribute [result]: string "R`R" +Attribute [result]: string "STFN`STFN" +Attribute [result]: string "PRS`PRS" +Attribute [result]: string "RNT`RNT" +Attribute [result]: string "PRN`PRN" +Attribute [result]: string "PRN`PRN" +Attribute [result]: string "AT`AT" +Attribute [result]: string "AT`AT" +Attribute [result]: string "APT`APT" +Attribute [result]: string "PK`PK" +Attribute [result]: string "PKR`PKR" +Attribute [result]: string "XRLS`XRLS" +Attribute [result]: string "KN`KN" +Attribute [result]: string "NM`NM" +Attribute [result]: string "RJ`R" +Attribute [result]: string "KNTN`KNTN" +Attribute [result]: string "A`A" +Attribute [result]: string "XMKR`XMKR" +Attribute [result]: string "SN`XN" +Attribute [result]: string "SKLT`SKLT" +Attribute [result]: string "STXN`STXN" +Attribute [result]: string "MX`MX" +Attribute [result]: string "PS`PTS" +Attribute [result]: string "AKNS`ANS" +Attribute [result]: string "SNS`SNS" +Attribute [result]: string "FNKK`FNKK" +Attribute [result]: string "JSF`HSF" +Attribute [result]: string "APJKT`APJKT" +Attribute [result]: string "SLS`SLS" +Attribute [result]: string "XRF`XRF" +Attribute [result]: string "KS`KS" +Attribute [result]: string "FNKLR`FNKLR" diff --git a/centrallix/tests/test_expfn_metaphone_00.to b/centrallix/tests/test_expfn_metaphone_00.to new file mode 100644 index 00000000..de1897c3 --- /dev/null +++ b/centrallix/tests/test_expfn_metaphone_00.to @@ -0,0 +1,161 @@ +##NAME metaphone() function + +# Special thanks to the following websites for double checking the correct results: +# 1: https://words.github.io/double-metaphone +# 2: https://mainegenealogy.net/metaphone_converter.asp +# 3: https://en.toolpage.org/tool/metaphone + +# These tests were collected from the following sources: +# - Example comments in the source code of exp_double_metaphone.c +# - Maurice Aubrey's Tests* +# - Tests manually written by Israel Fuller +# - Tests written by prompting ChatGPT-5 (preview)** +# +# *Source: https://github.com/gitpan/Text-DoubleMetaphone/blob/master/t/words.txt +# **GPT-5 mini (Preview) was run in GitHub Copilot to suggest the words +# for some tests after analizing a generated coverage report. I (Israel) +# used the suggestions to write some "AI generated" test cases. +# +# For more information, see the manual test suite implementation at the +# end of the exp_double_metaphone.c file. + +query select result = metaphone("Test") +query select result = metaphone("Basic") +query select result = metaphone("Centrallix") +query select result = metaphone("Lawrence") +query select result = metaphone("Philips") +query select result = metaphone("Acceptingness") +query select result = metaphone("Supercalifragilisticexpialidocious") +query select result = metaphone("Suoicodilaipxecitsiligarfilacrepus") +query select result = metaphone("Smith") +query select result = metaphone("Schmidt") +query select result = metaphone("Snider") +query select result = metaphone("Schneider") +query select result = metaphone("Arnow") +query select result = metaphone("Arnoff") +query select result = metaphone("Accede") +query select result = metaphone("Accident") +query select result = metaphone("Actually") +query select result = metaphone("Arch") +query select result = metaphone("Artois") +query select result = metaphone("Bacchus") +query select result = metaphone("Bacci") +query select result = metaphone("Bajador") +query select result = metaphone("Bellocchio") +query select result = metaphone("Bertucci") +query select result = metaphone("Biaggi") +query select result = metaphone("Bough") +query select result = metaphone("Breaux") +query select result = metaphone("Broughton") +query select result = metaphone("Cabrillo") +query select result = metaphone("Caesar") +query select result = metaphone("Cagney") +query select result = metaphone("Campbell") +query select result = metaphone("Carlisle") +query select result = metaphone("Carlysle") +query select result = metaphone("Chemistry") +query select result = metaphone("Chianti") +query select result = metaphone("Chorus") +query select result = metaphone("Cough") +query select result = metaphone("Czerny") +query select result = metaphone("Dumb") +query select result = metaphone("Edgar") +query select result = metaphone("Edge") +query select result = metaphone("Filipowicz") +query select result = metaphone("Focaccia") +query select result = metaphone("Gallegos") +query select result = metaphone("Germanic") +query select result = metaphone("Ghiradelli") +query select result = metaphone("Ghislane") +query select result = metaphone("Gospel") +query select result = metaphone("Gough") +query select result = metaphone("Greek") +query select result = metaphone("Hochmeier") +query select result = metaphone("Hugh") +query select result = metaphone("Island") +query select result = metaphone("Isle") +query select result = metaphone("Italian") +query select result = metaphone("Jankelowicz") +query select result = metaphone("Jose") +query select result = metaphone("Laugh") +query select result = metaphone("Mac Caffrey") +query select result = metaphone("Mac Gregor") +query select result = metaphone("Manager") +query select result = metaphone("McHugh") +query select result = metaphone("McLaughlin") +query select result = metaphone("Michael") +query select result = metaphone("Middle") +query select result = metaphone("Orchestra") +query select result = metaphone("Orchid") +query select result = metaphone("Pinyin") +query select result = metaphone("Raspberry") +query select result = metaphone("Resnais") +query select result = metaphone("Rogier") +query select result = metaphone("Rough") +query select result = metaphone("Salvador") +query select result = metaphone("San jacinto") +query select result = metaphone("Schenker") +query select result = metaphone("Schermerhorn") +query select result = metaphone("Schlesinger") +query select result = metaphone("School") +query select result = metaphone("Schooner") +query select result = metaphone("Succeed") +query select result = metaphone("Sugar") +query select result = metaphone("Sugary") +query select result = metaphone("Tagliaro") +query select result = metaphone("Thames") +query select result = metaphone("Thomas") +query select result = metaphone("Thumb") +query select result = metaphone("Tichner") +query select result = metaphone("Tough") +query select result = metaphone("Vghee") +query select result = metaphone("Wachtler") +query select result = metaphone("Wechsler") +query select result = metaphone("Word") +query select result = metaphone("Xavier") +query select result = metaphone("Yankelovich") +query select result = metaphone("Zhao") +query select result = metaphone("McClellan") +query select result = metaphone("maurice") +query select result = metaphone("aubrey") +query select result = metaphone("cambrillo") +query select result = metaphone("heidi") +query select result = metaphone("katherine") +query select result = metaphone("catherine") +query select result = metaphone("richard") +query select result = metaphone("bob") +query select result = metaphone("eric") +query select result = metaphone("geoff") +query select result = metaphone("dave") +query select result = metaphone("ray") +query select result = metaphone("steven") +query select result = metaphone("bryce") +query select result = metaphone("randy") +query select result = metaphone("bryan") +query select result = metaphone("brian") +query select result = metaphone("otto") +query select result = metaphone("auto") +query select result = metaphone("Abbott") +query select result = metaphone("Back") +query select result = metaphone("Bacher") +query select result = metaphone("Charles") +query select result = metaphone("Ghana") +query select result = metaphone("Gnome") +query select result = metaphone("Raj") +query select result = metaphone("Quentin") +query select result = metaphone("Who") +query select result = metaphone("Shoemaker") +query select result = metaphone("Sian") +query select result = metaphone("Scold") +query select result = metaphone("Station") +query select result = metaphone("Match") +query select result = metaphone("Pizza") +query select result = metaphone("Agnes") +query select result = metaphone("Science") +query select result = metaphone("Van Gogh") +query select result = metaphone("Josef") +query select result = metaphone("Object") +query select result = metaphone("Sholz") +query select result = metaphone("Scharf") +query select result = metaphone("Kasia") +query select result = metaphone("Van Geller") diff --git a/centrallix/tests/test_expfn_trim_00.cmp b/centrallix/tests/test_expfn_trim_00.cmp new file mode 100644 index 00000000..5515d9aa --- /dev/null +++ b/centrallix/tests/test_expfn_trim_00.cmp @@ -0,0 +1,10 @@ +Attribute [trim("White space on the left o]: string "No white space on the left or right side." +Attribute [trim(" White space on the l]: string "White space on the left side." +Attribute [trim("White space on the right ]: string "White space on the right side." +Attribute [trim(" White space on the r]: string "White space on the right and the left side." +Attribute [trim("With tab character ")]: string "With tab character " +Attribute [trim("With newline character +")]: string "With newline character +" +Attribute [trim("")]: string "" +Attribute [trim(null)]: string NULL diff --git a/centrallix/tests/test_expfn_trim_00.to b/centrallix/tests/test_expfn_trim_00.to new file mode 100644 index 00000000..746de6eb --- /dev/null +++ b/centrallix/tests/test_expfn_trim_00.to @@ -0,0 +1,19 @@ +##NAME trim() function + +query select 'trim("White space on the left or right side.")' = trim("No white space on the left or right side.") + +query select 'trim(" White space on the left side.")' = trim(" White space on the left side.") + +query select 'trim("White space on the right side. ")' = trim("White space on the right side. ") + +query select 'trim(" White space on the right and the left side. ")' = trim(" White space on the right and the left side. ") + +query select 'trim("With tab character\t")' = trim("With tab character\t") + +query select 'trim("With newline character\n")' = trim("With newline character\n") + +# query select 'trim("\r With carriage return character.")' = trim("\r With carriage return character") + +query select 'trim("")' = trim("") + +query select 'trim(null)' = trim(null) diff --git a/centrallix/tests/test_fuzzycompare_00.cmp b/centrallix/tests/test_fuzzycompare_00.cmp deleted file mode 100644 index baa6db1e..00000000 --- a/centrallix/tests/test_fuzzycompare_00.cmp +++ /dev/null @@ -1,13 +0,0 @@ -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 diff --git a/centrallix/tests/test_fuzzycompare_00.to b/centrallix/tests/test_fuzzycompare_00.to deleted file mode 100644 index 78141a47..00000000 --- a/centrallix/tests/test_fuzzycompare_00.to +++ /dev/null @@ -1,15 +0,0 @@ -##NAME Levenshtein String Comparison - -query select sw1 = 1 where fuzzy_compare('hello', 'hello!', 20) >= 0 and fuzzy_compare("hello","hello!", 20) <= 1 -query select sw1 = 1 where fuzzy_compare('hello', 'asdfkh', 20) >= 0 and fuzzy_compare("hello","asdfkh", 20) <= 1 -query select sw1 = 1 where fuzzy_compare('hello', 'aaaaaaaaaaaaaaaaa', 20) >= 0 and fuzzy_compare("hello","aaaaaaaaaaaaaaaaa", 20) <= 1 -query select sw1 = 1 where fuzzy_compare('hello', 'nope', 20) >= 0 and fuzzy_compare("hello","nope", 20) <= 1 -query select sw1 = 1 where fuzzy_compare('below', 'hello!', 20) >= 0 and fuzzy_compare("below","hello!", 20) <= 1 -query select sw1 = 1 where fuzzy_compare('kitten', 'smitten', 20) >= 0 and fuzzy_compare("kitten","smitten", 20) <= 1 -query select sw1 = 1 where fuzzy_compare('hello', 'bobbobbobbob', 20) >= 0 and fuzzy_compare("hello","bobbobbobbob", 20) <= 1 -query select sw1 = 1 where fuzzy_compare('hello', '', 20) >= 0 and fuzzy_compare("hello","", 20) <= 1 -query select sw1 = 1 where fuzzy_compare('', '', 20) >= 0 and fuzzy_compare("","", 20) <= 1 -query select sw1 = 1 where fuzzy_compare('blooooop', 'blob', 20) >= 0 and fuzzy_compare("blooooop","blob", 20) <= 1 -query select sw1 = 1 where fuzzy_compare('', '!', 20) >= 0 and fuzzy_compare("","!", 20) <= 1 -query select sw1 = 1 where fuzzy_compare('h', 'h', 20) >= 0 and fuzzy_compare("h","h", 20) <= 1 -query select sw1 = 1 where fuzzy_compare('hi', 'hi', 20) >= 0 and fuzzy_compare("hi","hi", 20) <= 1 diff --git a/centrallix/tests/test_lev_compare_00.cmp b/centrallix/tests/test_lev_compare_00.cmp new file mode 100644 index 00000000..1c295a36 --- /dev/null +++ b/centrallix/tests/test_lev_compare_00.cmp @@ -0,0 +1,23 @@ +Attribute [case1]: string "pass" +Attribute [case2]: string "pass" +Attribute [case3]: string "pass" +Attribute [case4]: string "pass" +Attribute [case5]: string "pass" +Attribute [case6]: string "pass" +Attribute [case7]: string "pass" +Attribute [case8]: string "pass" +Attribute [case9]: string "pass" +Attribute [case10]: string "pass" +Attribute [case11]: string "pass" +Attribute [case12]: string "pass" +Attribute [case13]: string "pass" +Attribute [case14]: string "pass" +Attribute [case15]: string "pass" +Attribute [case16]: string "pass" +Attribute [case17]: string "pass" +Attribute [case18]: string "pass" +Attribute [case19]: string "pass" +Attribute [case20]: string "pass" +Attribute [case21]: string "pass" +Attribute [case22]: string "pass" +Attribute [case23]: string "pass" diff --git a/centrallix/tests/test_lev_compare_00.to b/centrallix/tests/test_lev_compare_00.to new file mode 100644 index 00000000..5d9cec0f --- /dev/null +++ b/centrallix/tests/test_lev_compare_00.to @@ -0,0 +1,28 @@ +##NAME Levenshtein String Comparison + +# Legacy tests. +query select case1 = condition(lev_compare('hello', 'hello!') >= 0 and lev_compare('hello','hello!') <= 1, 'pass', 'fail') +query select case2 = condition(lev_compare('hello', 'asdfkh') >= 0 and lev_compare('hello','asdfkh') <= 1, 'pass', 'fail') +query select case3 = condition(lev_compare('hello', 'aaaaaaaaaaaaaaaaa') >= 0 and lev_compare('hello','aaaaaaaaaaaaaaaaa') <= 1, 'pass', 'fail') +query select case4 = condition(lev_compare('hello', 'nope') >= 0 and lev_compare('hello', 'nope') <= 1, 'pass', 'fail') +query select case5 = condition(lev_compare('below', 'hello!') >= 0 and lev_compare('below', 'hello!') <= 1, 'pass', 'fail') +query select case6 = condition(lev_compare('kitten', 'smitten') >= 0 and lev_compare('kitten', 'smitten') <= 1, 'pass', 'fail') +query select case7 = condition(lev_compare('hello', 'bobbobbobbob') >= 0 and lev_compare('hello', 'bobbobbobbob') <= 1, 'pass', 'fail') +query select case8 = condition(lev_compare('hello', '') >= 0 and lev_compare('hello', '') <= 1, 'pass', 'fail') +query select case9 = condition(lev_compare('', '') >= 0 and lev_compare('', '') <= 1, 'pass', 'fail') +query select case10 = condition(lev_compare('blooooop', 'blob') >= 0 and lev_compare('blooooop', 'blob') <= 1, 'pass', 'fail') +query select case11 = condition(lev_compare('', '!') >= 0 and lev_compare('','!') <= 1, 'pass', 'fail') +query select case12 = condition(lev_compare('h', 'h') >= 0 and lev_compare('h','h') <= 1, 'pass', 'fail') +query select case13 = condition(lev_compare('hi', 'hi') >= 0 and lev_compare('hi','hi') <= 1, 'pass', 'fail') + +# Kitten tests. +query select case14 = condition(lev_compare('kitten', 'kitten') >= 0.99 and lev_compare('kitten', 'kitten') <= 1.0, 'pass', 'fail') -- 0 edits +query select case15 = condition(lev_compare('kitten', 'skitten') >= 0.8 and lev_compare('kitten', 'skitten') <= 0.9, 'pass', 'fail') -- 1 insert +query select case16 = condition(lev_compare('kitten', 'itten') >= 0.8 and lev_compare('kitten', 'itten') <= 0.9, 'pass', 'fail') -- 1 delete +query select case17 = condition(lev_compare('kitten', 'mitten') >= 0.8 and lev_compare('kitten', 'mitten') <= 0.9, 'pass', 'fail') -- 1 replace +query select case18 = condition(lev_compare('kitten', 'smitten') >= 0.7 and lev_compare('kitten', 'smitten') <= 0.8, 'pass', 'fail') -- 1 insert and one replace +query select case19 = condition(lev_compare('kitten', 'iktten') >= 0.8 and lev_compare('kitten', 'iktten') <= 0.9, 'pass', 'fail') -- 1 transpose +query select case20 = condition(lev_compare('kitten', 'kittens') >= 0.8 and lev_compare('kitten', 'kittens') <= 0.9, 'pass', 'fail') -- 1 insert (end) +query select case21 = condition(lev_compare('kitten', 'kitte') >= 0.8 and lev_compare('kitten', 'kitte') <= 0.9, 'pass', 'fail') -- 1 delete (end) +query select case22 = condition(lev_compare('kitten', 'kittem') >= 0.8 and lev_compare('kitten', 'kittem') <= 0.9, 'pass', 'fail') -- 1 replace (end) +query select case23 = condition(lev_compare('kitten', 'kittne') >= 0.8 and lev_compare('kitten', 'kittne') <= 0.9, 'pass', 'fail') -- 1 transpose (end) diff --git a/centrallix/tests/test_levenshtein_00.cmp b/centrallix/tests/test_levenshtein_00.cmp index 0bc319c9..b95f2d44 100644 --- a/centrallix/tests/test_levenshtein_00.cmp +++ b/centrallix/tests/test_levenshtein_00.cmp @@ -1,6 +1,18 @@ -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 2 -Attribute [sw1]: integer 2 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 +Attribute [case1]: integer 0 +Attribute [case2]: integer 1 +Attribute [case3]: integer 1 +Attribute [case4]: integer 1 +Attribute [case5]: integer 2 +Attribute [case6]: integer 1 +Attribute [case7]: integer 1 +Attribute [case8]: integer 1 +Attribute [case9]: integer 1 +Attribute [case10]: integer 1 +Attribute [case11]: integer 2 +Attribute [case12]: integer 1 +Attribute [case13]: integer 1 +Attribute [case14]: integer 2 +Attribute [case15]: integer 0 +Attribute [case16]: integer 0 +Attribute [case17]: integer 133 +Attribute [case18]: integer 254 diff --git a/centrallix/tests/test_levenshtein_00.to b/centrallix/tests/test_levenshtein_00.to index a666c3a4..a92bdd74 100644 --- a/centrallix/tests/test_levenshtein_00.to +++ b/centrallix/tests/test_levenshtein_00.to @@ -1,8 +1,25 @@ -##NAME Levenshtein String Comparison +##NAME Levenshtein Basic Comparisons -query select sw1 = levenshtein('hello', 'hello!') -query select sw1 = levenshtein('kitten', 'mitten') -query select sw1 = levenshtein('kitten', 'smitten') -query select sw1 = levenshtein('lawn', 'flown') -query select sw1 = levenshtein('kitten', 'itten') -query select sw1 = levenshtein('kitten', 'skitten') +# Kitten tests. +query select case1 = levenshtein('kitten', 'kitten') -- 0 edits +query select case2 = levenshtein('kitten', 'skitten') -- 1 insert +query select case3 = levenshtein('kitten', 'itten') -- 1 delete +query select case4 = levenshtein('kitten', 'mitten') -- 1 replace +query select case5 = levenshtein('kitten', 'smitten') -- 1 insert and 1 replace +query select case6 = levenshtein('kitten', 'iktten') -- 1 transpose +query select case7 = levenshtein('kitten', 'kittens') -- 1 insert (end) +query select case8 = levenshtein('kitten', 'kitte') -- 1 delete (end) +query select case9 = levenshtein('kitten', 'kittem') -- 1 replace (end) +query select case10 = levenshtein('kitten', 'kittne') -- 1 transpose (end) + +# Alternate words. +query select case11 = levenshtein('lawn', 'flown') -- 1 insert and 1 replace +query select case12 = levenshtein('hello', 'hello!') -- 1 insert (end) +query select case13 = levenshtein('zert', 'zerf') -- 1 replace (end) +query select case14 = levenshtein('llearr', 'lear') -- 2 deletes (start & end) + +# Edge cases. +query select case15 = levenshtein('', '') -- 0 edits +query select case16 = levenshtein('This is a very long string!! I do not expect this function to need to process a string longer than this, because this string is a full 254 characters. That is pretty long. The object system limits strings to this size so we cannot make a longer string...', 'This is a very long string!! I do not expect this function to need to process a string longer than this, because this string is a full 254 characters. That is pretty long. The object system limits strings to this size so we cannot make a longer string...') -- 0 edits. +query select case17 = levenshtein('This is a very long string!! I do not expect this function to need to process a string longer than this, because this string is a full 254 characters. That is pretty long. The object system limits strings to this size so we cannot make a longer string...', 'This is quite a lengthy string. I do not expect the function to compute any longer string since this one is a full 254 characters. That is plenty, even if someone adds many contact details to their record!! Thus, this test should cover most cases we see.') -- 133 edits. +query select case18 = levenshtein('AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA', 'BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB') -- 254 replaces. diff --git a/centrallix/tests/test_similarity_00.cmp b/centrallix/tests/test_similarity_00.cmp deleted file mode 100644 index a0d29220..00000000 --- a/centrallix/tests/test_similarity_00.cmp +++ /dev/null @@ -1,5 +0,0 @@ -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 diff --git a/centrallix/tests/test_similarity_00.to b/centrallix/tests/test_similarity_00.to deleted file mode 100644 index a0942ab7..00000000 --- a/centrallix/tests/test_similarity_00.to +++ /dev/null @@ -1,7 +0,0 @@ -##NAME Text Mining String Similarity - -query select sw1 = (cos_compare('hello', 'hello') >= 0.999) and (cos_compare('hello', 'hello') <= 1) -query select sw1 = (cos_compare('hello', 'nancy') <= 0.001) and (cos_compare('hello', 'nancy') >= 0) -query select sw1 = (cos_compare('hello', 'hello world') <= 0.891) and (cos_compare('hello', 'hello world') >= 0.890) -query select sw1 = (cos_compare('hello', 'hellow') >= 0.935) and (cos_compare('hello', 'hellow') <= 0.936) -query select sw1 = (cos_compare('hello', 'hellow', 1) >= 0.935) and (cos_compare('hello', 'hellow', 1) <= 0.936) diff --git a/centrallix/utility/double_metaphone.c b/centrallix/utility/double_metaphone.c new file mode 100644 index 00000000..f37b0964 --- /dev/null +++ b/centrallix/utility/double_metaphone.c @@ -0,0 +1,1573 @@ +/************************************************************************/ +/* Text-DoubleMetaphone */ +/* Centrallix Base Library */ +/* */ +/* Copyright 2000, Maurice Aubrey . */ +/* All rights reserved. */ +/* */ +/* This code is copied for redistribution with modification, from the */ +/* gitpan/Text-DoubleMetaphone implementation on GitHub (1), which is */ +/* under the following license. */ +/* */ +/* This code is based heavily on the C++ implementation by Lawrence */ +/* Philips and incorporates several bug fixes courtesy of Kevin */ +/* Atkinson . */ +/* */ +/* This module is free software; you may redistribute it and/or */ +/* modify it under the same terms as Perl itself. */ +/* */ +/* A summary of the relevant content from https://dev.perl.org/licenses */ +/* has been included below for the convenience of the reader. This */ +/* information was collected and saved on September 5th, 2025 and may */ +/* differ from current information. For the most up to date copy of */ +/* this information, please use the link provided above. */ +/* */ +/* Perl5 is Copyright © 1993 and later, by Larry Wall and others. */ +/* */ +/* It is free software; you can redistribute it and/or modify it */ +/* under the terms of either: */ +/* */ +/* a) the GNU General Public License (2) as published by the Free */ +/* Software Foundation (3); either version 1 (2), or (at your */ +/* option) any later version (4), or */ +/* */ +/* b) the "Artistic License" (5). */ +/* */ +/* Citations: */ +/* 1: https://github.com/gitpan/Text-meta_double_metaphone */ +/* 2: https://dev.perl.org/licenses/gpl1.html */ +/* 3: http://www.fsf.org */ +/* 4: http://www.fsf.org/licenses/licenses.html#GNUGPL */ +/* 5: https://dev.perl.org/licenses/artistic.html */ +/* */ +/* Centrallix is published under the GNU General Public License, */ +/* satisfying the above requirement. A summary of this is included */ +/* below for the convenience of the reader. */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA */ +/* 02111-1307 USA */ +/* */ +/* A copy of the GNU General Public License has been included in this */ +/* distribution in the file "COPYING". */ +/* */ +/* Module: double_metaphone.c, double_metaphone.h */ +/* Author: Maurice Aubrey and Israel Fuller */ +/* Description: This module implements a "sounds like" algorithm by */ +/* Lawrence Philips which he published in the June, 2000 */ +/* issue of C/C++ Users Journal. Double Metaphone is an */ +/* improved version of the original Metaphone algorithm */ +/* written by Philips'. This implementaton was written by */ +/* Maurice Aubrey for C/C++ with bug fixes provided by */ +/* Kevin Atkinson. It was revised by Israel Fuller to */ +/* better align with the Centrallix coding style and */ +/* standards so that it could be included here. */ +/************************************************************************/ + +/*** Note to future programmers reading this file (by Israel Fuller): + *** + *** This file was copied from a GitHub Repo with proper licensing (in case + *** you didn't read the legal stuff above), so feel free to check it out. + *** + *** As for this code, I've modified it to use styling and memory allocation + *** consistent with the rest of the Centrallix codebase. Also, I have added + *** documentation comments and extensive test cases (at the end of the file), + *** however, these reflect my own (possibly incorrect) understanding, which + *** might not line up with the original author. + *** + *** To be honest, though, trying to make this code as readable as possible + *** was very challenging due to all the messy boolean algebra. If there is + *** ever a professional linguist reading this, please factor out some of the + *** logic into local variables with descriptive names so that the rest of us + *** can read this code without our eyes glazing over. + *** + *** If you have any questions, please feel free to reach out to me or Greg. + *** + *** Original Source: https://github.com/gitpan/Text-meta_double_metaphone + ***/ + +#include +#include +#include +#include +#include +#include + +/*** If running in a testing environment, newmalloc is not + *** available, so we fall back to default C memory allocation. + ***/ +#ifndef TESTING +#include "cxlib/newmalloc.h" +#define META_MALLOC(size) nmSysMalloc(size) +#define META_REALLOC(ptr, size) nmSysRealloc(ptr, size) +#define META_FREE(ptr) nmSysFree(ptr) +#else +#include +#define META_MALLOC(size) malloc(size) +#define META_REALLOC(ptr, size) realloc(ptr, size) +#define META_FREE(ptr) free(ptr) +#endif + +/*** Helper function to handle checking for failed memory allocation + *** Author: Israel Fuller. + *** + *** @param ptr Pointer to the memory that should be allocated. + *** @param fname The name of the function invoked to allocate memory. + *** @param size The amount of memory being allocated. + *** @returns The pointer, for chaining. + ***/ +void* +meta_check_allocation(void* ptr, const char* fname, const size_t size) + { + if (ptr == NULL) + { + /** Create the most descriptive error message we can. **/ + char error_buf[BUFSIZ]; + snprintf(error_buf, sizeof(error_buf), "exp_double_metaphone.c: Fail - %s(%lu)", fname, size); + perror(error_buf); + + // Throw error for easier locating in a debugger. + fprintf(stderr, "Program will now crash.\n"); + assert(0); + } + + return ptr; + } + +/** Malloc shortcut macros. **/ +#define SAFE_MALLOC(size) \ + ({ \ + const size_t sz = (size); \ + memset(meta_check_allocation(META_MALLOC(sz), "META_MALLOC", sz), 0, sz); \ + }) +#define SAFE_REALLOC(ptr, size) \ + ({ \ + const size_t sz = (size); \ + meta_check_allocation(META_REALLOC(ptr, sz), "META_REALLOC", sz); \ + }) + +typedef struct + { + char* str; + size_t length; + size_t bufsize; + int free_str_on_destroy; + } +MetaString; + +/*** Allocates a new MetaString. + *** + *** @param init_str The initial size of the string. + *** @returns The new MetaString. + ***/ +MetaString* +meta_new_string(const char* init_str) + { + MetaString *s; + char empty_string[] = ""; + + s = (MetaString*)SAFE_MALLOC(sizeof(MetaString)); + + if (init_str == NULL) + init_str = empty_string; + + s->length = strlen(init_str); + /** Preallocate a bit more for potential growth. **/ + s->bufsize = s->length + 7u; + + s->str = (char*)SAFE_MALLOC(s->bufsize * sizeof(char)); + + strncpy(s->str, init_str, s->length + 1); + s->free_str_on_destroy = 1; + + return s; + } + +/*** Frees a MetaString. + *** + *** @param s The MetaString. + ***/ +void +meta_destroy_string(MetaString* s) + { + if (s == NULL) + return; + + if (s->free_str_on_destroy && s->str != NULL) + META_FREE(s->str); + + META_FREE(s); + + return; + } + +/*** Increases a MetaString's buffer size. + *** + *** @param s The MetaString* being modified. + *** @param chars_needed Minimum number of characters to increase buffer size. + ***/ +void +meta_increase_buffer(MetaString* s, const size_t chars_needed) + { + s->bufsize += chars_needed + 8u; + s->str = SAFE_REALLOC(s->str, s->bufsize * sizeof(char)); + + return; + } + +/*** Convert all characters of a MetaString to uppercase. + *** + *** @param s The MetaString being modified. + ***/ +void +meta_make_upper(MetaString* s) + { + for (char* i = s->str; i[0] != '\0'; i++) + *i = (char)toupper(*i); + + return; + } + +/*** @param s The MetaString being checked. + *** @param pos The character location to check within the MetaString. + *** @returns 1 if the location is out of bounds for the MetaString, + *** 0 otherwise. + ***/ +bool +meta_is_out_of_bounds(MetaString* s, unsigned int pos) + { + return (s->length <= pos); + } + +/*** Checks if a character in a MetaString is a vowel. + *** + *** @param s The MetaString being checked. + *** @param pos The character location to check within the MetaString. + ***/ +bool +meta_is_vowel(MetaString* s, unsigned int pos) + { + if (meta_is_out_of_bounds(s, pos)) return 0; + + const char c = *(s->str + pos); + + return ((c == 'A') || (c == 'E') || (c == 'I') || + (c == 'O') || (c == 'U') || (c == 'Y')); + } + +/*** Search a MetaString for "W", "K", "CZ", or "WITZ", which indicate that the + *** string is Slavo Germanic. + *** + *** @param s The MetaString to be searched. + *** @returns 1 if the MetaString is Slavo Germanic, or 0 otherwise. + ***/ +bool +meta_is_slavo_germanic(MetaString* s) + { + return (strstr(s->str, "W") != NULL) + || (strstr(s->str, "K") != NULL) + || (strstr(s->str, "CZ") != NULL) + || (strstr(s->str, "WITZ") != NULL); + } + +/*** @param s The MetaString being checked. + *** @param pos The character location to check within the MetaString. + *** @returns The character at the position in the MetaString, or + *** '\0' if the position is not in the MetaString. + ***/ +char +meta_get_char_at(MetaString* s, unsigned int pos) + { + return (meta_is_out_of_bounds(s, pos)) ? '\0' : ((char) *(s->str + pos)); + } + +/*** Checks for to see if any of a list of strings appear in a the given + *** MetaString after the given start position. + *** + *** @attention - Note that the START value is 0 based. + *** + *** @param s The MetaString being modified. + *** @param start The zero-based start of at which to begin searching + *** within the MetaString. + *** @param length The length of the character strings being checked. + *** @returns 1 if any of the character sequences appear after the start + *** in the MetaString and 0 otherwise. + ***/ +bool +meta_is_str_at(MetaString* s, unsigned int start, ...) + { + va_list ap; + + /** Should never happen. **/ + if (meta_is_out_of_bounds(s, start)) + return 0; + + const char* pos = (s->str + start); + va_start(ap, start); + + char* test; + do + { + test = va_arg(ap, char*); + if (*test && (strncmp(pos, test, strlen(test)) == 0)) + return true; + } + while (test[0] != '\0'); + + va_end(ap); + + return false; + } + +/*** Adds a string to a MetaString, expanding the MetaString if needed. + *** + *** @param s The MetaString being modified. + *** @param new_str The string being added. + ***/ +void +meta_add_str(MetaString* s, const char* new_str) + { + if (new_str == NULL) + return; + + const size_t add_length = strlen(new_str); + if ((s->length + add_length) > (s->bufsize - 1)) + meta_increase_buffer(s, add_length); + + strcat(s->str, new_str); + s->length += add_length; + + return; + } + +/*** Computes double metaphone. + *** + *** Example Usage: + *** ```c + *** char* primary_code; + *** char* secondary_code; + *** meta_double_metaphone(input, &primary_code, &secondary_code); + *** ``` + *** + *** @param str The string to compute. + *** @param primary_code A pointer to a buffer where the pointer to a string + *** containing the produced primary code will be stored. + *** @param secondary_code A pointer to a buffer where the pointer to a string + *** containing the produced secondary code will be stored. + ***/ +void +meta_double_metaphone(const char* str, char** primary_code, char** secondary_code) + { + size_t length; + + if (primary_code == NULL) + { + fprintf(stderr, "Warning: Call to meta_double_metaphone() is missing a pointer to store primary code.\n"); + return; + } + + if (secondary_code == NULL) + { + fprintf(stderr, "Warning: Call to meta_double_metaphone() is missing a pointer to store secondary code.\n"); + return; + } + + if (str == NULL || (length = strlen(str)) == 0u) + { + fprintf(stderr, "Warning: Call to meta_double_metaphone() with invalid string.\n"); + + /** Double Metaphone on an invalid string yields two empty strings. **/ + *primary_code = (char*)SAFE_MALLOC(sizeof(char)); + *secondary_code = (char*)SAFE_MALLOC(sizeof(char)); + return; + } + unsigned int current = 0; + unsigned int last = (unsigned int)(length - 1); + + /** Pad original so we can index beyond end. **/ + MetaString* original = meta_new_string(str); + meta_make_upper(original); + meta_add_str(original, " "); + + MetaString* primary = meta_new_string(""); + MetaString* secondary = meta_new_string(""); + primary->free_str_on_destroy = 0; + secondary->free_str_on_destroy = 0; + + /** Skip these if they are at start of a word. **/ + if (meta_is_str_at(original, 0, "GN", "KN", "PN", "WR", "PS", "")) + current += 1; + + /** Initial 'X' is pronounced 'Z' e.g. 'Xavier' **/ + const char first_char = meta_get_char_at(original, 0); + if (first_char == 'X') + { + meta_add_str(primary, "S"); /* 'Z' maps to 'S' */ + meta_add_str(secondary, "S"); + current += 1; + } + + /** Precomputing this is useful. **/ + const bool is_slavo_germanic = meta_is_slavo_germanic(original); + + /** Main loop. **/ + while (current < length) + { + const char cur_char = meta_get_char_at(original, current); + const char next_char = meta_get_char_at(original, current + 1); + switch (cur_char) + { + case 'A': + case 'E': + case 'I': + case 'O': + case 'U': + case 'Y': + { + if (current == 0) + { + /** All init vowels now map to 'A'. **/ + meta_add_str(primary, "A"); + meta_add_str(secondary, "A"); + } + current += 1; + break; + } + + case 'B': + { + /** "-mb", e.g", "dumb", already skipped over... **/ + meta_add_str(primary, "P"); + meta_add_str(secondary, "P"); + + current += (next_char == 'B') ? 2 : 1; + break; + } + + case 'C': + { + /** Various germanic. **/ + if ( + (current > 1) + && !meta_is_vowel(original, current - 2) + && meta_is_str_at(original, (current - 1), "ACH", "") + && meta_get_char_at(original, current + 2) != 'I' + && ( + meta_get_char_at(original, current + 2) != 'E' + || meta_is_str_at(original, (current - 2), "BACHER", "MACHER", "") + ) + ) + { + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + current += 2; + break; + } + + /** Special case 'caesar' **/ + if (current == 0 && meta_is_str_at(original, current, "CAESAR", "")) + { + meta_add_str(primary, "S"); + meta_add_str(secondary, "S"); + current += 2; + break; + } + + /** Italian 'chianti' **/ + if (meta_is_str_at(original, current, "CHIA", "")) + { + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + current += 2; + break; + } + + if (meta_is_str_at(original, current, "CH", "")) + { + /** Find 'michael' **/ + if (current > 0 && meta_is_str_at(original, current, "CHAE", "")) + { + meta_add_str(primary, "K"); + meta_add_str(secondary, "X"); + current += 2; + break; + } + + /** Greek roots e.g. 'chemistry', 'chorus' **/ + if ( + current == 0 + && meta_is_str_at(original, (current + 1), "HOR", "HYM", "HIA", "HEM", "HARAC", "HARIS", "") + && !meta_is_str_at(original, 0, "CHORE", "") + ) + { + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + current += 2; + break; + } + + /** Germanic, greek, or otherwise 'ch' for 'kh' sound. */ + if ( + meta_is_str_at(original, 0, "SCH", "VAN ", "VON ", "") + /** 'architect but not 'arch', 'orchestra', 'orchid' **/ + || meta_is_str_at(original, (current - 2), "ORCHES", "ARCHIT", "ORCHID", "") + || meta_is_str_at(original, (current + 2), "T", "S", "") + || ( + (current == 0 || meta_is_str_at(original, (current - 1), "A", "O", "U", "E", "")) + /** e.g., 'wachtler', 'wechsler', but not 'tichner' **/ + && meta_is_str_at(original, (current + 2), "L", "R", "N", "M", "B", "H", "F", "V", "W", " ", "") + ) + ) + { + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + } + else + { + if (current > 0) + { + if (meta_is_str_at(original, 0, "MC", "")) + { + /* e.g., "McHugh" */ + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + } + else + { + meta_add_str(primary, "X"); + meta_add_str(secondary, "K"); + } + } + else + { + meta_add_str(primary, "X"); + meta_add_str(secondary, "X"); + } + } + current += 2; + break; + } + + /** e.g, 'czerny' **/ + if (meta_is_str_at(original, current, "CZ", "") + && !meta_is_str_at(original, (current - 2), "WICZ", "")) + { + meta_add_str(primary, "S"); + meta_add_str(secondary, "X"); + current += 2; + break; + } + + /** e.g., 'focaccia' **/ + if (meta_is_str_at(original, (current + 1), "CIA", "")) + { + meta_add_str(primary, "X"); + meta_add_str(secondary, "X"); + current += 3; + break; + } + + /** Double 'C' rule. **/ + if ( + meta_is_str_at(original, current, "CC", "") + && !(current == 1 && first_char == 'M') /* McClellan exception. */ + ) + { + /** 'bellocchio' but not 'bacchus' **/ + if ( + meta_is_str_at(original, (current + 2), "I", "E", "H", "") + && !meta_is_str_at(original, (current + 2), "HU", "") + ) + { + /** 'accident', 'accede' 'succeed' **/ + if ( + (current == 1 && meta_get_char_at(original, current - 1) == 'A') + || meta_is_str_at(original, (current - 1), "UCCEE", "UCCES", "") + ) + { + meta_add_str(primary, "KS"); + meta_add_str(secondary, "KS"); + /** 'bacci', 'bertucci', other italian **/ + } + else + { + meta_add_str(primary, "X"); + meta_add_str(secondary, "X"); + } + current += 3; + break; + } + else + { /** Pierce's rule **/ + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + current += 2; + break; + } + } + + if (meta_is_str_at(original, current, "CK", "CG", "CQ", "")) + { + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + current += 2; + break; + } + + if (meta_is_str_at(original, current, "CI", "CE", "CY", "")) + { + /* Italian vs. English */ + if (meta_is_str_at(original, current, "CIO", "CIE", "CIA", "")) + { + meta_add_str(primary, "S"); + meta_add_str(secondary, "X"); + } + else + { + meta_add_str(primary, "S"); + meta_add_str(secondary, "S"); + } + current += 2; + break; + } + + /** else **/ + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + + /** Name sent in 'mac caffrey', 'mac gregor **/ + if (meta_is_str_at(original, (current + 1), " C", " Q", " G", "")) + current += 3; + else if (meta_is_str_at(original, (current + 1), "C", "K", "Q", "") + && !meta_is_str_at(original, (current + 1), "CE", "CI", "")) + current += 2; + else + current += 1; + break; + } + + case 'D': + { + if (meta_is_str_at(original, current, "DG", "")) + { + if (meta_is_str_at(original, (current + 2), "I", "E", "Y", "")) + { + /** e.g. 'edge' **/ + meta_add_str(primary, "J"); + meta_add_str(secondary, "J"); + current += 3; + break; + } + else + { + /** e.g. 'edgar' **/ + meta_add_str(primary, "TK"); + meta_add_str(secondary, "TK"); + current += 2; + break; + } + } + + if (meta_is_str_at(original, current, "DT", "DD", "")) + { + meta_add_str(primary, "T"); + meta_add_str(secondary, "T"); + current += 2; + break; + } + + /** else **/ + meta_add_str(primary, "T"); + meta_add_str(secondary, "T"); + current += 1; + break; + } + + case 'F': + { + current += (next_char == 'F') ? 2 : 1; + meta_add_str(primary, "F"); + meta_add_str(secondary, "F"); + break; + } + + case 'G': + { + if (next_char == 'H') + { + /** 'Vghee' */ + if (current > 0 && !meta_is_vowel(original, (current - 1))) + { + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + current += 2; + break; + } + + if (current < 3) + { + /** 'ghislane', 'ghiradelli' **/ + if (current == 0) + { + if (meta_get_char_at(original, (current + 2)) == 'I') + { + meta_add_str(primary, "J"); + meta_add_str(secondary, "J"); + } + else + { + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + } + current += 2; + break; + } + } + + if ( + /** Parker's rule (with some further refinements) - e.g., 'hugh' **/ + (current > 1 && meta_is_str_at(original, (current - 2), "B", "H", "D", "")) + /** e.g., 'bough' **/ + || (current > 2 && meta_is_str_at(original, (current - 3), "B", "H", "D", "")) + /** e.g., 'broughton' **/ + || (current > 3 && meta_is_str_at(original, (current - 4), "B", "H", "")) + ) + { + current += 2; + break; + } + else + { + /** e.g., 'laugh', 'McLaughlin', 'cough', 'gough', 'rough', 'tough' **/ + if ( + current > 2 + && meta_get_char_at(original, (current - 1)) == 'U' + && meta_is_str_at(original, (current - 3), "C", "G", "L", "R", "T", "") + ) + { + meta_add_str(primary, "F"); + meta_add_str(secondary, "F"); + } + else if (current > 0 && meta_get_char_at(original, (current - 1)) != 'I') + { + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + } + + current += 2; + break; + } + } + + if (next_char == 'N') + { + if (current == 1 && !is_slavo_germanic && meta_is_vowel(original, 0)) + { + meta_add_str(primary, "KN"); + meta_add_str(secondary, "N"); + } + else + /** not e.g. 'cagney' **/ + if ( + next_char != 'Y' + && !is_slavo_germanic + && !meta_is_str_at(original, (current + 2), "EY", "") + ) + { + meta_add_str(primary, "N"); + meta_add_str(secondary, "KN"); + } + else + { + meta_add_str(primary, "KN"); + meta_add_str(secondary, "KN"); + } + current += 2; + break; + } + + /** 'tagliaro' **/ + if ( + !is_slavo_germanic + && meta_is_str_at(original, (current + 1), "LI", "") + ) + { + meta_add_str(primary, "KL"); + meta_add_str(secondary, "L"); + current += 2; + break; + } + + /** -ges-,-gep-,-gel-, -gie- at beginning **/ + if ( + current == 0 + && ( + next_char == 'Y' + || meta_is_str_at( + original, (current + 1), + "ES", "EP", "EB", "EL", "EY", "IB", + "IL", "IN", "IE", "EI", "ER", "" + ) + ) + ) + { + meta_add_str(primary, "K"); + meta_add_str(secondary, "J"); + current += 2; + break; + } + + /** -ger-, -gy- **/ + if ( + (next_char == 'Y' || meta_is_str_at(original, (current + 1), "ER", "")) + /** Exceptions. **/ + && !meta_is_str_at(original, 0, "DANGER", "RANGER", "MANGER", "") + && !meta_is_str_at(original, (current - 1), "E", "I", "RGY", "OGY", "") + ) + { + meta_add_str(primary, "K"); + meta_add_str(secondary, "J"); + current += 2; + break; + } + + /** Italian e.g, 'biaggi' **/ + if ( + meta_is_str_at(original, (current + 1), "E", "I", "Y", "") + || meta_is_str_at(original, (current - 1), "AGGI", "OGGI", "") + ) + { + /** Obvious germanic. **/ + if (meta_is_str_at(original, 0, "SCH", "VAN ", "VON ", "") + || meta_is_str_at(original, (current + 1), "ET", "")) + { + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + } + else + { + /** Always soft, if french ending. **/ + if (meta_is_str_at(original, (current + 1), "IER ", "")) + { + meta_add_str(primary, "J"); + meta_add_str(secondary, "J"); + } + else + { + meta_add_str(primary, "J"); + meta_add_str(secondary, "K"); + } + } + current += 2; + break; + } + + current += (next_char == 'G') ? 2 : 1; + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + break; + } + + case 'H': + { + /** Only keep if first & before vowel or between 2 vowels. **/ + if ( + (current == 0 || meta_is_vowel(original, (current - 1))) + && meta_is_vowel(original, current + 1) + ) + { + meta_add_str(primary, "H"); + meta_add_str(secondary, "H"); + current += 2; + } + else /* also takes care of 'HH' */ + current += 1; + break; + } + + case 'J': + { + /** Obvious spanish, 'jose', 'san jacinto' **/ + const bool has_jose_next = meta_is_str_at(original, current, "JOSE", ""); + const bool starts_with_san = meta_is_str_at(original, 0, "SAN ", ""); + if (has_jose_next || starts_with_san) + { + if ( + starts_with_san + /** I don't know what this condition means. **/ + || (current == 0 && meta_get_char_at(original, current + 4) == ' ') + ) + { + meta_add_str(primary, "H"); + meta_add_str(secondary, "H"); + } + else + { + meta_add_str(primary, "J"); + meta_add_str(secondary, "H"); + } + current += 1; + break; + } + + if (current == 0 && !has_jose_next) + { + meta_add_str(primary, "J"); /* Yankelovich/Jankelowicz */ + meta_add_str(secondary, "A"); + } + else + { + /** spanish pron. of e.g. 'bajador' **/ + if ( + !is_slavo_germanic + && (next_char == 'A' || next_char == 'O') + && meta_is_vowel(original, (current - 1)) + ) + { + meta_add_str(primary, "J"); + meta_add_str(secondary, "H"); + } + else + { + if (current == last) + { + meta_add_str(primary, "J"); + meta_add_str(secondary, ""); + } + else + { + if ( + !meta_is_str_at(original, (current + 1), "L", "T", "K", "S", "N", "M", "B", "Z", "") + && !meta_is_str_at(original, (current - 1), "S", "K", "L", "") + ) + { + meta_add_str(primary, "J"); + meta_add_str(secondary, "J"); + } + } + } + } + + current += (next_char == 'J') ? 2 : 1; + break; + } + + case 'K': + { + current += (next_char == 'K') ? 2 : 1; + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + break; + } + + case 'L': + { + if (next_char == 'L') + { + /** Spanish e.g. 'cabrillo', 'gallegos' **/ + if ( + ( + current == length - 3 + && meta_is_str_at(original, (current - 1), "ILLO", "ILLA", "ALLE", "") + ) + || ( + meta_is_str_at(original, (current - 1), "ALLE", "") + && ( + meta_is_str_at(original, (last - 1), "AS", "OS", "") + || meta_is_str_at(original, last, "A", "O", "") + ) + ) + ) + { + meta_add_str(primary, "L"); + meta_add_str(secondary, ""); + current += 2; + break; + } + current += 2; + } + else + current += 1; + meta_add_str(primary, "L"); + meta_add_str(secondary, "L"); + break; + } + + case 'M': + { + current += ( + ( + meta_is_str_at(original, (current - 1), "UMB", "") + && (current + 1 == last || meta_is_str_at(original, (current + 2), "ER", "")) + ) + /** 'dumb','thumb' **/ + || next_char == 'M' + ) ? 2 : 1; + meta_add_str(primary, "M"); + meta_add_str(secondary, "M"); + break; + } + + case 'N': + { + current += (next_char == 'N') ? 2 : 1; + meta_add_str(primary, "N"); + meta_add_str(secondary, "N"); + break; + } + + case 'P': + { + if (next_char == 'H') + { + meta_add_str(primary, "F"); + meta_add_str(secondary, "F"); + current += 2; + break; + } + + /** Also account for "campbell", "raspberry" **/ + current += (meta_is_str_at(original, (current + 1), "P", "B", "")) ? 2 : 1; + meta_add_str(primary, "P"); + meta_add_str(secondary, "P"); + break; + } + + case 'Q': + { + current += (next_char == 'Q') ? 2 : 1; + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + break; + } + + case 'R': + { + /** French e.g. 'rogier', but exclude 'hochmeier' **/ + const bool no_primary = ( + !is_slavo_germanic + && current == last + && meta_is_str_at(original, (current - 2), "IE", "") + && !meta_is_str_at(original, (current - 4), "ME", "MA", "") + ); + + meta_add_str(primary, (no_primary) ? "" : "R"); + meta_add_str(secondary, "R"); + current += (next_char == 'R') ? 2 : 1; + break; + } + + case 'S': + { + /** Special cases 'island', 'isle', 'carlisle', 'carlysle' **/ + if (meta_is_str_at(original, (current - 1), "ISL", "YSL", "")) + { + current += 1; + break; + } + + /** Special case 'sugar-' **/ + if (current == 0 && meta_is_str_at(original, current, "SUGAR", "")) + { + meta_add_str(primary, "X"); + meta_add_str(secondary, "S"); + current += 1; + break; + } + + if (meta_is_str_at(original, current, "SH", "")) + { + const bool germanic = meta_is_str_at(original, (current + 1), "HEIM", "HOEK", "HOLM", "HOLZ", ""); + const char* sound = (germanic) ? "S" : "X"; + meta_add_str(primary, sound); + meta_add_str(secondary, sound); + current += 2; + break; + } + + /** Italian & Armenian. **/ + if (meta_is_str_at(original, current, "SIO", "SIA", "SIAN", "")) + { + meta_add_str(primary, "S"); + meta_add_str(secondary, (is_slavo_germanic) ? "S" : "X"); + current += 3; + break; + } + + /** german & anglicisations, e.g. 'smith' match 'schmidt', 'snider' match 'schneider' **/ + /** also, -sz- in slavic language although in hungarian it is pronounced 's' **/ + if (current == 0 && meta_is_str_at(original, (current + 1), "M", "N", "L", "W", "")) + { + meta_add_str(primary, "S"); + meta_add_str(secondary, "X"); + current += 1; + break; + } + if (meta_is_str_at(original, (current + 1), "Z", "")) + { + meta_add_str(primary, "S"); + meta_add_str(secondary, "X"); + current += 2; + break; + } + + if (meta_is_str_at(original, current, "SC", "")) + { + /** Schlesinger's rule. **/ + if (meta_get_char_at(original, current + 2) == 'H') + { + /** Dutch origin, e.g. 'school', 'schooner' **/ + if (meta_is_str_at(original, (current + 3), "OO", "ER", "EN", "UY", "ED", "EM", "")) + { + /** 'schermerhorn', 'schenker' **/ + const bool x_sound = meta_is_str_at(original, (current + 3), "ER", "EN", ""); + meta_add_str(primary, (x_sound) ? "X" : "SK"); + meta_add_str(secondary, "SK"); + current += 3; + break; + } + else + { + const bool s_sound = ( + current == 0 + && !meta_is_vowel(original, 3) + && meta_get_char_at(original, 3) != 'W' + ); + meta_add_str(primary, "X"); + meta_add_str(secondary, (s_sound) ? "S" : "X"); + current += 3; + break; + } + } + + /** Default case. **/ + const char* sound = (meta_is_str_at(original, (current + 2), "E", "I", "Y", "")) ? "S" : "SK"; + meta_add_str(primary, sound); + meta_add_str(secondary, sound); + current += 3; + break; + } + + /** French e.g. 'resnais', 'artois' **/ + const bool no_primary = (current == last && meta_is_str_at(original, (current - 2), "AI", "OI", "")); + meta_add_str(primary, (no_primary) ? "" : "S"); + meta_add_str(secondary, "S"); + current += (meta_is_str_at(original, (current + 1), "S", "Z", "")) ? 2 : 1; + break; + } + + case 'T': + { + if (meta_is_str_at(original, current, "TIA", "TCH", "TION", "")) + { + meta_add_str(primary, "X"); + meta_add_str(secondary, "X"); + current += 3; + break; + } + + if (meta_is_str_at(original, current, "TH", "TTH", "")) + { + /** Special case 'thomas', 'thames' or germanic. **/ + if ( + meta_is_str_at(original, (current + 2), "OM", "AM", "") + || meta_is_str_at(original, 0, "SCH", "VAN ", "VON ", "") + ) + meta_add_str(primary, "T"); + else + meta_add_str(primary, "0"); /* Yes, zero. */ + meta_add_str(secondary, "T"); + current += 2; + break; + } + + meta_add_str(primary, "T"); + meta_add_str(secondary, "T"); + current += (meta_is_str_at(original, (current + 1), "T", "D", "")) ? 2 : 1; + break; + } + + case 'V': + { + meta_add_str(primary, "F"); + meta_add_str(secondary, "F"); + current += (next_char == 'V') ? 2 : 1; + break; + } + + case 'W': + { + /** Can also be in middle of word. **/ + if (meta_is_str_at(original, current, "WR", "")) + { + meta_add_str(primary, "R"); + meta_add_str(secondary, "R"); + current += 2; + break; + } + + const bool next_is_vowel = meta_is_vowel(original, current + 1); + if (current == 0 && (next_is_vowel || meta_is_str_at(original, current, "WH", ""))) + { + /** Wasserman should match Vasserman. **/ + meta_add_str(primary, "A"); + meta_add_str(secondary, (next_is_vowel) ? "F" : "A"); + } + + /** Arnow should match Arnoff. **/ + if ((current == last && meta_is_vowel(original, current - 1)) + || meta_is_str_at(original, (current - 1), "EWSKI", "EWSKY", "OWSKI", "OWSKY", "") + || meta_is_str_at(original, 0, "SCH", "") + ) + { + meta_add_str(primary, ""); + meta_add_str(secondary, "F"); + current += 1; + break; + } + + /** Polish e.g. 'filipowicz' **/ + if (meta_is_str_at(original, current, "WICZ", "WITZ", "")) + { + meta_add_str(primary, "TS"); + meta_add_str(secondary, "FX"); + current += 4; + break; + } + + /** Else skip it. **/ + current += 1; + break; + } + + case 'X': + { + /** French e.g. breaux **/ + const bool silent = ( + current == last + && ( + meta_is_str_at(original, (current - 2), "AU", "OU", "") + || meta_is_str_at(original, (current - 3), "IAU", "EAU", "") + ) + ); + if (!silent) + { + meta_add_str(primary, "KS"); + meta_add_str(secondary, "KS"); + } + + current += (meta_is_str_at(original, (current + 1), "C", "X", "")) ? 2 : 1; + break; + } + + case 'Z': + { + /** Chinese pinyin e.g. 'zhao' **/ + if (next_char == 'H') + { + meta_add_str(primary, "J"); + meta_add_str(secondary, "J"); + current += 2; + break; + } + + const bool has_t_sound = ( + meta_is_str_at(original, (current + 1), "ZO", "ZI", "ZA", "") + || (is_slavo_germanic && current > 0 && meta_get_char_at(original, (current - 1)) != 'T') + ); + meta_add_str(primary, "S"); + meta_add_str(secondary, (has_t_sound) ? "TS" : "S"); + current += (next_char == 'Z') ? 2 : 1; + break; + } + + default: + current += 1; + } + } + + *primary_code = primary->str; + *secondary_code = secondary->str; + + meta_destroy_string(original); + meta_destroy_string(primary); + meta_destroy_string(secondary); + + return; + } + +#ifdef TESTING +/*** Built in test cases, written by Israel with inspiration from comments in + *** the above code, test cases written by Maurice Aubrey, and some words + *** suggested by AI. + *** + *** These tests have been integrated into the Centrallix testing environment, + *** where they can be run using `export TONLY=exp_fn_double_metaphone_00`, + *** followed by make test, in the Centrallix directory. + *** + *** The can also be run here by executing the following commands in the + *** centrallix/expression directory, which aditionally generates a coverage + *** report. These tests cover all parts of the double metaphone algorithm, + *** although some of the error cases in various helper functions (such as + *** meta_destroy_string(null)) are not covered by testing. + *** + *** Commands: + *** gcc exp_double_metaphone.c -o exp_double_metaphone.o -I .. -DTESTING -fprofile-arcs -ftest-coverage -O0 + *** ./exp_double_metaphone.o + *** gcov exp_double_metaphone.c + ***/ + +unsigned int num_tests_passed = 0u, num_tests_failed = 0u; + +void +test(const char* input, const char* expected_primary, const char* expected_secondary) + { + char* codes[2]; + + /** Run DoubleMetaphone() and extract results. **/ + char* actual_primary; + char* actual_secondary; + meta_double_metaphone( + input, + memset(&actual_primary, 0, sizeof(actual_primary)), + memset(&actual_secondary, 0, sizeof(actual_secondary)) + ); + + /** Test for correct value. **/ + if (!strcmp(expected_primary, actual_primary) && + !strcmp(expected_secondary, actual_secondary)) + num_tests_passed++; + else + { + printf( + "\nTEST FAILED: \"%s\"\n" + "Expected: %s %s\n" + "Actual: %s %s\n", + input, + expected_primary, expected_secondary, + actual_primary, actual_secondary + ); + num_tests_failed++; + } + + return; + } + +// Special thanks to the following websites for double checking the correct results: +// 1: https://words.github.io/double-metaphone +// 2: https://mainegenealogy.net/metaphone_converter.asp +// 3: https://en.toolpage.org/tool/metaphone +void +run_tests(void) + { + printf("\nRunning tests...\n"); + + /** Test that always fails. **/ + // test("This", "test", "fails."); + + /** Invalid string tests, by Israel. **/ + fprintf(stderr, "Expect two warnings between these two lines:\n"); + fprintf(stderr, "----------------\n"); + test(NULL, "", ""); + test("", "", ""); + fprintf(stderr, "----------------\n"); + + /** Basic tests, by Israel. **/ + test("Test", "TST", "TST"); + test("Basic", "PSK", "PSK"); + test("Centrallix", "SNTRLKS", "SNTRLKS"); + test("Lawrence", "LRNS", "LRNS"); + test("Philips", "FLPS", "FLPS"); + test("Acceptingness", "AKSPTNNS", "AKSPTNKNS"); + test("Supercalifragilisticexpialidocious", "SPRKLFRJLSTSKSPLTSS", "SPRKLFRKLSTSKSPLTXS"); + test("Suoicodilaipxecitsiligarfilacrepus", "SKTLPKSSTSLKRFLKRPS", "SKTLPKSSTSLKRFLKRPS"); + + /** Match tests, from code comments above. **/ + test("Smith", "SM0", "XMT"); + test("Schmidt", "XMT", "SMT"); + test("Snider", "SNTR", "XNTR"); + test("Schneider", "XNTR", "SNTR"); + test("Arnow", "ARN", "ARNF"); + test("Arnoff", "ARNF", "ARNF"); + + /** Example tests, from examples in code comments above. **/ + test("Accede", "AKST", "AKST"); + test("Accident", "AKSTNT", "AKSTNT"); + test("Actually", "AKTL", "AKTL"); + test("Arch", "ARX", "ARK"); + test("Artois", "ART", "ARTS"); + test("Bacchus", "PKS", "PKS"); + test("Bacci", "PX", "PX"); + test("Bajador", "PJTR", "PHTR"); + test("Bellocchio", "PLX", "PLX"); + test("Bertucci", "PRTX", "PRTX"); + test("Biaggi", "PJ", "PK"); + test("Bough", "P", "P"); + test("Breaux", "PR", "PR"); + test("Broughton", "PRTN", "PRTN"); + test("Cabrillo", "KPRL", "KPR"); + test("Caesar", "SSR", "SSR"); + test("Cagney", "KKN", "KKN"); + test("Campbell", "KMPL", "KMPL"); + test("Carlisle", "KRLL", "KRLL"); + test("Carlysle", "KRLL", "KRLL"); + test("Chemistry", "KMSTR", "KMSTR"); + test("Chianti", "KNT", "KNT"); + test("Chorus", "KRS", "KRS"); + test("Cough", "KF", "KF"); + test("Czerny", "SRN", "XRN"); + test("Dumb", "TM", "TM"); + test("Edgar", "ATKR", "ATKR"); + test("Edge", "AJ", "AJ"); + test("Filipowicz", "FLPTS", "FLPFX"); + test("Focaccia", "FKX", "FKX"); + test("Gallegos", "KLKS", "KKS"); + test("Germanic", "KRMNK", "JRMNK"); + test("Ghiradelli", "JRTL", "JRTL"); + test("Ghislane", "JLN", "JLN"); + test("Gospel", "KSPL", "KSPL"); + test("Gough", "KF", "KF"); + test("Greek", "KRK", "KRK"); + test("Hochmeier", "HKMR", "HKMR"); + test("Hugh", "H", "H"); + test("Island", "ALNT", "ALNT"); + test("Isle", "AL", "AL"); + test("Italian", "ATLN", "ATLN"); + test("Jankelowicz", "JNKLTS", "ANKLFX"); + test("Jose", "HS", "HS"); + test("Laugh", "LF", "LF"); + test("Mac Caffrey", "MKFR", "MKFR"); + test("Mac Gregor", "MKRKR", "MKRKR"); + test("Manager", "MNKR", "MNJR"); + test("McHugh", "MK", "MK"); + test("McLaughlin", "MKLFLN", "MKLFLN"); + test("Michael", "MKL", "MXL"); + test("Middle", "MTL", "MTL"); + test("Orchestra", "ARKSTR", "ARKSTR"); + test("Orchid", "ARKT", "ARKT"); + test("Pinyin", "PNN", "PNN"); + test("Raspberry", "RSPR", "RSPR"); + test("Resnais", "RSN", "RSNS"); + test("Rogier", "RJ", "RJR"); + test("Rough", "RF", "RF"); + test("Salvador", "SLFTR", "SLFTR"); + test("San jacinto", "SNHSNT", "SNHSNT"); + test("Schenker", "XNKR", "SKNKR"); + test("Schermerhorn", "XRMRRN", "SKRMRRN"); + test("Schlesinger", "XLSNKR", "SLSNJR"); + test("School", "SKL", "SKL"); + test("Schooner", "SKNR", "SKNR"); + test("Succeed", "SKST", "SKST"); + test("Sugar", "XKR", "SKR"); + test("Sugary", "XKR", "SKR"); + test("Tagliaro", "TKLR", "TLR"); + test("Thames", "TMS", "TMS"); + test("Thomas", "TMS", "TMS"); + test("Thumb", "0M", "TM"); + test("Tichner", "TXNR", "TKNR"); + test("Tough", "TF", "TF"); + test("Vghee", "FK", "FK"); + test("Wachtler", "AKTLR", "FKTLR"); + test("Wechsler", "AKSLR", "FKSLR"); + test("Word", "ART", "FRT"); + test("Xavier", "SF", "SFR"); + test("Yankelovich", "ANKLFX", "ANKLFK"); + test("Zhao", "J", "J"); + + /** Interesting Edge Case: "McClellan" **/ + /*** Note: Sources (1) and (3) both include a double K ("MKKLLN"), but the + *** original code on GitHub and mainegenealogy.net do not. I chose "MKLLN" + *** to be correct because I personally do not pronounce the second c. + ***/ + test("McClellan", "MKLLN", "MKLLN"); + + /** Maurice Aubrey's Tests. **/ + /** Source: https://github.com/gitpan/Text-DoubleMetaphone/blob/master/t/words.txt **/ + test("maurice", "MRS", "MRS"); + test("aubrey", "APR", "APR"); + test("cambrillo", "KMPRL", "KMPR"); + test("heidi", "HT", "HT"); + test("katherine", "K0RN", "KTRN"); + test("catherine", "K0RN", "KTRN"); + test("richard", "RXRT", "RKRT"); + test("bob", "PP", "PP"); + test("eric", "ARK", "ARK"); + test("geoff", "JF", "KF"); + test("dave", "TF", "TF"); + test("ray", "R", "R"); + test("steven", "STFN", "STFN"); + test("bryce", "PRS", "PRS"); + test("randy", "RNT", "RNT"); + test("bryan", "PRN", "PRN"); + test("brian", "PRN", "PRN"); + test("otto", "AT", "AT"); + test("auto", "AT", "AT"); + + /** GPT-5 Coverage Tests. **/ + /*** GPT-5 mini (Preview) running in GitHub Copilot suggested the words + *** after analizing a generated coverage report, and I (Israel) used + *** them to write the tests below. I kept the AI's reasoning for tests, + *** while removing tests that did not contribute any coverage, but after + *** a few reprompts, the AI started just giving words without reasoning. + *** I guess we were both getting pretty tired of writing tests. + ***/ + test("Abbott", "APT", "APT"); /* double-B ("BB") handling. */ + test("Back", "PK", "PK"); /* "CK"/"CG"/"CQ" branch. */ + test("Bacher", "PKR", "PKR"); /* matches "...BACHER" / ACH special-case. */ + test("Charles", "XRLS", "XRLS"); /* initial "CH" -> the branch that maps to "X"/"X" at start. */ + test("Ghana", "KN", "KN"); /* initial "GH" special-start handling. */ + test("Gnome", "NM", "NM"); /* "GN" sequence handling. */ + test("Raj", "RJ", "R"); /* J at end (exercise J-last behavior). */ + test("Quentin", "KNTN", "KNTN"); /* Q case (Q -> K mapping). */ + test("Who", "A", "A"); /* "WH" at start handling. */ + test("Shoemaker", "XMKR", "XMKR"); /* "SH" general mapping paths. */ + test("Sian", "SN", "XN"); /* "SIO"/"SIA"/"SIAN" branch. */ + test("Scold", "SKLT", "SKLT"); /* "SC" default / "SK" vs other SC subcases. */ + test("Station", "STXN", "STXN"); /* "TION" -> X mapping. */ + test("Match", "MX", "MX"); /* "TCH"/"TIA" -> X mapping. */ + test("Pizza", "PS", "PTS"); /* double-Z ("ZZ") handling. */ + test("Agnes", "AKNS", "ANS"); /* "GN" at index 1 (GN handling that yields KN / N). */ + test("Science", "SNS", "SNS"); /* "SC" followed by I (SC + I/E/Y branch). */ + test("Van Gogh", "FNKK", "FNKK"); + test("Josef", "JSF", "HSF"); + test("Object", "APJKT", "APJKT"); + test("Sholz", "SLS", "SLS"); + test("Scharf", "XRF", "XRF"); + test("Kasia", "KS", "KS"); + test("Van Geller", "FNKLR", "FNKLR"); + + const unsigned int total_tests = num_tests_passed + num_tests_failed; + printf("\nTests completed!\n"); + printf(" > Failed: %u\n", num_tests_failed); + printf(" > Skipped: %u\n", 0u); /* Implementation removed. */ + printf(" > Passed: %u/%u\n", num_tests_passed, total_tests); + + return; + } + +int main(void) + { + run_tests(); + + return 0; + } + +/** Prevent scope leak. **/ +#undef META_FREE +#undef META_MALLOC +#undef META_REALLOC +#undef SAFE_MALLOC +#undef SAFE_REALLOC + +#endif