|
| 1 | +// test_data_validation.c — Unit tests for token-data hardening helpers |
| 2 | +#include <stdbool.h> |
| 3 | +#include <stdint.h> |
| 4 | +#include <stdio.h> |
| 5 | +#include <string.h> |
| 6 | + |
| 7 | +#include "data_validation.h" |
| 8 | + |
// Aggregated test-run counters; one instance is threaded through every test.
typedef struct {
    int passed;  // test functions that completed all of their checks
    int failed;  // test functions aborted early by a failed CHECK_* macro
} TestStats;
| 13 | + |
// Check a condition inside a test function.  On failure: log the message with
// source location to stderr, bump the failure counter, and return from the
// enclosing (void) test function so later checks that depend on this one are
// skipped.  Standard do/while(0) wrapper so the macro is a single statement.
#define CHECK_TRUE(stats, cond, msg) \
    do { \
        if (!(cond)) { \
            fprintf(stderr, "FAIL: %s (%s:%d)\n", msg, __FILE__, __LINE__); \
            (stats)->failed++; \
            return; \
        } \
    } while (0)

// Both equality checks delegate to CHECK_TRUE with ==; the two names exist
// only to document the intended operand types (int-like vs size_t-like).
#define CHECK_EQ_INT(stats, got, want, msg) CHECK_TRUE((stats), (got) == (want), msg)
#define CHECK_EQ_SIZE(stats, got, want, msg) CHECK_TRUE((stats), (got) == (want), msg)
| 25 | + |
// Deterministic PRNG for the randomized tests: a 32-bit linear congruential
// generator (multiplier/increment are the well-known Numerical Recipes
// constants).  Advances *state in place and returns the new value.
static uint32_t lcg_next(uint32_t *state) {
    uint32_t x = *state;
    x = 1664525u * x + 1013904223u;
    *state = x;
    return x;
}
| 30 | + |
| 31 | +static void test_bytes_to_token_count_even(TestStats *stats) { |
| 32 | + size_t n_tokens = 0; |
| 33 | + size_t extra = 99; |
| 34 | + CHECK_TRUE(stats, token_data_bytes_to_token_count(1024, &n_tokens, &extra), |
| 35 | + "even byte length should map to token count"); |
| 36 | + CHECK_EQ_SIZE(stats, n_tokens, 512, "1024 bytes should map to 512 tokens"); |
| 37 | + CHECK_EQ_SIZE(stats, extra, 0, "even byte length should have zero remainder"); |
| 38 | + stats->passed++; |
| 39 | +} |
| 40 | + |
| 41 | +static void test_bytes_to_token_count_odd(TestStats *stats) { |
| 42 | + size_t n_tokens = 0; |
| 43 | + size_t extra = 0; |
| 44 | + CHECK_TRUE(stats, !token_data_bytes_to_token_count(1025, &n_tokens, &extra), |
| 45 | + "odd byte length should fail alignment check"); |
| 46 | + CHECK_EQ_SIZE(stats, n_tokens, 512, "odd byte length should still report floor token count"); |
| 47 | + CHECK_EQ_SIZE(stats, extra, 1, "1025 bytes should report one extra byte"); |
| 48 | + stats->passed++; |
| 49 | +} |
| 50 | + |
| 51 | +static void test_bytes_to_token_count_null_outputs(TestStats *stats) { |
| 52 | + CHECK_TRUE(stats, token_data_bytes_to_token_count(8, NULL, NULL), |
| 53 | + "alignment helper should work with null output pointers"); |
| 54 | + CHECK_TRUE(stats, !token_data_bytes_to_token_count(9, NULL, NULL), |
| 55 | + "alignment helper should fail odd byte length with null outputs"); |
| 56 | + stats->passed++; |
| 57 | +} |
| 58 | + |
| 59 | +static void test_min_tokens_boundary(TestStats *stats) { |
| 60 | + size_t required = 0; |
| 61 | + CHECK_TRUE(stats, token_data_has_min_tokens(257, 256, &required), "257 tokens should satisfy seq=256"); |
| 62 | + CHECK_EQ_SIZE(stats, required, 257, "required tokens should be seq+1"); |
| 63 | + stats->passed++; |
| 64 | +} |
| 65 | + |
| 66 | +static void test_min_tokens_short(TestStats *stats) { |
| 67 | + size_t required = 0; |
| 68 | + CHECK_TRUE(stats, !token_data_has_min_tokens(256, 256, &required), "256 tokens should fail seq=256"); |
| 69 | + CHECK_EQ_SIZE(stats, required, 257, "required tokens should still be seq+1"); |
| 70 | + stats->passed++; |
| 71 | +} |
| 72 | + |
| 73 | +static void test_min_tokens_negative_seq(TestStats *stats) { |
| 74 | + size_t required = 777; |
| 75 | + CHECK_TRUE(stats, !token_data_has_min_tokens(10, -1, &required), "negative seq should fail min-token check"); |
| 76 | + CHECK_EQ_SIZE(stats, required, 777, "required token out param should remain unchanged for invalid seq"); |
| 77 | + stats->passed++; |
| 78 | +} |
| 79 | + |
| 80 | +static void test_validate_too_short(TestStats *stats) { |
| 81 | + uint16_t tokens[2] = {1, 2}; |
| 82 | + TokenDataValidationError err = {0}; |
| 83 | + TokenDataValidationCode code = token_data_validate(tokens, 2, 4, 32000, &err); |
| 84 | + CHECK_EQ_INT(stats, code, TOKEN_DATA_ERR_TOO_SHORT, "too-short dataset should fail"); |
| 85 | + CHECK_EQ_SIZE(stats, err.required_tokens, 5, "required token count should be reported"); |
| 86 | + stats->passed++; |
| 87 | +} |
| 88 | + |
| 89 | +static void test_validate_too_short_precedes_oob(TestStats *stats) { |
| 90 | + uint16_t tokens[2] = {65000, 1}; |
| 91 | + TokenDataValidationError err = {0}; |
| 92 | + TokenDataValidationCode code = token_data_validate(tokens, 2, 4, 32000, &err); |
| 93 | + CHECK_EQ_INT(stats, code, TOKEN_DATA_ERR_TOO_SHORT, "too-short check should happen before OOB check"); |
| 94 | + CHECK_EQ_SIZE(stats, err.required_tokens, 5, "required token count should still be reported"); |
| 95 | + stats->passed++; |
| 96 | +} |
| 97 | + |
| 98 | +static void test_validate_too_short_with_null_err(TestStats *stats) { |
| 99 | + uint16_t tokens[2] = {1, 2}; |
| 100 | + TokenDataValidationCode code = token_data_validate(tokens, 2, 4, 32000, NULL); |
| 101 | + CHECK_EQ_INT(stats, code, TOKEN_DATA_ERR_TOO_SHORT, "validation should work when err output is null"); |
| 102 | + stats->passed++; |
| 103 | +} |
| 104 | + |
| 105 | +static void test_validate_oob_first(TestStats *stats) { |
| 106 | + uint16_t tokens[6] = {32000, 1, 2, 3, 4, 5}; |
| 107 | + TokenDataValidationError err = {0}; |
| 108 | + TokenDataValidationCode code = token_data_validate(tokens, 6, 4, 32000, &err); |
| 109 | + CHECK_EQ_INT(stats, code, TOKEN_DATA_ERR_OOB_TOKEN, "first token OOB should fail"); |
| 110 | + CHECK_EQ_SIZE(stats, err.bad_index, 0, "bad index should point to first token"); |
| 111 | + CHECK_EQ_INT(stats, err.bad_token, 32000, "bad token value should be reported"); |
| 112 | + stats->passed++; |
| 113 | +} |
| 114 | + |
| 115 | +static void test_validate_oob_middle(TestStats *stats) { |
| 116 | + uint16_t tokens[7] = {1, 2, 3, 65535, 4, 5, 6}; |
| 117 | + TokenDataValidationError err = {0}; |
| 118 | + TokenDataValidationCode code = token_data_validate(tokens, 7, 4, 32000, &err); |
| 119 | + CHECK_EQ_INT(stats, code, TOKEN_DATA_ERR_OOB_TOKEN, "middle token OOB should fail"); |
| 120 | + CHECK_EQ_SIZE(stats, err.bad_index, 3, "bad index should point to middle token"); |
| 121 | + CHECK_EQ_INT(stats, err.bad_token, 65535, "bad token value should be reported"); |
| 122 | + stats->passed++; |
| 123 | +} |
| 124 | + |
| 125 | +static void test_validate_oob_last(TestStats *stats) { |
| 126 | + uint16_t tokens[6] = {1, 2, 3, 4, 5, 40000}; |
| 127 | + TokenDataValidationError err = {0}; |
| 128 | + TokenDataValidationCode code = token_data_validate(tokens, 6, 4, 32000, &err); |
| 129 | + CHECK_EQ_INT(stats, code, TOKEN_DATA_ERR_OOB_TOKEN, "last token OOB should fail"); |
| 130 | + CHECK_EQ_SIZE(stats, err.bad_index, 5, "bad index should point to last token"); |
| 131 | + CHECK_EQ_INT(stats, err.bad_token, 40000, "bad token value should be reported"); |
| 132 | + stats->passed++; |
| 133 | +} |
| 134 | + |
// A fully in-range dataset (all tokens < vocab=32000, length > seq=4) must
// validate cleanly.
static void test_validate_ok(TestStats *stats) {
    uint16_t tokens[8] = {0, 1, 2, 3, 4, 5, 31998, 31999};
    TokenDataValidationError err;
    // Poison the error struct so uninitialized-read bugs would surface;
    // NOTE(review): the test never asserts err is untouched on success —
    // presumably the contract leaves err unspecified here; confirm.
    memset(&err, 0xA5, sizeof(err));
    TokenDataValidationCode code = token_data_validate(tokens, 8, 4, 32000, &err);
    CHECK_EQ_INT(stats, code, TOKEN_DATA_VALID, "valid dataset should pass");
    stats->passed++;
}
| 143 | + |
| 144 | +static void test_validate_vocab_boundary(TestStats *stats) { |
| 145 | + uint16_t valid_tokens[3] = {0, 0, 0}; |
| 146 | + TokenDataValidationError err = {0}; |
| 147 | + TokenDataValidationCode valid_code = token_data_validate(valid_tokens, 3, 2, 1, &err); |
| 148 | + CHECK_EQ_INT(stats, valid_code, TOKEN_DATA_VALID, "token 0 should be valid when vocab=1"); |
| 149 | + |
| 150 | + uint16_t invalid_tokens[3] = {0, 1, 0}; |
| 151 | + TokenDataValidationCode invalid_code = token_data_validate(invalid_tokens, 3, 2, 1, &err); |
| 152 | + CHECK_EQ_INT(stats, invalid_code, TOKEN_DATA_ERR_OOB_TOKEN, "token >= vocab should fail at vocab boundary"); |
| 153 | + CHECK_EQ_SIZE(stats, err.bad_index, 1, "boundary OOB should report exact index"); |
| 154 | + CHECK_EQ_INT(stats, err.bad_token, 1, "boundary OOB should report offending token"); |
| 155 | + stats->passed++; |
| 156 | +} |
| 157 | + |
| 158 | +static void test_find_oob_empty(TestStats *stats) { |
| 159 | + size_t bad_index = 123; |
| 160 | + uint16_t bad_token = 456; |
| 161 | + CHECK_TRUE(stats, !token_data_find_oob_token(NULL, 0, 32000, &bad_index, &bad_token), |
| 162 | + "empty dataset should not report OOB token"); |
| 163 | + CHECK_EQ_SIZE(stats, bad_index, 123, "bad index should remain unchanged for empty input"); |
| 164 | + CHECK_EQ_INT(stats, bad_token, 456, "bad token should remain unchanged for empty input"); |
| 165 | + stats->passed++; |
| 166 | +} |
| 167 | + |
| 168 | +static void test_find_oob_null_outputs(TestStats *stats) { |
| 169 | + uint16_t tokens[4] = {0, 1, 32000, 2}; |
| 170 | + CHECK_TRUE(stats, token_data_find_oob_token(tokens, 4, 32000, NULL, NULL), |
| 171 | + "OOB scan should work with null output pointers"); |
| 172 | + stats->passed++; |
| 173 | +} |
| 174 | + |
| 175 | +static void test_find_oob_invalid_vocab(TestStats *stats) { |
| 176 | + uint16_t tokens[3] = {0, 1, 2}; |
| 177 | + CHECK_TRUE(stats, !token_data_find_oob_token(tokens, 3, 0, NULL, NULL), |
| 178 | + "OOB scan should reject non-positive vocab"); |
| 179 | + CHECK_TRUE(stats, !token_data_find_oob_token(tokens, 3, -1, NULL, NULL), |
| 180 | + "OOB scan should reject negative vocab"); |
| 181 | + stats->passed++; |
| 182 | +} |
| 183 | + |
// Property test: compare token_data_find_oob_token against a trivial inline
// reference scan over 512 pseudo-random datasets.  The LCG seed is fixed, so
// the exact sequence of inputs is deterministic and failures reproducible.
static void test_find_oob_randomized_consistency(TestStats *stats) {
    uint32_t seed = 1;
    for (int iter = 0; iter < 512; iter++) {
        int vocab = (int)(lcg_next(&seed) % 128u) + 1;      // vocab in 1..128
        size_t n_tokens = (size_t)(lcg_next(&seed) % 64u);  // 0..63, fits tokens[64]
        uint16_t tokens[64] = {0};

        // Reference answer: first token with value >= vocab, if any.
        bool expected_found = false;
        size_t expected_index = 0;
        uint16_t expected_token = 0;
        for (size_t i = 0; i < n_tokens; i++) {
            tokens[i] = (uint16_t)(lcg_next(&seed) % 256u);  // 0..255, so OOB values do occur
            if (!expected_found && (int)tokens[i] >= vocab) {
                expected_found = true;
                expected_index = i;
                expected_token = tokens[i];
            }
        }

        size_t got_index = 0;
        uint16_t got_token = 0;
        bool got_found = token_data_find_oob_token(tokens, n_tokens, vocab, &got_index, &got_token);
        CHECK_EQ_INT(stats, got_found, expected_found, "randomized OOB scan should match reference result");
        // Index/token outputs only have defined values when something was found.
        if (expected_found) {
            CHECK_EQ_SIZE(stats, got_index, expected_index, "randomized OOB index should match reference");
            CHECK_EQ_INT(stats, got_token, expected_token, "randomized OOB token should match reference");
        }
    }
    stats->passed++;
}
| 214 | + |
| 215 | +int main(void) { |
| 216 | + TestStats stats = {0, 0}; |
| 217 | + |
| 218 | + test_bytes_to_token_count_even(&stats); |
| 219 | + test_bytes_to_token_count_odd(&stats); |
| 220 | + test_bytes_to_token_count_null_outputs(&stats); |
| 221 | + test_min_tokens_boundary(&stats); |
| 222 | + test_min_tokens_short(&stats); |
| 223 | + test_min_tokens_negative_seq(&stats); |
| 224 | + test_validate_too_short(&stats); |
| 225 | + test_validate_too_short_precedes_oob(&stats); |
| 226 | + test_validate_too_short_with_null_err(&stats); |
| 227 | + test_validate_oob_first(&stats); |
| 228 | + test_validate_oob_middle(&stats); |
| 229 | + test_validate_oob_last(&stats); |
| 230 | + test_validate_ok(&stats); |
| 231 | + test_validate_vocab_boundary(&stats); |
| 232 | + test_find_oob_empty(&stats); |
| 233 | + test_find_oob_null_outputs(&stats); |
| 234 | + test_find_oob_invalid_vocab(&stats); |
| 235 | + test_find_oob_randomized_consistency(&stats); |
| 236 | + |
| 237 | + printf("test_data_validation: %d passed, %d failed\n", stats.passed, stats.failed); |
| 238 | + return stats.failed == 0 ? 0 : 1; |
| 239 | +} |