@@ -23,6 +23,39 @@ typedef struct {
/*
 * Equality checks layered on CHECK_TRUE: each forwards `stats` and `msg`
 * unchanged and passes (got) == (want) as the condition. The two macros have
 * identical expansions and differ only in name.
 *
 * The parameter list must follow the macro name with no intervening space;
 * with a space the preprocessor would define an object-like macro whose
 * replacement text begins with the parenthesized list.
 */
#define CHECK_EQ_INT(stats, got, want, msg) CHECK_TRUE((stats), (got) == (want), msg)
#define CHECK_EQ_SIZE(stats, got, want, msg) CHECK_TRUE((stats), (got) == (want), msg)
2525
/*
 * Advances a linear congruential generator by one step.
 *
 * The new state is state * 1664525 + 1013904223, computed in unsigned
 * arithmetic and stored as uint32_t, so it wraps modulo 2^32.
 *
 * state: in/out generator state; overwritten with the new value.
 * Returns the new state.
 */
static uint32_t lcg_next(uint32_t *state) {
    const uint32_t multiplier = 1664525u;
    const uint32_t increment = 1013904223u;
    const uint32_t advanced = (uint32_t)((*state * multiplier) + increment);

    *state = advanced;
    return advanced;
}
30+
31+ static void test_bytes_to_token_count_even (TestStats * stats ) {
32+ size_t n_tokens = 0 ;
33+ size_t extra = 99 ;
34+ CHECK_TRUE (stats , token_data_bytes_to_token_count (1024 , & n_tokens , & extra ),
35+ "even byte length should map to token count" );
36+ CHECK_EQ_SIZE (stats , n_tokens , 512 , "1024 bytes should map to 512 tokens" );
37+ CHECK_EQ_SIZE (stats , extra , 0 , "even byte length should have zero remainder" );
38+ stats -> passed ++ ;
39+ }
40+
41+ static void test_bytes_to_token_count_odd (TestStats * stats ) {
42+ size_t n_tokens = 0 ;
43+ size_t extra = 0 ;
44+ CHECK_TRUE (stats , !token_data_bytes_to_token_count (1025 , & n_tokens , & extra ),
45+ "odd byte length should fail alignment check" );
46+ CHECK_EQ_SIZE (stats , n_tokens , 512 , "odd byte length should still report floor token count" );
47+ CHECK_EQ_SIZE (stats , extra , 1 , "1025 bytes should report one extra byte" );
48+ stats -> passed ++ ;
49+ }
50+
51+ static void test_bytes_to_token_count_null_outputs (TestStats * stats ) {
52+ CHECK_TRUE (stats , token_data_bytes_to_token_count (8 , NULL , NULL ),
53+ "alignment helper should work with null output pointers" );
54+ CHECK_TRUE (stats , !token_data_bytes_to_token_count (9 , NULL , NULL ),
55+ "alignment helper should fail odd byte length with null outputs" );
56+ stats -> passed ++ ;
57+ }
58+
2659static void test_min_tokens_boundary (TestStats * stats ) {
2760 size_t required = 0 ;
2861 CHECK_TRUE (stats , token_data_has_min_tokens (257 , 256 , & required ), "257 tokens should satisfy seq=256" );
@@ -37,6 +70,13 @@ static void test_min_tokens_short(TestStats *stats) {
3770 stats -> passed ++ ;
3871}
3972
73+ static void test_min_tokens_negative_seq (TestStats * stats ) {
74+ size_t required = 777 ;
75+ CHECK_TRUE (stats , !token_data_has_min_tokens (10 , -1 , & required ), "negative seq should fail min-token check" );
76+ CHECK_EQ_SIZE (stats , required , 777 , "required token out param should remain unchanged for invalid seq" );
77+ stats -> passed ++ ;
78+ }
79+
4080static void test_validate_too_short (TestStats * stats ) {
4181 uint16_t tokens [2 ] = {1 , 2 };
4282 TokenDataValidationError err = {0 };
@@ -46,6 +86,22 @@ static void test_validate_too_short(TestStats *stats) {
4686 stats -> passed ++ ;
4787}
4888
89+ static void test_validate_too_short_precedes_oob (TestStats * stats ) {
90+ uint16_t tokens [2 ] = {65000 , 1 };
91+ TokenDataValidationError err = {0 };
92+ TokenDataValidationCode code = token_data_validate (tokens , 2 , 4 , 32000 , & err );
93+ CHECK_EQ_INT (stats , code , TOKEN_DATA_ERR_TOO_SHORT , "too-short check should happen before OOB check" );
94+ CHECK_EQ_SIZE (stats , err .required_tokens , 5 , "required token count should still be reported" );
95+ stats -> passed ++ ;
96+ }
97+
98+ static void test_validate_too_short_with_null_err (TestStats * stats ) {
99+ uint16_t tokens [2 ] = {1 , 2 };
100+ TokenDataValidationCode code = token_data_validate (tokens , 2 , 4 , 32000 , NULL );
101+ CHECK_EQ_INT (stats , code , TOKEN_DATA_ERR_TOO_SHORT , "validation should work when err output is null" );
102+ stats -> passed ++ ;
103+ }
104+
49105static void test_validate_oob_first (TestStats * stats ) {
50106 uint16_t tokens [6 ] = {32000 , 1 , 2 , 3 , 4 , 5 };
51107 TokenDataValidationError err = {0 };
@@ -85,6 +141,20 @@ static void test_validate_ok(TestStats *stats) {
85141 stats -> passed ++ ;
86142}
87143
144+ static void test_validate_vocab_boundary (TestStats * stats ) {
145+ uint16_t valid_tokens [3 ] = {0 , 0 , 0 };
146+ TokenDataValidationError err = {0 };
147+ TokenDataValidationCode valid_code = token_data_validate (valid_tokens , 3 , 2 , 1 , & err );
148+ CHECK_EQ_INT (stats , valid_code , TOKEN_DATA_VALID , "token 0 should be valid when vocab=1" );
149+
150+ uint16_t invalid_tokens [3 ] = {0 , 1 , 0 };
151+ TokenDataValidationCode invalid_code = token_data_validate (invalid_tokens , 3 , 2 , 1 , & err );
152+ CHECK_EQ_INT (stats , invalid_code , TOKEN_DATA_ERR_OOB_TOKEN , "token >= vocab should fail at vocab boundary" );
153+ CHECK_EQ_SIZE (stats , err .bad_index , 1 , "boundary OOB should report exact index" );
154+ CHECK_EQ_INT (stats , err .bad_token , 1 , "boundary OOB should report offending token" );
155+ stats -> passed ++ ;
156+ }
157+
88158static void test_find_oob_empty (TestStats * stats ) {
89159 size_t bad_index = 123 ;
90160 uint16_t bad_token = 456 ;
@@ -95,17 +165,74 @@ static void test_find_oob_empty(TestStats *stats) {
95165 stats -> passed ++ ;
96166}
97167
168+ static void test_find_oob_null_outputs (TestStats * stats ) {
169+ uint16_t tokens [4 ] = {0 , 1 , 32000 , 2 };
170+ CHECK_TRUE (stats , token_data_find_oob_token (tokens , 4 , 32000 , NULL , NULL ),
171+ "OOB scan should work with null output pointers" );
172+ stats -> passed ++ ;
173+ }
174+
175+ static void test_find_oob_invalid_vocab (TestStats * stats ) {
176+ uint16_t tokens [3 ] = {0 , 1 , 2 };
177+ CHECK_TRUE (stats , !token_data_find_oob_token (tokens , 3 , 0 , NULL , NULL ),
178+ "OOB scan should reject non-positive vocab" );
179+ CHECK_TRUE (stats , !token_data_find_oob_token (tokens , 3 , -1 , NULL , NULL ),
180+ "OOB scan should reject negative vocab" );
181+ stats -> passed ++ ;
182+ }
183+
184+ static void test_find_oob_randomized_consistency (TestStats * stats ) {
185+ uint32_t seed = 1 ;
186+ for (int iter = 0 ; iter < 512 ; iter ++ ) {
187+ int vocab = (int )(lcg_next (& seed ) % 128u ) + 1 ;
188+ size_t n_tokens = (size_t )(lcg_next (& seed ) % 64u );
189+ uint16_t tokens [64 ] = {0 };
190+
191+ bool expected_found = false;
192+ size_t expected_index = 0 ;
193+ uint16_t expected_token = 0 ;
194+ for (size_t i = 0 ; i < n_tokens ; i ++ ) {
195+ tokens [i ] = (uint16_t )(lcg_next (& seed ) % 256u );
196+ if (!expected_found && (int )tokens [i ] >= vocab ) {
197+ expected_found = true;
198+ expected_index = i ;
199+ expected_token = tokens [i ];
200+ }
201+ }
202+
203+ size_t got_index = 0 ;
204+ uint16_t got_token = 0 ;
205+ bool got_found = token_data_find_oob_token (tokens , n_tokens , vocab , & got_index , & got_token );
206+ CHECK_EQ_INT (stats , got_found , expected_found , "randomized OOB scan should match reference result" );
207+ if (expected_found ) {
208+ CHECK_EQ_SIZE (stats , got_index , expected_index , "randomized OOB index should match reference" );
209+ CHECK_EQ_INT (stats , got_token , expected_token , "randomized OOB token should match reference" );
210+ }
211+ }
212+ stats -> passed ++ ;
213+ }
214+
98215int main (void ) {
99216 TestStats stats = {0 , 0 };
100217
218+ test_bytes_to_token_count_even (& stats );
219+ test_bytes_to_token_count_odd (& stats );
220+ test_bytes_to_token_count_null_outputs (& stats );
101221 test_min_tokens_boundary (& stats );
102222 test_min_tokens_short (& stats );
223+ test_min_tokens_negative_seq (& stats );
103224 test_validate_too_short (& stats );
225+ test_validate_too_short_precedes_oob (& stats );
226+ test_validate_too_short_with_null_err (& stats );
104227 test_validate_oob_first (& stats );
105228 test_validate_oob_middle (& stats );
106229 test_validate_oob_last (& stats );
107230 test_validate_ok (& stats );
231+ test_validate_vocab_boundary (& stats );
108232 test_find_oob_empty (& stats );
233+ test_find_oob_null_outputs (& stats );
234+ test_find_oob_invalid_vocab (& stats );
235+ test_find_oob_randomized_consistency (& stats );
109236
110237 printf ("test_data_validation: %d passed, %d failed\n" , stats .passed , stats .failed );
111238 return stats .failed == 0 ? 0 : 1 ;
0 commit comments