Skip to content

Commit fd86f99

Browse files
committed
[fix] Harden token-data ingestion with validation (upstream PR maderix#30)
2 parents 21da532 + 60b0512 commit fd86f99

7 files changed

Lines changed: 482 additions & 100 deletions

File tree

training/Makefile

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,16 @@
11
CC = xcrun clang
2+
CC_C = xcrun clang
23

34
ANE_COMPAT = -Wno-deprecated-declarations
45
SEC_FLAGS = -fstack-protector-strong -Wformat-security
56

67
CFLAGS = -O2 -Wall $(ANE_COMPAT) -fobjc-arc $(SEC_FLAGS)
8+
CFLAGS_C = -O2 -Wall -Wextra -Werror -std=c11
79
CFLAGS_DEBUG = -O0 -g -Wall $(ANE_COMPAT) -fobjc-arc -fsanitize=address,undefined
810
FRAMEWORKS = -framework Foundation -framework CoreML -framework IOSurface
911
LDFLAGS = $(FRAMEWORKS) -ldl
1012

11-
HEADERS_LARGE = stories_config.h stories_io.h stories_mil.h stories_cpu_ops.h
13+
HEADERS_LARGE = stories_config.h stories_io.h stories_mil.h stories_cpu_ops.h data_validation.h
1214

1315
HEADERS_ANE = $(HEADERS_LARGE) ane_rmsnorm_bwd.h ane_classifier.h
1416

@@ -52,8 +54,13 @@ test_ane_advanced: test_ane_advanced.m
5254
test_chaining: test_chaining.m
5355
$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS)
5456

57+
test_data_validation: test_data_validation.c data_validation.h
58+
$(CC_C) $(CFLAGS_C) -o $@ $<
59+
5560
probes: $(PROBES)
5661

62+
security-tests: test_data_validation
63+
5764
data: tokenize
5865
@bash download_data.sh
5966

@@ -73,7 +80,6 @@ verify-flags:
7380
@xcrun clang --version
7481

7582
clean:
76-
rm -f train train_large train_large_ane train_opt train_double_buffer $(PROBES) test_rmsnorm_bwd test_classifier
77-
78-
.PHONY: clean tokenize probes verify-flags data setup
83+
rm -f train train_large train_large_ane train_opt train_double_buffer $(PROBES) test_rmsnorm_bwd test_classifier test_data_validation test_chaining
7984

85+
.PHONY: clean tokenize probes security-tests verify-flags data setup

training/README.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,12 @@ Weights passed via IOSurface spatial dimension — compile 9 kernels once at sta
7878
bash download_data.sh
7979
```
8080

81-
Downloads pretokenized TinyStories (Llama 2 BPE, 32K vocab) from HuggingFace. Produces `tinystories_data00.bin` (~41 MB, ~20M tokens).
81+
Downloads pretokenized TinyStories (Llama 2 BPE, 32K vocab) from HuggingFace. Produces `tinystories_data00.bin` (~41 MB, ~20M tokens).
82+
83+
All training pipelines perform token-data validation at startup:
84+
- token file byte length must align to 16-bit token boundaries
85+
- token file must contain at least `SEQ+1` tokens
86+
- every token id must be within `[0, vocab_size)`
8287

8388
### 2. Build & Train
8489

training/data_validation.h

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
// data_validation.h — Shared token-data validation helpers
#pragma once
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

// Result codes returned by token_data_validate().
typedef enum {
    TOKEN_DATA_VALID = 0,
    TOKEN_DATA_ERR_TOO_SHORT = 1,
    TOKEN_DATA_ERR_OOB_TOKEN = 2
} TokenDataValidationCode;

// Failure details filled in by token_data_validate().
typedef struct {
    size_t required_tokens;  // minimum token count needed (seq + 1)
    size_t bad_index;        // position of the first out-of-range token
    uint16_t bad_token;      // value of that token
} TokenDataValidationError;

// Token files are 16-bit ids. Return false when byte length is misaligned.
// n_tokens/extra_bytes are optional out-params: floor token count and the
// leftover byte count are reported even on misalignment.
static inline bool token_data_bytes_to_token_count(size_t n_bytes, size_t *n_tokens, size_t *extra_bytes) {
    const size_t leftover = n_bytes % sizeof(uint16_t);
    if (n_tokens != NULL) {
        *n_tokens = n_bytes / sizeof(uint16_t);
    }
    if (extra_bytes != NULL) {
        *extra_bytes = leftover;
    }
    return leftover == 0;
}

// True when n_tokens covers at least seq+1 tokens (one window plus a target).
// A negative seq fails immediately and leaves *required_tokens untouched.
static inline bool token_data_has_min_tokens(size_t n_tokens, int seq, size_t *required_tokens) {
    if (seq < 0) {
        return false;
    }
    const size_t minimum = (size_t)seq + 1;
    if (required_tokens != NULL) {
        *required_tokens = minimum;
    }
    return n_tokens >= minimum;
}

// Scan for the first token id >= vocab. Returns true and reports index/value
// through the optional out-params when one is found; false when the data is
// clean, empty, NULL, or vocab is non-positive.
static inline bool token_data_find_oob_token(const uint16_t *token_data, size_t n_tokens, int vocab,
                                             size_t *bad_index, uint16_t *bad_token) {
    if (token_data == NULL || n_tokens == 0 || vocab <= 0) {
        return false;
    }
    size_t i = 0;
    while (i < n_tokens) {
        if ((int)token_data[i] >= vocab) {
            if (bad_index != NULL) {
                *bad_index = i;
            }
            if (bad_token != NULL) {
                *bad_token = token_data[i];
            }
            return true;
        }
        i++;
    }
    return false;
}

// Full validation: length check first, then a per-token range scan.
// err (when non-NULL) is zeroed up front and only the relevant fields are
// populated on failure.
// NOTE(review): NULL token_data with a sufficiently large n_tokens passes the
// length check and skips the scan, reporting TOKEN_DATA_VALID — callers are
// expected to supply a real buffer; confirm against call sites.
static inline TokenDataValidationCode token_data_validate(const uint16_t *token_data, size_t n_tokens,
                                                          int seq, int vocab,
                                                          TokenDataValidationError *err) {
    if (err != NULL) {
        err->required_tokens = 0;
        err->bad_index = 0;
        err->bad_token = 0;
    }

    size_t minimum = 0;
    if (!token_data_has_min_tokens(n_tokens, seq, &minimum)) {
        if (err != NULL) {
            err->required_tokens = minimum;
        }
        return TOKEN_DATA_ERR_TOO_SHORT;
    }

    size_t oob_at = 0;
    uint16_t oob_val = 0;
    if (token_data_find_oob_token(token_data, n_tokens, vocab, &oob_at, &oob_val)) {
        if (err != NULL) {
            err->bad_index = oob_at;
            err->bad_token = oob_val;
        }
        return TOKEN_DATA_ERR_OOB_TOKEN;
    }

    return TOKEN_DATA_VALID;
}

training/test_data_validation.c

Lines changed: 239 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,239 @@
1+
// test_data_validation.c — Unit tests for token-data hardening helpers
2+
#include <stdbool.h>
3+
#include <stdint.h>
4+
#include <stdio.h>
5+
#include <string.h>
6+
7+
#include "data_validation.h"
8+
9+
// Per-run pass/fail counters shared by every test case.
typedef struct {
    int passed;
    int failed;
} TestStats;

// Record a failure and bail out of the current test function when cond is false.
#define CHECK_TRUE(stats, cond, msg)                                          \
    do {                                                                      \
        if (!(cond)) {                                                        \
            fprintf(stderr, "FAIL: %s (%s:%d)\n", msg, __FILE__, __LINE__);   \
            (stats)->failed++;                                                \
            return;                                                           \
        }                                                                     \
    } while (0)

// Equality checks are thin aliases over CHECK_TRUE.
#define CHECK_EQ_INT(stats, got, want, msg) CHECK_TRUE((stats), (got) == (want), msg)
#define CHECK_EQ_SIZE(stats, got, want, msg) CHECK_TRUE((stats), (got) == (want), msg)

// Deterministic 32-bit linear congruential generator (Numerical Recipes
// constants) for reproducible pseudo-random test inputs.
static uint32_t lcg_next(uint32_t *state) {
    const uint32_t next = (*state * 1664525u) + 1013904223u;
    *state = next;
    return next;
}
30+
31+
// Alignment helper: even byte counts map cleanly onto 16-bit token ids.
static void test_bytes_to_token_count_even(TestStats *stats) {
    size_t count = 0;
    size_t leftover = 99;
    CHECK_TRUE(stats, token_data_bytes_to_token_count(1024, &count, &leftover),
               "even byte length should map to token count");
    CHECK_EQ_SIZE(stats, count, 512, "1024 bytes should map to 512 tokens");
    CHECK_EQ_SIZE(stats, leftover, 0, "even byte length should have zero remainder");
    stats->passed++;
}

// Alignment helper: an odd byte count fails but still reports floor/remainder.
static void test_bytes_to_token_count_odd(TestStats *stats) {
    size_t count = 0;
    size_t leftover = 0;
    CHECK_TRUE(stats, !token_data_bytes_to_token_count(1025, &count, &leftover),
               "odd byte length should fail alignment check");
    CHECK_EQ_SIZE(stats, count, 512, "odd byte length should still report floor token count");
    CHECK_EQ_SIZE(stats, leftover, 1, "1025 bytes should report one extra byte");
    stats->passed++;
}

// Alignment helper: both out-params are optional.
static void test_bytes_to_token_count_null_outputs(TestStats *stats) {
    CHECK_TRUE(stats, token_data_bytes_to_token_count(8, NULL, NULL),
               "alignment helper should work with null output pointers");
    CHECK_TRUE(stats, !token_data_bytes_to_token_count(9, NULL, NULL),
               "alignment helper should fail odd byte length with null outputs");
    stats->passed++;
}

// Min-token check: exactly seq+1 tokens is enough.
static void test_min_tokens_boundary(TestStats *stats) {
    size_t needed = 0;
    CHECK_TRUE(stats, token_data_has_min_tokens(257, 256, &needed), "257 tokens should satisfy seq=256");
    CHECK_EQ_SIZE(stats, needed, 257, "required tokens should be seq+1");
    stats->passed++;
}

// Min-token check: seq tokens alone is one short.
static void test_min_tokens_short(TestStats *stats) {
    size_t needed = 0;
    CHECK_TRUE(stats, !token_data_has_min_tokens(256, 256, &needed), "256 tokens should fail seq=256");
    CHECK_EQ_SIZE(stats, needed, 257, "required tokens should still be seq+1");
    stats->passed++;
}

// Min-token check: a negative seq is rejected and the out-param is untouched.
static void test_min_tokens_negative_seq(TestStats *stats) {
    size_t needed = 777;
    CHECK_TRUE(stats, !token_data_has_min_tokens(10, -1, &needed), "negative seq should fail min-token check");
    CHECK_EQ_SIZE(stats, needed, 777, "required token out param should remain unchanged for invalid seq");
    stats->passed++;
}
79+
80+
// Validation: datasets shorter than seq+1 tokens are rejected.
static void test_validate_too_short(TestStats *stats) {
    uint16_t ids[2] = {1, 2};
    TokenDataValidationError detail = {0};
    TokenDataValidationCode rc = token_data_validate(ids, 2, 4, 32000, &detail);
    CHECK_EQ_INT(stats, rc, TOKEN_DATA_ERR_TOO_SHORT, "too-short dataset should fail");
    CHECK_EQ_SIZE(stats, detail.required_tokens, 5, "required token count should be reported");
    stats->passed++;
}

// Validation: the length check fires before the OOB scan.
static void test_validate_too_short_precedes_oob(TestStats *stats) {
    uint16_t ids[2] = {65000, 1};
    TokenDataValidationError detail = {0};
    TokenDataValidationCode rc = token_data_validate(ids, 2, 4, 32000, &detail);
    CHECK_EQ_INT(stats, rc, TOKEN_DATA_ERR_TOO_SHORT, "too-short check should happen before OOB check");
    CHECK_EQ_SIZE(stats, detail.required_tokens, 5, "required token count should still be reported");
    stats->passed++;
}

// Validation: the error out-param is optional.
static void test_validate_too_short_with_null_err(TestStats *stats) {
    uint16_t ids[2] = {1, 2};
    TokenDataValidationCode rc = token_data_validate(ids, 2, 4, 32000, NULL);
    CHECK_EQ_INT(stats, rc, TOKEN_DATA_ERR_TOO_SHORT, "validation should work when err output is null");
    stats->passed++;
}

// Validation: an out-of-range id in position 0 is caught and reported.
static void test_validate_oob_first(TestStats *stats) {
    uint16_t ids[6] = {32000, 1, 2, 3, 4, 5};
    TokenDataValidationError detail = {0};
    TokenDataValidationCode rc = token_data_validate(ids, 6, 4, 32000, &detail);
    CHECK_EQ_INT(stats, rc, TOKEN_DATA_ERR_OOB_TOKEN, "first token OOB should fail");
    CHECK_EQ_SIZE(stats, detail.bad_index, 0, "bad index should point to first token");
    CHECK_EQ_INT(stats, detail.bad_token, 32000, "bad token value should be reported");
    stats->passed++;
}

// Validation: an out-of-range id mid-stream is caught at its exact index.
static void test_validate_oob_middle(TestStats *stats) {
    uint16_t ids[7] = {1, 2, 3, 65535, 4, 5, 6};
    TokenDataValidationError detail = {0};
    TokenDataValidationCode rc = token_data_validate(ids, 7, 4, 32000, &detail);
    CHECK_EQ_INT(stats, rc, TOKEN_DATA_ERR_OOB_TOKEN, "middle token OOB should fail");
    CHECK_EQ_SIZE(stats, detail.bad_index, 3, "bad index should point to middle token");
    CHECK_EQ_INT(stats, detail.bad_token, 65535, "bad token value should be reported");
    stats->passed++;
}

// Validation: an out-of-range id in the final position is still caught.
static void test_validate_oob_last(TestStats *stats) {
    uint16_t ids[6] = {1, 2, 3, 4, 5, 40000};
    TokenDataValidationError detail = {0};
    TokenDataValidationCode rc = token_data_validate(ids, 6, 4, 32000, &detail);
    CHECK_EQ_INT(stats, rc, TOKEN_DATA_ERR_OOB_TOKEN, "last token OOB should fail");
    CHECK_EQ_SIZE(stats, detail.bad_index, 5, "bad index should point to last token");
    CHECK_EQ_INT(stats, detail.bad_token, 40000, "bad token value should be reported");
    stats->passed++;
}

// Validation: a clean dataset passes even with a poisoned err struct.
static void test_validate_ok(TestStats *stats) {
    uint16_t ids[8] = {0, 1, 2, 3, 4, 5, 31998, 31999};
    TokenDataValidationError detail;
    memset(&detail, 0xA5, sizeof(detail));
    TokenDataValidationCode rc = token_data_validate(ids, 8, 4, 32000, &detail);
    CHECK_EQ_INT(stats, rc, TOKEN_DATA_VALID, "valid dataset should pass");
    stats->passed++;
}

// Validation: ids are valid strictly below vocab; id == vocab fails.
static void test_validate_vocab_boundary(TestStats *stats) {
    uint16_t ok_ids[3] = {0, 0, 0};
    TokenDataValidationError detail = {0};
    TokenDataValidationCode ok_rc = token_data_validate(ok_ids, 3, 2, 1, &detail);
    CHECK_EQ_INT(stats, ok_rc, TOKEN_DATA_VALID, "token 0 should be valid when vocab=1");

    uint16_t bad_ids[3] = {0, 1, 0};
    TokenDataValidationCode bad_rc = token_data_validate(bad_ids, 3, 2, 1, &detail);
    CHECK_EQ_INT(stats, bad_rc, TOKEN_DATA_ERR_OOB_TOKEN, "token >= vocab should fail at vocab boundary");
    CHECK_EQ_SIZE(stats, detail.bad_index, 1, "boundary OOB should report exact index");
    CHECK_EQ_INT(stats, detail.bad_token, 1, "boundary OOB should report offending token");
    stats->passed++;
}

// OOB scan: empty input reports nothing and leaves out-params untouched.
static void test_find_oob_empty(TestStats *stats) {
    size_t where = 123;
    uint16_t what = 456;
    CHECK_TRUE(stats, !token_data_find_oob_token(NULL, 0, 32000, &where, &what),
               "empty dataset should not report OOB token");
    CHECK_EQ_SIZE(stats, where, 123, "bad index should remain unchanged for empty input");
    CHECK_EQ_INT(stats, what, 456, "bad token should remain unchanged for empty input");
    stats->passed++;
}

// OOB scan: both out-params are optional.
static void test_find_oob_null_outputs(TestStats *stats) {
    uint16_t ids[4] = {0, 1, 32000, 2};
    CHECK_TRUE(stats, token_data_find_oob_token(ids, 4, 32000, NULL, NULL),
               "OOB scan should work with null output pointers");
    stats->passed++;
}

// OOB scan: a non-positive vocab is rejected outright.
static void test_find_oob_invalid_vocab(TestStats *stats) {
    uint16_t ids[3] = {0, 1, 2};
    CHECK_TRUE(stats, !token_data_find_oob_token(ids, 3, 0, NULL, NULL),
               "OOB scan should reject non-positive vocab");
    CHECK_TRUE(stats, !token_data_find_oob_token(ids, 3, -1, NULL, NULL),
               "OOB scan should reject negative vocab");
    stats->passed++;
}

// OOB scan: 512 randomized inputs checked against an inline reference scan.
static void test_find_oob_randomized_consistency(TestStats *stats) {
    uint32_t rng = 1;
    for (int round = 0; round < 512; round++) {
        // Draw parameters in the same order every run: vocab, length, tokens.
        int vocab = (int)(lcg_next(&rng) % 128u) + 1;
        size_t count = (size_t)(lcg_next(&rng) % 64u);
        uint16_t ids[64] = {0};

        bool want_found = false;
        size_t want_index = 0;
        uint16_t want_token = 0;
        for (size_t pos = 0; pos < count; pos++) {
            ids[pos] = (uint16_t)(lcg_next(&rng) % 256u);
            if (!want_found && (int)ids[pos] >= vocab) {
                want_found = true;
                want_index = pos;
                want_token = ids[pos];
            }
        }

        size_t have_index = 0;
        uint16_t have_token = 0;
        bool have_found = token_data_find_oob_token(ids, count, vocab, &have_index, &have_token);
        CHECK_EQ_INT(stats, have_found, want_found, "randomized OOB scan should match reference result");
        if (want_found) {
            CHECK_EQ_SIZE(stats, have_index, want_index, "randomized OOB index should match reference");
            CHECK_EQ_INT(stats, have_token, want_token, "randomized OOB token should match reference");
        }
    }
    stats->passed++;
}
214+
215+
int main(void) {
216+
TestStats stats = {0, 0};
217+
218+
test_bytes_to_token_count_even(&stats);
219+
test_bytes_to_token_count_odd(&stats);
220+
test_bytes_to_token_count_null_outputs(&stats);
221+
test_min_tokens_boundary(&stats);
222+
test_min_tokens_short(&stats);
223+
test_min_tokens_negative_seq(&stats);
224+
test_validate_too_short(&stats);
225+
test_validate_too_short_precedes_oob(&stats);
226+
test_validate_too_short_with_null_err(&stats);
227+
test_validate_oob_first(&stats);
228+
test_validate_oob_middle(&stats);
229+
test_validate_oob_last(&stats);
230+
test_validate_ok(&stats);
231+
test_validate_vocab_boundary(&stats);
232+
test_find_oob_empty(&stats);
233+
test_find_oob_null_outputs(&stats);
234+
test_find_oob_invalid_vocab(&stats);
235+
test_find_oob_randomized_consistency(&stats);
236+
237+
printf("test_data_validation: %d passed, %d failed\n", stats.passed, stats.failed);
238+
return stats.failed == 0 ? 0 : 1;
239+
}

0 commit comments

Comments
 (0)