Skip to content

Commit 60b0512

Browse files
committed
Harden token file layout checks and prevent exec-time fd leaks
1 parent 991bf4d commit 60b0512

6 files changed

Lines changed: 218 additions & 41 deletions

File tree

training/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ bash download_data.sh
8181
Downloads pretokenized TinyStories (Llama 2 BPE, 32K vocab) from HuggingFace. Produces `tinystories_data00.bin` (~41 MB, ~20M tokens).
8282

8383
All training pipelines perform token-data validation at startup:
84+
- token file byte length must align to 16-bit token boundaries
8485
- token file must contain at least `SEQ+1` tokens
8586
- every token id must be within `[0, vocab_size)`
8687

training/data_validation.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,14 @@ typedef struct {
1616
uint16_t bad_token;
1717
} TokenDataValidationError;
1818

19+
// Token ids are stored as raw 16-bit values, so a well-formed token file has
// an even byte length. Reports the floor token count and the leftover byte
// count through the optional out-parameters; returns false on misalignment.
static inline bool token_data_bytes_to_token_count(size_t n_bytes, size_t *n_tokens, size_t *extra_bytes) {
    size_t count = n_bytes / sizeof(uint16_t);
    size_t leftover = n_bytes - count * sizeof(uint16_t);
    if (n_tokens != NULL) {
        *n_tokens = count;
    }
    if (extra_bytes != NULL) {
        *extra_bytes = leftover;
    }
    return leftover == 0;
}
26+
1927
static inline bool token_data_has_min_tokens(size_t n_tokens, int seq, size_t *required_tokens) {
2028
if (seq < 0) return false;
2129
size_t needed = (size_t)seq + 1;

training/test_data_validation.c

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,39 @@ typedef struct {
2323
#define CHECK_EQ_INT(stats, got, want, msg) CHECK_TRUE((stats), (got) == (want), msg)
2424
#define CHECK_EQ_SIZE(stats, got, want, msg) CHECK_TRUE((stats), (got) == (want), msg)
2525

26+
// Deterministic PRNG for the randomized tests: the classic Numerical Recipes
// linear congruential generator. Advances *state and returns the new value.
static uint32_t lcg_next(uint32_t *state) {
    uint32_t next = *state * 1664525u + 1013904223u;
    *state = next;
    return next;
}
30+
31+
// An even byte length is a well-formed token file: full token count, zero remainder.
static void test_bytes_to_token_count_even(TestStats *stats) {
    size_t count = 0;
    size_t leftover = 99; /* sentinel: the helper must overwrite this */
    CHECK_TRUE(stats, token_data_bytes_to_token_count(1024, &count, &leftover),
               "even byte length should map to token count");
    CHECK_EQ_SIZE(stats, count, 512, "1024 bytes should map to 512 tokens");
    CHECK_EQ_SIZE(stats, leftover, 0, "even byte length should have zero remainder");
    stats->passed++;
}
40+
41+
// An odd byte length must fail the alignment check while still reporting
// the floor token count and the single trailing byte.
static void test_bytes_to_token_count_odd(TestStats *stats) {
    size_t count = 0;
    size_t leftover = 0;
    CHECK_TRUE(stats, !token_data_bytes_to_token_count(1025, &count, &leftover),
               "odd byte length should fail alignment check");
    CHECK_EQ_SIZE(stats, count, 512, "odd byte length should still report floor token count");
    CHECK_EQ_SIZE(stats, leftover, 1, "1025 bytes should report one extra byte");
    stats->passed++;
}
50+
51+
// The helper's out-parameters are optional; NULL must be accepted for both.
static void test_bytes_to_token_count_null_outputs(TestStats *stats) {
    bool aligned = token_data_bytes_to_token_count(8, NULL, NULL);
    CHECK_TRUE(stats, aligned,
               "alignment helper should work with null output pointers");
    bool misaligned_ok = !token_data_bytes_to_token_count(9, NULL, NULL);
    CHECK_TRUE(stats, misaligned_ok,
               "alignment helper should fail odd byte length with null outputs");
    stats->passed++;
}
58+
2659
static void test_min_tokens_boundary(TestStats *stats) {
2760
size_t required = 0;
2861
CHECK_TRUE(stats, token_data_has_min_tokens(257, 256, &required), "257 tokens should satisfy seq=256");
@@ -37,6 +70,13 @@ static void test_min_tokens_short(TestStats *stats) {
3770
stats->passed++;
3871
}
3972

73+
// A negative sequence length is invalid; the check must fail and must not
// touch the caller's out-parameter.
static void test_min_tokens_negative_seq(TestStats *stats) {
    size_t required = 777; /* sentinel: must survive the failed call */
    bool rejected = !token_data_has_min_tokens(10, -1, &required);
    CHECK_TRUE(stats, rejected, "negative seq should fail min-token check");
    CHECK_EQ_SIZE(stats, required, 777, "required token out param should remain unchanged for invalid seq");
    stats->passed++;
}
79+
4080
static void test_validate_too_short(TestStats *stats) {
4181
uint16_t tokens[2] = {1, 2};
4282
TokenDataValidationError err = {0};
@@ -46,6 +86,22 @@ static void test_validate_too_short(TestStats *stats) {
4686
stats->passed++;
4787
}
4888

89+
// When the data is both too short and contains an out-of-vocab token, the
// too-short error must win: validation checks length before scanning ids.
static void test_validate_too_short_precedes_oob(TestStats *stats) {
    uint16_t data[2] = {65000, 1}; /* 65000 is also OOB for vocab 32000 */
    TokenDataValidationError err = {0};
    TokenDataValidationCode rc = token_data_validate(data, 2, 4, 32000, &err);
    CHECK_EQ_INT(stats, rc, TOKEN_DATA_ERR_TOO_SHORT, "too-short check should happen before OOB check");
    CHECK_EQ_SIZE(stats, err.required_tokens, 5, "required token count should still be reported");
    stats->passed++;
}
97+
98+
// The error out-parameter is optional; validation must not require it.
static void test_validate_too_short_with_null_err(TestStats *stats) {
    uint16_t data[2] = {1, 2};
    TokenDataValidationCode rc = token_data_validate(data, 2, 4, 32000, NULL);
    CHECK_EQ_INT(stats, rc, TOKEN_DATA_ERR_TOO_SHORT, "validation should work when err output is null");
    stats->passed++;
}
104+
49105
static void test_validate_oob_first(TestStats *stats) {
50106
uint16_t tokens[6] = {32000, 1, 2, 3, 4, 5};
51107
TokenDataValidationError err = {0};
@@ -85,6 +141,20 @@ static void test_validate_ok(TestStats *stats) {
85141
stats->passed++;
86142
}
87143

144+
// Vocab membership is the half-open range [0, vocab): with vocab=1 only
// token id 0 is legal, and id 1 must be rejected with exact position info.
static void test_validate_vocab_boundary(TestStats *stats) {
    TokenDataValidationError err = {0};

    uint16_t all_zero[3] = {0, 0, 0};
    TokenDataValidationCode ok_rc = token_data_validate(all_zero, 3, 2, 1, &err);
    CHECK_EQ_INT(stats, ok_rc, TOKEN_DATA_VALID, "token 0 should be valid when vocab=1");

    uint16_t with_one[3] = {0, 1, 0};
    TokenDataValidationCode bad_rc = token_data_validate(with_one, 3, 2, 1, &err);
    CHECK_EQ_INT(stats, bad_rc, TOKEN_DATA_ERR_OOB_TOKEN, "token >= vocab should fail at vocab boundary");
    CHECK_EQ_SIZE(stats, err.bad_index, 1, "boundary OOB should report exact index");
    CHECK_EQ_INT(stats, err.bad_token, 1, "boundary OOB should report offending token");
    stats->passed++;
}
157+
88158
static void test_find_oob_empty(TestStats *stats) {
89159
size_t bad_index = 123;
90160
uint16_t bad_token = 456;
@@ -95,17 +165,74 @@ static void test_find_oob_empty(TestStats *stats) {
95165
stats->passed++;
96166
}
97167

168+
// The OOB scanner's index/token out-parameters are optional.
static void test_find_oob_null_outputs(TestStats *stats) {
    uint16_t data[4] = {0, 1, 32000, 2}; /* 32000 == vocab, so it is OOB */
    bool found = token_data_find_oob_token(data, 4, 32000, NULL, NULL);
    CHECK_TRUE(stats, found,
               "OOB scan should work with null output pointers");
    stats->passed++;
}
174+
175+
// A vocab size of zero or less has no valid tokens to compare against; the
// scanner must refuse rather than report a bogus hit.
static void test_find_oob_invalid_vocab(TestStats *stats) {
    uint16_t data[3] = {0, 1, 2};
    bool zero_rejected = !token_data_find_oob_token(data, 3, 0, NULL, NULL);
    CHECK_TRUE(stats, zero_rejected,
               "OOB scan should reject non-positive vocab");
    bool negative_rejected = !token_data_find_oob_token(data, 3, -1, NULL, NULL);
    CHECK_TRUE(stats, negative_rejected,
               "OOB scan should reject negative vocab");
    stats->passed++;
}
183+
184+
// Fuzzes token_data_find_oob_token against a straight-line reference scan.
// The RNG stream is consumed in a fixed order (vocab, length, then one draw
// per token) so the test is fully deterministic across runs.
static void test_find_oob_randomized_consistency(TestStats *stats) {
    uint32_t rng = 1;
    for (int trial = 0; trial < 512; trial++) {
        int vocab = (int)(lcg_next(&rng) % 128u) + 1;      /* 1..128 */
        size_t len = (size_t)(lcg_next(&rng) % 64u);        /* 0..63 */
        uint16_t data[64] = {0};

        /* Reference answer: first token with id >= vocab, if any. */
        bool ref_found = false;
        size_t ref_index = 0;
        uint16_t ref_token = 0;
        for (size_t i = 0; i < len; i++) {
            data[i] = (uint16_t)(lcg_next(&rng) % 256u);
            if (!ref_found && (int)data[i] >= vocab) {
                ref_found = true;
                ref_index = i;
                ref_token = data[i];
            }
        }

        size_t got_index = 0;
        uint16_t got_token = 0;
        bool got_found = token_data_find_oob_token(data, len, vocab, &got_index, &got_token);
        CHECK_EQ_INT(stats, got_found, ref_found, "randomized OOB scan should match reference result");
        if (ref_found) {
            CHECK_EQ_SIZE(stats, got_index, ref_index, "randomized OOB index should match reference");
            CHECK_EQ_INT(stats, got_token, ref_token, "randomized OOB token should match reference");
        }
    }
    stats->passed++;
}
214+
98215
int main(void) {
99216
TestStats stats = {0, 0};
100217

218+
test_bytes_to_token_count_even(&stats);
219+
test_bytes_to_token_count_odd(&stats);
220+
test_bytes_to_token_count_null_outputs(&stats);
101221
test_min_tokens_boundary(&stats);
102222
test_min_tokens_short(&stats);
223+
test_min_tokens_negative_seq(&stats);
103224
test_validate_too_short(&stats);
225+
test_validate_too_short_precedes_oob(&stats);
226+
test_validate_too_short_with_null_err(&stats);
104227
test_validate_oob_first(&stats);
105228
test_validate_oob_middle(&stats);
106229
test_validate_oob_last(&stats);
107230
test_validate_ok(&stats);
231+
test_validate_vocab_boundary(&stats);
108232
test_find_oob_empty(&stats);
233+
test_find_oob_null_outputs(&stats);
234+
test_find_oob_invalid_vocab(&stats);
235+
test_find_oob_randomized_consistency(&stats);
109236

110237
printf("test_data_validation: %d passed, %d failed\n", stats.passed, stats.failed);
111238
return stats.failed == 0 ? 0 : 1;

training/train_large.m

Lines changed: 33 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -283,28 +283,42 @@ int main(int argc, char *argv[]) {
283283
printf("ANE FLOPs/step: %.0fM (fwd+bwd_dx+sdpa_bwd) | CPU: dW+cls (cblas)\n\n", ane_f/1e6);
284284
}
285285

286-
// mmap token data
286+
// mmap token data
287287
int data_fd = open(DATA_PATH, O_RDONLY);
288288
if (data_fd < 0) { printf("Cannot open %s\n", DATA_PATH); return 1; }
289-
struct stat st; fstat(data_fd, &st);
290-
size_t data_len = st.st_size;
289+
struct stat st;
290+
if (fstat(data_fd, &st) != 0) { perror("fstat"); close(data_fd); return 1; }
291+
size_t data_len = (size_t)st.st_size;
292+
size_t n_tokens = 0, extra_bytes = 0;
293+
if (!token_data_bytes_to_token_count(data_len, &n_tokens, &extra_bytes)) {
294+
fprintf(stderr,
295+
"Token data validation failed: file size %zu bytes has %zu extra byte(s); expected 16-bit tokens\n",
296+
data_len, extra_bytes);
297+
close(data_fd);
298+
return 1;
299+
}
300+
if (n_tokens == 0) {
301+
fprintf(stderr, "Token data validation failed: token file is empty\n");
302+
close(data_fd);
303+
return 1;
304+
}
291305
uint16_t *token_data = (uint16_t*)mmap(NULL, data_len, PROT_READ, MAP_PRIVATE, data_fd, 0);
292-
if (token_data == MAP_FAILED) { printf("mmap failed\n"); close(data_fd); return 1; }
293-
size_t n_tokens = data_len / 2;
306+
if (token_data == MAP_FAILED) { perror("mmap"); close(data_fd); return 1; }
307+
close(data_fd); // mapping remains valid; avoid fd leaks across exec() restarts
294308
printf("Token data: %zu tokens (%.1f MB)\n", n_tokens, data_len/1e6);
295309

296310
TokenDataValidationError data_err = {0};
297311
TokenDataValidationCode data_code = token_data_validate(token_data, n_tokens, SEQ, VOCAB, &data_err);
298312
if (data_code == TOKEN_DATA_ERR_TOO_SHORT) {
299313
fprintf(stderr, "Token data validation failed: need at least %zu tokens (SEQ+1), got %zu\n",
300314
data_err.required_tokens, n_tokens);
301-
munmap(token_data, data_len); close(data_fd);
315+
munmap(token_data, data_len);
302316
return 1;
303317
}
304318
if (data_code == TOKEN_DATA_ERR_OOB_TOKEN) {
305319
fprintf(stderr, "Token data validation failed: token %u at index %zu is outside vocab [0, %d)\n",
306320
data_err.bad_token, data_err.bad_index, VOCAB);
307-
munmap(token_data, data_len); close(data_fd);
321+
munmap(token_data, data_len);
308322
return 1;
309323
}
310324

@@ -695,19 +709,18 @@ int main(int argc, char *argv[]) {
695709
printf("ANE utilization: %.1f%% of 15.8 TFLOPS\n", 100*ane_flops/(total_train_ms*1e9)/15.8);
696710

697711
// Cleanup
698-
for (int L=0; L<NLAYERS; L++) {
699-
free_layer_kernels(&kern[L]);
700-
free_kern(sdpaBwd2[L]);
701-
layer_weights_free(&lw[L]);
702-
layer_adam_free(&la[L]);
703-
layer_acts_free(&acts[L]);
704-
layer_grads_free(&grads[L]);
705-
}
706-
munmap(token_data, data_len);
707-
close(data_fd);
708-
free(rms_final); free(embed); free(grms_final); free(gembed);
709-
adam_free(&arms_final); adam_free(&aembed);
710-
free(dy); free(dffn); free(dh1); free(dh3); free(dx_ffn); free(dx2);
712+
for (int L=0; L<NLAYERS; L++) {
713+
free_layer_kernels(&kern[L]);
714+
free_kern(sdpaBwd2[L]);
715+
layer_weights_free(&lw[L]);
716+
layer_adam_free(&la[L]);
717+
layer_acts_free(&acts[L]);
718+
layer_grads_free(&grads[L]);
719+
}
720+
munmap(token_data, data_len);
721+
free(rms_final); free(embed); free(grms_final); free(gembed);
722+
adam_free(&arms_final); adam_free(&aembed);
723+
free(dy); free(dffn); free(dh1); free(dh3); free(dx_ffn); free(dx2);
711724
free(do_out_buf); free(dq); free(dk); free(dv); free(dx_attn);
712725
free(x_cur); free(x_final); free(logits); free(dlogits);
713726
}

training/train_large_ane.m

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -274,25 +274,39 @@ int main(int argc, char *argv[]) {
274274
// mmap token data
275275
int data_fd = open(DATA_PATH, O_RDONLY);
276276
if (data_fd < 0) { printf("Cannot open %s\n", DATA_PATH); return 1; }
277-
struct stat st; fstat(data_fd, &st);
278-
size_t data_len = st.st_size;
277+
struct stat st;
278+
if (fstat(data_fd, &st) != 0) { perror("fstat"); close(data_fd); return 1; }
279+
size_t data_len = (size_t)st.st_size;
280+
size_t n_tokens = 0, extra_bytes = 0;
281+
if (!token_data_bytes_to_token_count(data_len, &n_tokens, &extra_bytes)) {
282+
fprintf(stderr,
283+
"Token data validation failed: file size %zu bytes has %zu extra byte(s); expected 16-bit tokens\n",
284+
data_len, extra_bytes);
285+
close(data_fd);
286+
return 1;
287+
}
288+
if (n_tokens == 0) {
289+
fprintf(stderr, "Token data validation failed: token file is empty\n");
290+
close(data_fd);
291+
return 1;
292+
}
279293
uint16_t *token_data = (uint16_t*)mmap(NULL, data_len, PROT_READ, MAP_PRIVATE, data_fd, 0);
280-
if (token_data == MAP_FAILED) { printf("mmap failed\n"); close(data_fd); return 1; }
281-
size_t n_tokens = data_len / 2;
294+
if (token_data == MAP_FAILED) { perror("mmap"); close(data_fd); return 1; }
295+
close(data_fd); // mapping remains valid; avoid fd leaks across exec() restarts
282296
printf("Token data: %zu tokens (%.1f MB)\n", n_tokens, data_len/1e6);
283297

284298
TokenDataValidationError data_err = {0};
285299
TokenDataValidationCode data_code = token_data_validate(token_data, n_tokens, SEQ, VOCAB, &data_err);
286300
if (data_code == TOKEN_DATA_ERR_TOO_SHORT) {
287301
fprintf(stderr, "Token data validation failed: need at least %zu tokens (SEQ+1), got %zu\n",
288302
data_err.required_tokens, n_tokens);
289-
munmap(token_data, data_len); close(data_fd);
303+
munmap(token_data, data_len);
290304
return 1;
291305
}
292306
if (data_code == TOKEN_DATA_ERR_OOB_TOKEN) {
293307
fprintf(stderr, "Token data validation failed: token %u at index %zu is outside vocab [0, %d)\n",
294308
data_err.bad_token, data_err.bad_index, VOCAB);
295-
munmap(token_data, data_len); close(data_fd);
309+
munmap(token_data, data_len);
296310
return 1;
297311
}
298312

@@ -749,7 +763,7 @@ int main(int argc, char *argv[]) {
749763
layer_acts_free(&acts[L]); layer_grads_free(&grads[L]);
750764
}
751765
free_kern(softmaxKern); free_kern(finalRmsKern); free_kern(classifierKern);
752-
munmap(token_data, data_len); close(data_fd);
766+
munmap(token_data, data_len);
753767
free(rms_final); free(embed); free(grms_final); free(gembed);
754768
adam_free(&arms_final); adam_free(&aembed);
755769
free(dy); free(dffn); free(dh1); free(dh3); free(dx_ffn); free(dx2);

0 commit comments

Comments
 (0)