Skip to content

Commit d4836fd

Browse files
authored
feat: return Unprocessable error when an expected error occurs (#5347)
This PR returns an Unprocessable error when an expected error occurs. This way, callers can print a helpful message to their users instead of just returning the Index error. --- **This PR was primarily authored with Codex using GPT-5-Codex and then hand-reviewed by me. I AM responsible for every change made in this PR. I aimed to keep it aligned with our goals, though I may have missed minor issues. Please flag anything that feels off, I'll fix it quickly.** --------- Signed-off-by: Xuanwo <[email protected]>
1 parent ef12986 commit d4836fd

File tree

5 files changed

+54
-14
lines changed

5 files changed

+54
-14
lines changed

rust/lance-core/src/error.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,8 @@ pub enum Error {
6363
Internal { message: String, location: Location },
6464
#[snafu(display("A prerequisite task failed: {message}, {location}"))]
6565
PrerequisiteFailed { message: String, location: Location },
66+
#[snafu(display("Unprocessable: {message}, {location}"))]
67+
Unprocessable { message: String, location: Location },
6668
#[snafu(display("LanceError(Arrow): {message}, {location}"))]
6769
Arrow { message: String, location: Location },
6870
#[snafu(display("LanceError(Schema): {message}, {location}"))]

rust/lance-index/src/vector/kmeans.rs

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1319,9 +1319,12 @@ where
13191319
{
13201320
let num_rows = array.len() / dimension;
13211321
if num_rows < k {
1322-
return Err(Error::Index{message: format!(
1323-
"KMeans: can not train {k} centroids with {num_rows} vectors, choose a smaller K (< {num_rows}) instead"
1324-
),location: location!()});
1322+
return Err(Error::Unprocessable {
1323+
message: format!(
1324+
"KMeans cannot train {k} centroids with {num_rows} vectors; choose a smaller K (< {num_rows})"
1325+
),
1326+
location: location!(),
1327+
});
13251328
}
13261329

13271330
// Only sample sample_rate * num_clusters. See Faiss

rust/lance-index/src/vector/pq/builder.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -171,10 +171,9 @@ impl PQBuildParams {
171171

172172
let num_centroids = 2_usize.pow(self.num_bits as u32);
173173
if data.len() < num_centroids {
174-
return Err(Error::Index {
174+
return Err(Error::Unprocessable {
175175
message: format!(
176-
"Not enough rows to train PQ. Requires {:?} rows but only {:?} available",
177-
num_centroids,
176+
"Not enough rows to train PQ. Requires {num_centroids} rows but only {} available",
178177
data.len()
179178
),
180179
location: location!(),

rust/lance/src/index.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2491,8 +2491,8 @@ mod tests {
24912491
.create_index(&["vector"], IndexType::Vector, None, &params, false)
24922492
.await;
24932493

2494-
assert!(matches!(result, Err(Error::Index { .. })));
2495-
if let Error::Index { message, .. } = result.unwrap_err() {
2494+
assert!(matches!(result, Err(Error::Unprocessable { .. })));
2495+
if let Error::Unprocessable { message, .. } = result.unwrap_err() {
24962496
assert_eq!(
24972497
message,
24982498
"Not enough rows to train PQ. Requires 256 rows but only 100 available",

rust/lance/src/index/vector/pq.rs

Lines changed: 42 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -508,6 +508,8 @@ pub async fn build_pq_model(
508508
params: &PQBuildParams,
509509
ivf: Option<&IvfModel>,
510510
) -> Result<ProductQuantizer> {
511+
let num_codes = 2_usize.pow(params.num_bits as u32);
512+
511513
if let Some(codebook) = &params.codebook {
512514
let dt = if metric_type == MetricType::Cosine {
513515
info!("Normalize training data for PQ training: Cosine");
@@ -577,13 +579,16 @@ pub async fn build_pq_model(
577579
training_data
578580
};
579581

580-
let num_codes = 2_usize.pow(params.num_bits as u32);
581582
if training_data.len() < num_codes {
582-
return Err(Error::Index {
583+
warn!(
584+
"Skip PQ training: only {} rows available, needs >= {}",
585+
training_data.len(),
586+
num_codes
587+
);
588+
return Err(Error::Unprocessable {
583589
message: format!(
584-
"Not enough rows to train PQ. Requires {:?} rows but only {:?} available",
585-
num_codes,
586-
training_data.len()
590+
"Not enough rows to train PQ. Requires {num_codes} rows but only {available} available",
591+
available = training_data.len()
587592
),
588593
location: location!(),
589594
});
@@ -637,7 +642,9 @@ mod tests {
637642
use crate::index::vector::ivf::build_ivf_model;
638643
use lance_core::utils::mask::RowIdMask;
639644
use lance_index::vector::ivf::IvfBuildParams;
640-
use lance_testing::datagen::generate_random_array_with_range;
645+
use lance_testing::datagen::{
646+
generate_random_array_with_range, generate_random_array_with_seed,
647+
};
641648

642649
const DIM: usize = 128;
643650
async fn generate_dataset(
@@ -761,6 +768,35 @@ mod tests {
761768
);
762769
}
763770

771+
#[tokio::test]
772+
async fn test_build_pq_model_insufficient_rows_returns_prereq() {
773+
let test_dir = TempStrDir::default();
774+
let test_uri = test_dir.as_str();
775+
776+
let dim = 16;
777+
let schema = Arc::new(Schema::new(vec![Field::new(
778+
"vector",
779+
DataType::FixedSizeList(
780+
Arc::new(Field::new("item", DataType::Float32, true)),
781+
dim as i32,
782+
),
783+
false,
784+
)]));
785+
786+
let vectors = generate_random_array_with_seed::<Float32Type>(dim * 10, [11u8; 32]);
787+
let fsl = FixedSizeListArray::try_new_from_values(vectors, dim as i32).unwrap();
788+
let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(fsl)]).unwrap();
789+
let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone());
790+
let dataset = Dataset::write(reader, test_uri, None).await.unwrap();
791+
792+
let params = PQBuildParams::new(16, 8);
793+
let err = build_pq_model(&dataset, "vector", dim, MetricType::L2, &params, None)
794+
.await
795+
.unwrap_err();
796+
797+
assert!(matches!(err, Error::Unprocessable { .. }));
798+
}
799+
764800
struct TestPreFilter {
765801
row_ids: Vec<u64>,
766802
}

0 commit comments

Comments
 (0)