diff --git a/blink/biencoder/data_process.py b/blink/biencoder/data_process.py index b0c0c230..da5a9e11 100644 --- a/blink/biencoder/data_process.py +++ b/blink/biencoder/data_process.py @@ -36,6 +36,10 @@ def get_context_representation( if sample[mention_key] and len(sample[mention_key]) > 0: mention_tokens = tokenizer.tokenize(sample[mention_key]) mention_tokens = [ent_start_token] + mention_tokens + [ent_end_token] + # Truncate mention_tokens to at most max_seq_length - 4 tokens so that they never fill the whole sequence + # and at least one token from each of the left and right contexts is used (left_quota >= 1, right_quota >= 1) + assert max_seq_length > 4 + mention_tokens = mention_tokens[:max_seq_length - 4] context_left = sample[context_key + "_left"] context_right = sample[context_key + "_right"]