From 3eeb3ba288444c99ccbeda83864ece313c24d88c Mon Sep 17 00:00:00 2001
From: Amit Portnoy <1131991+amitport@users.noreply.github.com>
Date: Sat, 1 Mar 2025 08:19:57 +0200
Subject: [PATCH] max_seq_length should not be larger than any options

when loading an auto-model, max_seq_length is read directly from
huggingface and it cannot be overwritten easily.
---
 sentence_transformers/models/Transformer.py | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/sentence_transformers/models/Transformer.py b/sentence_transformers/models/Transformer.py
index a95f12d7c..65c59f119 100644
--- a/sentence_transformers/models/Transformer.py
+++ b/sentence_transformers/models/Transformer.py
@@ -88,16 +88,18 @@ def __init__(
             **tokenizer_args,
         )
 
-        # No max_seq_length set. Try to infer from model
-        if max_seq_length is None:
-            if (
-                hasattr(self.auto_model, "config")
-                and hasattr(self.auto_model.config, "max_position_embeddings")
-                and hasattr(self.tokenizer, "model_max_length")
-            ):
-                max_seq_length = min(self.auto_model.config.max_position_embeddings, self.tokenizer.model_max_length)
-
-        self.max_seq_length = max_seq_length
+        max_seq_options = []
+        if max_seq_length is not None:
+            max_seq_options.append(max_seq_length)
+        if (
+            hasattr(self.auto_model, "config")
+            and hasattr(self.auto_model.config, "max_position_embeddings")
+        ):
+            max_seq_options.append(self.auto_model.config.max_position_embeddings)
+        if hasattr(self.tokenizer, "model_max_length"):
+            max_seq_options.append(self.tokenizer.model_max_length)
+
+        self.max_seq_length = min(max_seq_options)
 
         if tokenizer_name_or_path is not None:
             self.auto_model.config.tokenizer_class = self.tokenizer.__class__.__name__