From 95f7fd82fe8afff1266721d956ca6d2140d3a503 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BF=9F=E6=8C=81=E6=B1=9F?= <129171955+2513502304@users.noreply.github.com> Date: Mon, 5 May 2025 19:38:19 +0800 Subject: [PATCH 1/5] Update _spacy.py, Working with the new ._.trf_data object (3.7+) --- bertopic/backend/_spacy.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/bertopic/backend/_spacy.py b/bertopic/backend/_spacy.py index f55fd080..6bb98d7f 100644 --- a/bertopic/backend/_spacy.py +++ b/bertopic/backend/_spacy.py @@ -84,7 +84,13 @@ def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray: if embedding.has_vector: embedding = embedding.vector else: - embedding = embedding._.trf_data.tensors[-1][0] + # Transformer pipeline design: https://spacy.io/models#design-trf + try: + # For spaCy v3.0-v3.6, trf pipelines use spacy-transformers and the transformer output in doc._.trf_data is a TransformerData object. + embedding = embedding._.trf_data.tensors[-1][0] + except AttributeError as e: + # For spaCy v3.7+, trf pipelines use spacy-curated-transformers and doc._.trf_data is a DocTransformerOutput object. + embedding = embedding._.trf_data.last_hidden_layer_state.data # or embedding._.trf_data.all_outputs[-1].data if not isinstance(embedding, np.ndarray) and hasattr(embedding, "get"): # Convert cupy array to numpy array From 79eb23a33d2ac392f62d34c607361863bb6907b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BF=9F=E6=8C=81=E6=B1=9F?= <129171955+2513502304@users.noreply.github.com> Date: Mon, 5 May 2025 20:01:27 +0800 Subject: [PATCH 2/5] Update _spacy.py --- bertopic/backend/_spacy.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bertopic/backend/_spacy.py b/bertopic/backend/_spacy.py index 6bb98d7f..3f502868 100644 --- a/bertopic/backend/_spacy.py +++ b/bertopic/backend/_spacy.py @@ -90,7 +90,9 @@ def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray: embedding = embedding._.trf_data.tensors[-1][0] except AttributeError as e: # For spaCy v3.7+, trf pipelines use spacy-curated-transformers and doc._.trf_data is a DocTransformerOutput object. - embedding = embedding._.trf_data.last_hidden_layer_state.data # or embedding._.trf_data.all_outputs[-1].data + embedding = ( + embedding._.trf_data.last_hidden_layer_state.data + ) # or embedding._.trf_data.all_outputs[-1].data if not isinstance(embedding, np.ndarray) and hasattr(embedding, "get"): # Convert cupy array to numpy array From bab6a2e98c672e1d9c90e49bdec3e5e89dd4d59b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BF=9F=E6=8C=81=E6=B1=9F?= <129171955+2513502304@users.noreply.github.com> Date: Mon, 5 May 2025 20:03:36 +0800 Subject: [PATCH 3/5] Update _spacy.py --- bertopic/backend/_spacy.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/bertopic/backend/_spacy.py b/bertopic/backend/_spacy.py index 3f502868..96a110af 100644 --- a/bertopic/backend/_spacy.py +++ b/bertopic/backend/_spacy.py @@ -90,9 +90,8 @@ def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray: embedding = embedding._.trf_data.tensors[-1][0] except AttributeError as e: # For spaCy v3.7+, trf pipelines use spacy-curated-transformers and doc._.trf_data is a DocTransformerOutput object. - embedding = ( - embedding._.trf_data.last_hidden_layer_state.data - ) # or embedding._.trf_data.all_outputs[-1].data + embedding = embedding._.trf_data.last_hidden_layer_state.data + # embedding = embedding._.trf_data.all_outputs[-1].data if not isinstance(embedding, np.ndarray) and hasattr(embedding, "get"): # Convert cupy array to numpy array From bfdd61978f6d37ad6dff7b1ada49906fb751269a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BF=9F=E6=8C=81=E6=B1=9F?= <129171955+2513502304@users.noreply.github.com> Date: Mon, 5 May 2025 20:05:26 +0800 Subject: [PATCH 4/5] Update _spacy.py --- bertopic/backend/_spacy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bertopic/backend/_spacy.py b/bertopic/backend/_spacy.py index 96a110af..6fdfacc9 100644 --- a/bertopic/backend/_spacy.py +++ b/bertopic/backend/_spacy.py @@ -88,10 +88,10 @@ def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray: try: # For spaCy v3.0-v3.6, trf pipelines use spacy-transformers and the transformer output in doc._.trf_data is a TransformerData object. embedding = embedding._.trf_data.tensors[-1][0] - except AttributeError as e: + except AttributeError: # For spaCy v3.7+, trf pipelines use spacy-curated-transformers and doc._.trf_data is a DocTransformerOutput object. - embedding = embedding._.trf_data.last_hidden_layer_state.data # embedding = embedding._.trf_data.all_outputs[-1].data + embedding = embedding._.trf_data.last_hidden_layer_state.data if not isinstance(embedding, np.ndarray) and hasattr(embedding, "get"): # Convert cupy array to numpy array From 37ae01e5688150faca4ca4ce4f7342113b500941 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BF=9F=E6=8C=81=E6=B1=9F?= <129171955+2513502304@users.noreply.github.com> Date: Mon, 12 May 2025 13:53:34 +0800 Subject: [PATCH 5/5] Update _spacy.py --- bertopic/backend/_spacy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bertopic/backend/_spacy.py b/bertopic/backend/_spacy.py index 6fdfacc9..42025867 100644 --- a/bertopic/backend/_spacy.py +++ b/bertopic/backend/_spacy.py @@ -90,8 +90,8 @@ def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray: embedding = embedding._.trf_data.tensors[-1][0] except AttributeError: # For spaCy v3.7+, trf pipelines use spacy-curated-transformers and doc._.trf_data is a DocTransformerOutput object. - # embedding = embedding._.trf_data.all_outputs[-1].data - embedding = embedding._.trf_data.last_hidden_layer_state.data + # embedding = embedding._.trf_data.all_outputs[-1].data.mean(axis=0) + embedding = embedding._.trf_data.last_hidden_layer_state.data.mean(axis=0) if not isinstance(embedding, np.ndarray) and hasattr(embedding, "get"): # Convert cupy array to numpy array