diff --git a/src/ragas/testset/persona.py b/src/ragas/testset/persona.py
index 09a56663b..c3d4bc95e 100644
--- a/src/ragas/testset/persona.py
+++ b/src/ragas/testset/persona.py
@@ -1,5 +1,4 @@
 import logging
-import random
 import typing as t
 
 import numpy as np
@@ -19,7 +18,7 @@ def default_filter(node: Node) -> bool:
         node.type.name == "DOCUMENT"
         and node.properties.get("summary_embedding") is not None
     ):
-        return random.random() < 0.25
+        return True
     else:
         return False
@@ -92,8 +91,14 @@ def generate_personas_from_kg(
     """
 
     nodes = [node for node in kg.nodes if filter_fn(node)]
+    if len(nodes) == 0:
+        raise ValueError(
+            "No nodes satisfied the given filter. Try changing the filter."
+        )
+
     summaries = [node.properties.get("summary") for node in nodes]
     summaries = [summary for summary in summaries if isinstance(summary, str)]
+    num_personas = min(num_personas, len(summaries))
 
     embeddings = []
     for node in nodes:
diff --git a/src/ragas/testset/synthesizers/__init__.py b/src/ragas/testset/synthesizers/__init__.py
index 7ccde116c..e31a86515 100644
--- a/src/ragas/testset/synthesizers/__init__.py
+++ b/src/ragas/testset/synthesizers/__init__.py
@@ -1,6 +1,7 @@
 import typing as t
 
 from ragas.llms import BaseRagasLLM
+from ragas.testset.graph import KnowledgeGraph
 from ragas.testset.synthesizers.multi_hop import (
     MultiHopAbstractQuerySynthesizer,
     MultiHopSpecificQuerySynthesizer,
 )
@@ -14,12 +15,24 @@
 QueryDistribution = t.List[t.Tuple[BaseSynthesizer, float]]
 
 
-def default_query_distribution(llm: BaseRagasLLM) -> QueryDistribution:
-    return [
-        (SingleHopSpecificQuerySynthesizer(llm=llm), 0.5),
-        (MultiHopAbstractQuerySynthesizer(llm=llm), 0.25),
-        (MultiHopSpecificQuerySynthesizer(llm=llm), 0.25),
+def default_query_distribution(
+    llm: BaseRagasLLM, kg: t.Optional[KnowledgeGraph] = None
+) -> QueryDistribution:
+    """Get the default query distribution, keeping only synthesizers applicable to the given knowledge graph."""
+    default_queries = [
+        SingleHopSpecificQuerySynthesizer(llm=llm),
+        MultiHopAbstractQuerySynthesizer(llm=llm),
+        MultiHopSpecificQuerySynthesizer(llm=llm),
     ]
+    if kg is not None:
+        available_queries = []
+        for query in default_queries:
+            if query.get_node_clusters(kg):
+                available_queries.append(query)
+    else:
+        available_queries = default_queries
+
+    return [(query, 1 / len(available_queries)) for query in available_queries]
 
 
 __all__ = [
diff --git a/src/ragas/testset/synthesizers/generate.py b/src/ragas/testset/synthesizers/generate.py
index c006e6c6a..65db77e3f 100644
--- a/src/ragas/testset/synthesizers/generate.py
+++ b/src/ragas/testset/synthesizers/generate.py
@@ -10,10 +10,7 @@
 from ragas._analytics import TestsetGenerationEvent, track
 from ragas.callbacks import new_group
 from ragas.cost import TokenUsageParser
-from ragas.embeddings.base import (
-    BaseRagasEmbeddings,
-    LlamaIndexEmbeddingsWrapper,
-)
+from ragas.embeddings.base import BaseRagasEmbeddings, LlamaIndexEmbeddingsWrapper
 from ragas.executor import Executor
 from ragas.llms import BaseRagasLLM, LangchainLLMWrapper, LlamaIndexLLMWrapper
 from ragas.run_config import RunConfig
@@ -155,6 +152,7 @@ def generate_with_langchain_docs(
 
         if not transforms:
             transforms = default_transforms(
+                documents=list(documents),
                 llm=transforms_llm or self.llm,
                 embedding_model=transforms_embedding_model,
             )
@@ -224,6 +222,7 @@ def generate_with_llamaindex_docs(
                 transforms_embedding_model
            )
            transforms = default_transforms(
+                documents=[LCDocument(page_content=doc.text) for doc in documents],
                llm=llm_for_transforms,
                embedding_model=embedding_model_for_transforms,
            )
@@ -312,7 +311,9 @@ def generate(
         if run_config is not None:
             self.llm.set_run_config(run_config)
 
-        query_distribution = query_distribution or default_query_distribution(self.llm)
+        query_distribution = query_distribution or default_query_distribution(
+            self.llm, self.knowledge_graph
+        )
         callbacks = callbacks or []
 
         # dict to store any callbacks we define
diff --git a/src/ragas/testset/synthesizers/multi_hop/abstract.py b/src/ragas/testset/synthesizers/multi_hop/abstract.py
index 65020ddf8..bd8d78a70 100644
--- a/src/ragas/testset/synthesizers/multi_hop/abstract.py
+++ b/src/ragas/testset/synthesizers/multi_hop/abstract.py
@@ -7,7 +7,7 @@
 import numpy as np
 
 from ragas.prompt import PydanticPrompt
-from ragas.testset.graph import KnowledgeGraph
+from ragas.testset.graph import KnowledgeGraph, Node
 from ragas.testset.graph_queries import get_child_nodes
 from ragas.testset.persona import Persona, PersonaList
 from ragas.testset.synthesizers.multi_hop.base import (
@@ -42,6 +42,17 @@ class MultiHopAbstractQuerySynthesizer(MultiHopQuerySynthesizer):
     concept_combination_prompt: PydanticPrompt = ConceptCombinationPrompt()
     theme_persona_matching_prompt: PydanticPrompt = ThemesPersonasMatchingPrompt()
 
+    def get_node_clusters(self, knowledge_graph: KnowledgeGraph) -> t.List[t.Set[Node]]:
+
+        node_clusters = knowledge_graph.find_indirect_clusters(
+            relationship_condition=lambda rel: (
+                True if rel.get_property("summary_similarity") else False
+            ),
+            depth_limit=3,
+        )
+        logger.info("found %d clusters", len(node_clusters))
+        return node_clusters
+
     async def _generate_scenarios(
         self,
         n: int,
@@ -61,18 +72,12 @@ async def _generate_scenarios(
         4. Sample diverse combinations of scenarios to get n samples
         """
 
-        node_clusters = knowledge_graph.find_indirect_clusters(
-            relationship_condition=lambda rel: (
-                True if rel.get_property("summary_similarity") else False
-            ),
-            depth_limit=3,
-        )
-        logger.info("found %d clusters", len(node_clusters))
+        node_clusters = self.get_node_clusters(knowledge_graph)
         scenarios = []
 
         if len(node_clusters) == 0:
             raise ValueError(
-                "No clusters found in the knowledge graph. Use a different Synthesizer."
+                "No clusters found in the knowledge graph. Try changing the relationship condition."
             )
 
         num_sample_per_cluster = int(np.ceil(n / len(node_clusters)))
diff --git a/src/ragas/testset/synthesizers/multi_hop/base.py b/src/ragas/testset/synthesizers/multi_hop/base.py
index 3b2e3010c..e51a14623 100644
--- a/src/ragas/testset/synthesizers/multi_hop/base.py
+++ b/src/ragas/testset/synthesizers/multi_hop/base.py
@@ -73,7 +73,7 @@ def prepare_combinations(
         valid_nodes = []
         for node in nodes:
             node_themes = [
-                theme.lower() for theme in node.get_property(property_name)
+                theme.lower() for theme in node.properties.get(property_name, [])
             ]
             if node.get_property(property_name) and any(
                 concept.lower() in node_themes for concept in combination
diff --git a/src/ragas/testset/synthesizers/multi_hop/specific.py b/src/ragas/testset/synthesizers/multi_hop/specific.py
index b71af16c3..02d483278 100644
--- a/src/ragas/testset/synthesizers/multi_hop/specific.py
+++ b/src/ragas/testset/synthesizers/multi_hop/specific.py
@@ -7,7 +7,7 @@
 import numpy as np
 
 from ragas.prompt import PydanticPrompt
-from ragas.testset.graph import KnowledgeGraph
+from ragas.testset.graph import KnowledgeGraph, Node
 from ragas.testset.persona import Persona, PersonaList
 from ragas.testset.synthesizers.multi_hop.base import (
     MultiHopQuerySynthesizer,
@@ -38,9 +38,26 @@ class MultiHopSpecificQuerySynthesizer(MultiHopQuerySynthesizer):
     """
 
     name: str = "multi_hop_specific_query_synthesizer"
+    relation_type: str = "entities_overlap"
+    property_name: str = "entities"
     theme_persona_matching_prompt: PydanticPrompt = ThemesPersonasMatchingPrompt()
     generate_query_reference_prompt: PydanticPrompt = QueryAnswerGenerationPrompt()
 
+    def get_node_clusters(self, knowledge_graph: KnowledgeGraph) -> t.List[t.Set[Node]]:
+
+        cluster_dict = knowledge_graph.find_direct_clusters(
+            relationship_condition=lambda rel: (
+                True if rel.type == self.relation_type else False
+            )
+        )
+        logger.info("found %d clusters", len(cluster_dict))
+        node_clusters = []
+        for key_node, list_of_nodes in cluster_dict.items():
+            for node in list_of_nodes:
+                node_clusters.append((key_node, node))
+
+        return node_clusters
+
     async def _generate_scenarios(
         self,
         n: int,
@@ -61,26 +78,21 @@ async def _generate_scenarios(
         4. Return the list of scenarios of length n
         """
 
-        cluster_dict = knowledge_graph.find_direct_clusters(
-            relationship_condition=lambda rel: (
-                True if rel.type == "entities_overlap" else False
+        node_clusters = self.get_node_clusters(knowledge_graph)
+
+        if len(node_clusters) == 0:
+            raise ValueError(
+                "No clusters found in the knowledge graph. Try changing the relationship condition."
             )
-        )
+
+        num_sample_per_cluster = int(np.ceil(n / len(node_clusters)))
 
         valid_relationships = [
             rel
             for rel in knowledge_graph.relationships
-            if rel.type == "entities_overlap"
+            if rel.type == self.relation_type
         ]
-
-        node_clusters = []
-        for key_node, list_of_nodes in cluster_dict.items():
-            for node in list_of_nodes:
-                node_clusters.append((key_node, node))
-
-        logger.info("found %d clusters", len(cluster_dict))
 
         scenarios = []
-        num_sample_per_cluster = int(np.ceil(n / len(node_clusters)))
 
         for cluster in node_clusters:
             if len(scenarios) < n:
@@ -106,7 +118,7 @@ async def _generate_scenarios(
                     overlapped_items,
                     PersonaList(personas=persona_list),
                     persona_concepts,
-                    property_name="entities",
+                    property_name=self.property_name,
                 )
                 base_scenarios = self.sample_diverse_combinations(
                     base_scenarios, num_sample_per_cluster
diff --git a/src/ragas/testset/synthesizers/single_hop/specific.py b/src/ragas/testset/synthesizers/single_hop/specific.py
index 283bca8d7..7561ac7b7 100644
--- a/src/ragas/testset/synthesizers/single_hop/specific.py
+++ b/src/ragas/testset/synthesizers/single_hop/specific.py
@@ -2,12 +2,13 @@
 
 import logging
 import typing as t
+from collections import defaultdict
 from dataclasses import dataclass
 
 import numpy as np
 
 from ragas.prompt import PydanticPrompt
-from ragas.testset.graph import KnowledgeGraph
+from ragas.testset.graph import KnowledgeGraph, Node
 from ragas.testset.persona import Persona, PersonaList
 from ragas.testset.synthesizers.base import BaseScenario
 from ragas.testset.synthesizers.prompts import (
@@ -40,6 +41,37 @@ class SingleHopScenario(BaseScenario):
 class SingleHopSpecificQuerySynthesizer(SingleHopQuerySynthesizer):
     name: str = "single_hop_specifc_query_synthesizer"
     theme_persona_matching_prompt: PydanticPrompt = ThemesPersonasMatchingPrompt()
+    property_name: str = "entities"
+
+    def get_node_clusters(self, knowledge_graph: KnowledgeGraph) -> t.List[Node]:
+
+        node_type_dict = defaultdict(int)
+        for node in knowledge_graph.nodes:
+            if (
+                node.type.name == "CHUNK"
+                and node.get_property(self.property_name) is not None
+            ):
+                node_type_dict["CHUNK"] += 1
+            elif (
+                node.type.name == "DOCUMENT"
+                and node.get_property(self.property_name) is not None
+            ):
+                node_type_dict["DOCUMENT"] += 1
+            else:
+                pass
+
+        node_filter = (
+            "CHUNK"
+            if node_type_dict["CHUNK"] > node_type_dict["DOCUMENT"]
+            else "DOCUMENT"
+        )
+
+        nodes = []
+        for node in knowledge_graph.nodes:
+            if node.type.name == node_filter:
+                nodes.append(node)
+
+        return nodes
 
     async def _generate_scenarios(
         self,
@@ -61,15 +93,7 @@ async def _generate_scenarios(
         4. Return the list of scenarios
         """
 
-        property_name = "entities"
-        nodes = []
-        for node in knowledge_graph.nodes:
-            if (
-                node.type.name == "CHUNK"
-                and node.get_property(property_name) is not None
-            ):
-                nodes.append(node)
-
+        nodes = self.get_node_clusters(knowledge_graph)
         if len(nodes) == 0:
             raise ValueError("No nodes found with the `entities` property.")
         samples_per_node = int(np.ceil(n / len(nodes)))
@@ -78,7 +102,7 @@ async def _generate_scenarios(
         for node in nodes:
             if len(scenarios) >= n:
                 break
-            themes = node.get_property(property_name)
+            themes = node.properties.get(self.property_name, [""])
             prompt_input = ThemesPersonasInput(themes=themes, personas=persona_list)
             persona_concepts = await self.theme_persona_matching_prompt.generate(
                 data=prompt_input, llm=self.llm, callbacks=callbacks
diff --git a/src/ragas/testset/transforms/default.py b/src/ragas/testset/transforms/default.py
index 071c42756..11c0f84f9 100644
--- a/src/ragas/testset/transforms/default.py
+++ b/src/ragas/testset/transforms/default.py
@@ -25,8 +25,11 @@
 
 from .engine import Transforms
 
+from langchain_core.documents import Document as LCDocument
+
 
 def default_transforms(
+    documents: t.List[LCDocument],
     llm: BaseRagasLLM,
     embedding_model: BaseRagasEmbeddings,
 ) -> Transforms:
@@ -47,51 +50,116 @@ def default_transforms(
     """
 
-    headline_extractor = HeadlinesExtractor(llm=llm)
-    splitter = HeadlineSplitter(min_tokens=500)
+    def count_doc_length_bins(documents, bin_ranges):
+        data = [num_tokens_from_string(doc.page_content) for doc in documents]
+        bins = {f"{start}-{end}": 0 for start, end in bin_ranges}
+
+        for num in data:
+            for start, end in bin_ranges:
+                if start <= num <= end:
+                    bins[f"{start}-{end}"] += 1
+                    break  # Move to the next number once it's placed in a bin
 
-    def summary_filter(node):
+        return bins
+
+    def filter_doc_with_num_tokens(node, min_num_tokens=500):
         return (
             node.type == NodeType.DOCUMENT
-            and num_tokens_from_string(node.properties["page_content"]) > 500
+            and num_tokens_from_string(node.properties["page_content"]) > min_num_tokens
+        )
+
+    def filter_docs(node):
+        return node.type == NodeType.DOCUMENT
+
+    def filter_chunks(node):
+        return node.type == NodeType.CHUNK
+
+    bin_ranges = [(0, 100), (101, 500), (501, 100000)]
+    result = count_doc_length_bins(documents, bin_ranges)
+    result = {k: v / len(documents) for k, v in result.items()}
+
+    transforms = []
+
+    if result["501-100000"] >= 0.25:
+        headline_extractor = HeadlinesExtractor(
+            llm=llm, filter_nodes=lambda node: filter_doc_with_num_tokens(node)
+        )
+        splitter = HeadlineSplitter(min_tokens=500)
+        summary_extractor = SummaryExtractor(
+            llm=llm, filter_nodes=lambda node: filter_doc_with_num_tokens(node)
+        )
+
+        theme_extractor = ThemesExtractor(
+            llm=llm, filter_nodes=lambda node: filter_chunks(node)
+        )
+        ner_extractor = NERExtractor(
+            llm=llm, filter_nodes=lambda node: filter_chunks(node)
+        )
+
+        summary_emb_extractor = EmbeddingExtractor(
+            embedding_model=embedding_model,
+            property_name="summary_embedding",
+            embed_property_name="summary",
+            filter_nodes=lambda node: filter_doc_with_num_tokens(node),
+        )
+
+        cosine_sim_builder = CosineSimilarityBuilder(
+            property_name="summary_embedding",
+            new_property_name="summary_similarity",
+            threshold=0.7,
+            filter_nodes=lambda node: filter_doc_with_num_tokens(node),
+        )
+
+        ner_overlap_sim = OverlapScoreBuilder(
+            threshold=0.01, filter_nodes=lambda node: filter_chunks(node)
+        )
+
+        node_filter = CustomNodeFilter(
+            llm=llm, filter_nodes=lambda node: filter_chunks(node)
         )
-    summary_extractor = SummaryExtractor(
-        llm=llm, filter_nodes=lambda node: summary_filter(node)
-    )
-
-    theme_extractor = ThemesExtractor(llm=llm)
-    ner_extractor = NERExtractor(
-        llm=llm, filter_nodes=lambda node: node.type == NodeType.CHUNK
-    )
-
-    summary_emb_extractor = EmbeddingExtractor(
-        embedding_model=embedding_model,
-        property_name="summary_embedding",
-        embed_property_name="summary",
-        filter_nodes=lambda node: summary_filter(node),
-    )
-
-    cosine_sim_builder = CosineSimilarityBuilder(
-        property_name="summary_embedding",
-        new_property_name="summary_similarity",
-        threshold=0.7,
-        filter_nodes=lambda node: summary_filter(node),
-    )
-
-    ner_overlap_sim = OverlapScoreBuilder(
-        threshold=0.01, filter_nodes=lambda node: node.type == NodeType.CHUNK
-    )
-
-    node_filter = CustomNodeFilter(llm=llm, filter_nodes=lambda node: node.type == NodeType.CHUNK)
-
-    transforms = [
-        headline_extractor,
-        splitter,
-        summary_extractor,
-        node_filter,
-        Parallel(summary_emb_extractor, theme_extractor, ner_extractor),
-        Parallel(cosine_sim_builder, ner_overlap_sim),
-    ]
+        transforms = [
+            headline_extractor,
+            splitter,
+            summary_extractor,
+            node_filter,
+            Parallel(summary_emb_extractor, theme_extractor, ner_extractor),
+            Parallel(cosine_sim_builder, ner_overlap_sim),
+        ]
+    elif result["101-500"] >= 0.25:
+        summary_extractor = SummaryExtractor(
+            llm=llm, filter_nodes=lambda node: filter_doc_with_num_tokens(node, 100)
+        )
+        summary_emb_extractor = EmbeddingExtractor(
+            embedding_model=embedding_model,
+            property_name="summary_embedding",
+            embed_property_name="summary",
+            filter_nodes=lambda node: filter_doc_with_num_tokens(node, 100),
+        )
+
+        cosine_sim_builder = CosineSimilarityBuilder(
+            property_name="summary_embedding",
+            new_property_name="summary_similarity",
+            threshold=0.5,
+            filter_nodes=lambda node: filter_doc_with_num_tokens(node, 100),
+        )
+
+        ner_extractor = NERExtractor(llm=llm)
+        ner_overlap_sim = OverlapScoreBuilder(threshold=0.01)
+        theme_extractor = ThemesExtractor(
+            llm=llm, filter_nodes=lambda node: filter_docs(node)
+        )
+        node_filter = CustomNodeFilter(llm=llm)
+
+        transforms = [
+            summary_extractor,
+            node_filter,
+            Parallel(summary_emb_extractor, theme_extractor, ner_extractor),
+            ner_overlap_sim,
+        ]
+    else:
+        raise ValueError(
+            "Documents appear to be too short (i.e., 100 tokens or fewer). Please provide longer documents."
+        )
 
     return transforms
diff --git a/src/ragas/testset/transforms/extractors/llm_based.py b/src/ragas/testset/transforms/extractors/llm_based.py
index c7480e926..04616daa1 100644
--- a/src/ragas/testset/transforms/extractors/llm_based.py
+++ b/src/ragas/testset/transforms/extractors/llm_based.py
@@ -71,7 +71,7 @@ class TitleExtractorPrompt(PydanticPrompt[StringIO, StringIO]):
 
 
 class Headlines(BaseModel):
-    headlines: t.List[str]
+    headlines: t.List[str]
 
 
 class HeadlinesExtractorPrompt(PydanticPrompt[TextWithExtractionLimit, Headlines]):
@@ -115,8 +115,9 @@ class HeadlinesExtractorPrompt(PydanticPrompt[TextWithExtractionLimit, Headlines]):
                     "Main Concepts",
                     "Detailed Analysis",
                     "Future Directions",
-                ],)
+                ],
             ),
+        ),
     ]
diff --git a/src/ragas/testset/transforms/filters.py b/src/ragas/testset/transforms/filters.py
index 44370fde4..44add2758 100644
--- a/src/ragas/testset/transforms/filters.py
+++ b/src/ragas/testset/transforms/filters.py
@@ -63,14 +63,20 @@ class CustomNodeFilter(LLMBasedNodeFilter):
 
     async def custom_filter(self, node: Node, kg: KnowledgeGraph) -> bool:
 
-        parent_nodes = get_parent_nodes(node, kg)
-        if len(parent_nodes) > 0:
-            summary = parent_nodes[0].properties.get("summary", "")
+        if node.type.name == "CHUNK":
+            parent_nodes = get_parent_nodes(node, kg)
+            if len(parent_nodes) > 0:
+                summary = parent_nodes[0].properties.get("summary", "")
+            else:
+                summary = ""
         else:
-            summary = ""
+            summary = node.properties.get("summary", "")
 
         if summary == "":
-            logger.warning(f"Node {node} has no parent node with a summary.")
+            logger.warning(
+                f"Node {node.id} does not have a summary. Skipping filtering."
+            )
+            return False
 
         prompt_input = QuestionPotentialInput(
             document_summary=summary,
diff --git a/tests/unit/test_analytics.py b/tests/unit/test_analytics.py
index 4233ea7d3..7f263d51c 100644
--- a/tests/unit/test_analytics.py
+++ b/tests/unit/test_analytics.py
@@ -2,6 +2,7 @@
 
 import typing as t
 
+import numpy as np
 import pytest
 from langchain_core.outputs import Generation, LLMResult
 from langchain_core.prompt_values import StringPromptValue as PromptValue
@@ -139,11 +140,17 @@ def test_testset_generation_tracking(monkeypatch):
         "multi_hop_specific_query_synthesizer",
     ]
 
-    assert testset_event_payload.model_dump()["evolution_percentages"] == [
-        0.5,
-        0.25,
-        0.25,
-    ]
+    assert all(
+        np.isclose(
+            testset_event_payload.model_dump()["evolution_percentages"],
+            [
+                0.33,
+                0.33,
+                0.33,
+            ],
+            atol=0.01,
+        ).tolist()
+    )
 
     # just in the case you actually want to check if tracking is working in the
     # dashboard
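
Usage sketch (not part of the patch): the snippet below illustrates the two signature changes in this diff — `default_transforms` now receives the documents so it can pick a pipeline by token-length bins, and `default_query_distribution` can take a knowledge graph to drop synthesizers that find nothing to work with. The loader, model name, and `data/` path are placeholders, and the import paths for `LangchainEmbeddingsWrapper` and `apply_transforms` are assumed from the existing ragas API rather than defined in this diff.

```python
# Hedged sketch; model name, data path, and loader are placeholder assumptions.
from langchain_community.document_loaders import DirectoryLoader
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper
from ragas.testset.graph import KnowledgeGraph, Node, NodeType
from ragas.testset.synthesizers import default_query_distribution
from ragas.testset.transforms import apply_transforms, default_transforms

llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

docs = DirectoryLoader("data/").load()

# default_transforms now needs the documents themselves so it can bin them by
# token count (0-100 / 101-500 / 501+) and choose a matching transform pipeline.
transforms = default_transforms(documents=docs, llm=llm, embedding_model=embeddings)

# Build a knowledge graph from the documents and enrich it with the transforms.
kg = KnowledgeGraph()
for doc in docs:
    kg.nodes.append(
        Node(
            type=NodeType.DOCUMENT,
            properties={
                "page_content": doc.page_content,
                "document_metadata": doc.metadata,
            },
        )
    )
apply_transforms(kg, transforms)

# default_query_distribution can now inspect the graph and keep only the
# synthesizers whose get_node_clusters() finds something to work with; the
# remaining synthesizers share equal probabilities.
distribution = default_query_distribution(llm, kg=kg)
for synthesizer, probability in distribution:
    print(synthesizer.name, probability)
```

With this change, a synthesizer whose cluster search comes up empty is dropped up front instead of failing at generation time, and the remaining weights are renormalized to sum to 1 (hence the 0.33 values asserted in the updated test).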