diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 00000000..6555687d --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,27 @@ +# Adala Development Guidelines + +## Build & Test Commands +- Install dependencies: `poetry install --with dev` +- Enter environment: `poetry shell` +- Run all tests: `pytest` +- Run specific test: `pytest tests/test_file.py::test_function_name` +- Run tests with recording: `pytest --record_mode=once --block-network` +- Run tests with network: `pytest -m "use_openai or use_azure"` +- Build docs: `mkdocs serve -f ./docs/mkdocs.yml` + +## Code Style +- Use Python type hints throughout the codebase +- Follow PEP 8 naming conventions: snake_case for variables/functions, CamelCase for classes +- Prefer composition over inheritance when extending framework components +- When defining new skills, inherit from appropriate base classes +- Use f-strings for string formatting +- Docstrings should follow Google style format +- Exception handling should use specific exception types from utils.exceptions +- Test coverage required for all new features + +## Architecture +- Agent: Main entry point - connects Skills, Environments and Runtimes +- Skills: Core capabilities (classification, extraction, etc.) +- Environments: Data sources and interaction points +- Runtimes: LLM backends (OpenAI, etc.) +- Utils: Shared functionality across components \ No newline at end of file diff --git a/adala/README.md b/adala/README.md new file mode 100644 index 00000000..1903e06e --- /dev/null +++ b/adala/README.md @@ -0,0 +1,41 @@ +# Adala - Simplified Data Processing with LLMs + +This is a refactored version of the Adala framework that provides a streamlined interface for processing tabular data through LLMs. + +## Key Components + +- **DataTable**: A lightweight wrapper around pandas DataFrame +- **BatchLLMRuntime**: Efficient batch processing of data through LLMs +- **DataProcessor**: High-level interface for data processing tasks +- **Classifier**: Specialized processor for classification tasks + +## Getting Started + +```python +import pandas as pd +from adala import Classifier + +# Create sample data +df = pd.DataFrame([ + "Not loud enough and doesn't turn on like it should.", + "The product works perfectly fine.", + "I absolutely love this device!" +], columns=["text"]) + +# Create a classifier +classifier = Classifier( + instructions="Classify product reviews as positive, negative, or neutral.", + labels=["Positive", "Negative", "Neutral"], + model="gpt-3.5-turbo" +) + +# Process the data +results = classifier.process(df) +print(results[["text", "label"]]) +``` + +## Legacy Components + +The original Adala components (Agent, Skills, Environments, Memories) are still available but deprecated. They have been moved to the `adala.legacy` module and will be removed in a future version. + +For migration guidance, see [MIGRATION.md](./core/MIGRATION.md) \ No newline at end of file diff --git a/adala/__init__.py b/adala/__init__.py index e69de29b..64bb970c 100644 --- a/adala/__init__.py +++ b/adala/__init__.py @@ -0,0 +1,31 @@ +""" +Adala: Data Processing with LLMs + +Adala provides tools for efficient data processing using Large Language Models. +""" + +__version__ = "0.0.4dev" + +# Import core components +from adala.core import DataProcessor, Classifier, LabelStudioProcessor, DataTable, BatchLLMRuntime + +# Legacy imports (with deprecation warnings) +from adala.agents import Agent +from adala.environments import StaticEnvironment +from adala.skills import ClassificationSkill, TransformSkill, LabelStudioSkill + +__all__ = [ + # Core components + 'DataProcessor', + 'Classifier', + 'LabelStudioProcessor', + 'DataTable', + 'BatchLLMRuntime', + + # Legacy components + 'Agent', + 'StaticEnvironment', + 'ClassificationSkill', + 'TransformSkill', + 'LabelStudioSkill', +] \ No newline at end of file diff --git a/adala/agents/__init__.py b/adala/agents/__init__.py index 0262c62d..3aed1c1b 100644 --- a/adala/agents/__init__.py +++ b/adala/agents/__init__.py @@ -1 +1,2 @@ -from .base import Agent, create_agent_from_file, create_agent_from_dict +"""Legacy module, use adala.core instead.""" +from adala.legacy.agents import * \ No newline at end of file diff --git a/adala/core/MIGRATION.md b/adala/core/MIGRATION.md new file mode 100644 index 00000000..4dfe1fe6 --- /dev/null +++ b/adala/core/MIGRATION.md @@ -0,0 +1,256 @@ +# Migration Guide: Transitioning to the Simplified API + +This guide will help you migrate from the original Adala API to the simplified core API. The new API offers a more direct way to process tabular data through LLMs in batch mode, with less boilerplate and abstraction. + +## Mapping Between APIs + +| Original API | Simplified API | Notes | +|--------------|----------------|-------| +| `Agent` | `DataProcessor` | The main entry point for processing data | +| `ClassificationSkill` | `Classifier` | Specialized processor for classification tasks | +| `LabelStudioSkill` | `LabelStudioProcessor` | Processor for Label Studio annotations | +| `StaticEnvironment` | Direct pandas/DataTable | Work directly with your data | +| `OpenAIChatRuntime` | `BatchLLMRuntime` | Streamlined runtime for batch processing | +| `InternalDataFrame` | `DataTable` | Enhanced pandas DataFrame for batch operations | + +## Code Examples + +### Classification Example + +#### Original API: + +```python +from adala.agents import Agent +from adala.environments import StaticEnvironment +from adala.skills import ClassificationSkill +from adala.runtimes import OpenAIChatRuntime +import pandas as pd + +df = pd.DataFrame([ + ["The product works great", "Positive"], + ["Terrible quality, avoid", "Negative"] +], columns=["text", "sentiment"]) + +agent = Agent( + skills=ClassificationSkill( + name='sentiment', + instructions="Classify text as positive, negative or neutral.", + labels={'sentiment': ["Positive", "Negative", "Neutral"]}, + input_template="Text: {text}", + output_template="Sentiment: {sentiment}" + ), + environment=StaticEnvironment( + df=df, + ground_truth_columns={'sentiment': 'sentiment'} + ), + runtimes = { + 'openai': OpenAIChatRuntime(model='gpt-3.5-turbo'), + }, + default_runtime='openai' +) + +# Train the agent +agent.learn(learning_iterations=3) + +# Run prediction on new data +test_df = pd.DataFrame(["This is a new product"], columns=["text"]) +predictions = agent.run(test_df) +``` + +#### Simplified API: + +```python +from adala.core import Classifier +import pandas as pd + +# Training data +df = pd.DataFrame([ + ["The product works great", "Positive"], + ["Terrible quality, avoid", "Negative"] +], columns=["text", "sentiment"]) + +# Create classifier +classifier = Classifier( + instructions="Classify text as positive, negative or neutral.", + labels=["Positive", "Negative", "Neutral"], + model="gpt-3.5-turbo" +) + +# No separate training step required; examples can be added as context +classifier.add_context( + examples=[ + {"text": "The product works great", "label": "Positive"}, + {"text": "Terrible quality, avoid", "label": "Negative"} + ] +) + +# Run prediction on new data +test_df = pd.DataFrame(["This is a new product"], columns=["text"]) +predictions = classifier.process(test_df) +``` + +### Custom Processing Example + +#### Original API: + +```python +from adala.agents import Agent +from adala.skills import TransformSkill +from adala.runtimes import OpenAIChatRuntime +from pydantic import BaseModel, Field + +class EntityExtraction(BaseModel): + person: str = Field(..., description="Person mentioned in text") + location: str = Field(..., description="Location mentioned in text") + +agent = Agent( + skills=TransformSkill( + name='entity_extraction', + instructions="Extract person and location entities from text.", + input_template="Text: {text}", + output_template="Entities: {entities}", + response_model=EntityExtraction + ), + runtimes = { + 'openai': OpenAIChatRuntime(model='gpt-4'), + }, + default_runtime='openai' +) + +# Run on data +import pandas as pd +df = pd.DataFrame(["John visited Paris last summer"], columns=["text"]) +results = agent.run(df) +``` + +#### Simplified API: + +```python +from adala.core import DataProcessor +from pydantic import BaseModel, Field +import pandas as pd + +class EntityExtraction(BaseModel): + person: str = Field(..., description="Person mentioned in text") + location: str = Field(..., description="Location mentioned in text") + +processor = DataProcessor( + prompt_template="Extract person and location entities from this text: {text}", + response_model=EntityExtraction, + model="gpt-4" +) + +# Run on data +df = pd.DataFrame(["John visited Paris last summer"], columns=["text"]) +results = processor.process(df) +``` + +### Label Studio Example + +#### Original API: + +```python +from adala.agents import Agent +from adala.skills import LabelStudioSkill +from adala.runtimes import OpenAIChatRuntime +import pandas as pd + +# Define the Label Studio configuration +label_config = """ + + + + + +""" + +# Create the agent with Label Studio skill +agent = Agent( + skills=LabelStudioSkill( + name='ner_tagger', + instructions="Annotate the text with named entities.", + label_config=label_config + ), + runtimes = { + 'openai': OpenAIChatRuntime(model='gpt-3.5-turbo'), + }, + default_runtime='openai' +) + +# Run on data +df = pd.DataFrame(["John works at Apple in San Francisco"], columns=["text"]) +results = agent.run(df) +``` + +#### Simplified API: + +```python +from adala import LabelStudioProcessor +import pandas as pd + +# Define the Label Studio configuration +label_config = """ + + + + + +""" + +# Create the processor +processor = LabelStudioProcessor( + label_config=label_config, + instructions="Annotate the text with named entities.", + model="gpt-3.5-turbo" +) + +# Run on data +df = pd.DataFrame(["John works at Apple in San Francisco"], columns=["text"]) +results = processor.process(df) +``` + +## Async Processing + +The simplified API supports asynchronous processing out of the box: + +```python +import asyncio +from adala.core import Classifier +import pandas as pd + +classifier = Classifier( + instructions="Classify the sentiment of the text.", + labels=["Positive", "Negative", "Neutral"], + model="gpt-3.5-turbo" +) + +async def process_data(): + df = pd.DataFrame(["I love this product", "This is terrible"], columns=["text"]) + results = await classifier.aprocess(df) + return results + +# Run async function +results = asyncio.run(process_data()) +``` + +## Benefits of Migration + +1. **Less Boilerplate**: Write less code to accomplish the same tasks +2. **Better Performance**: Direct batch processing without unnecessary wrapper code +3. **Simpler Mental Model**: Work directly with your data instead of through multiple abstraction layers +4. **Async Support**: First-class support for asynchronous processing +5. **Maintainability**: Less code means fewer bugs and easier maintenance + +## Common Migration Patterns + +1. **Replace Agent with DataProcessor or Classifier**: Choose the appropriate processor for your task +2. **Eliminate Environment**: Work directly with your data in pandas or DataTable format +3. **Convert Skills to Prompt Templates**: Move your skill logic into prompt templates and response models +4. **Replace Runtime Configuration**: Use the simplified BatchLLMRuntime with concurrency settings +5. **Use add_context() for Examples**: Instead of StaticEnvironment, add examples via context \ No newline at end of file diff --git a/adala/core/README.md b/adala/core/README.md new file mode 100644 index 00000000..61ce9f63 --- /dev/null +++ b/adala/core/README.md @@ -0,0 +1,136 @@ +# Adala Core - Simplified Data Processing with LLMs + +This module provides a streamlined interface for processing tabular data through LLMs, removing unnecessary abstraction layers while preserving the core functionality. + +## Key Components + +### DataTable + +A direct wrapper around pandas DataFrame that provides a clean interface for working with tabular data: + +```python +from adala.utils.internal_data import DataTable + +# Create from list of dicts +data = DataTable([ + {"text": "Sample text 1", "label": "positive"}, + {"text": "Sample text 2", "label": "negative"} +]) + +# Convert pandas DataFrame to DataTable +import pandas as pd +df = pd.DataFrame({"text": ["Sample 1", "Sample 2"]}) +data = DataTable.from_dataframe(df) + +# Get records as list of dicts +records = data.to_records() +``` + +### BatchLLMRuntime + +A runtime for processing batches of data through LLMs with efficient batching and concurrency: + +```python +from adala.runtimes.batch_llm import BatchLLMRuntime +from pydantic import BaseModel, Field + +class ClassificationOutput(BaseModel): + label: str = Field(..., description="Classification label") + +runtime = BatchLLMRuntime( + model="gpt-3.5-turbo", + max_tokens=1000, + temperature=0.0, + batch_size=10, + concurrency=4 +) + +results = runtime.process_batch( + data=my_data, + prompt_template="Classify the following text as positive or negative: {text}", + response_model=ClassificationOutput +) +``` + +### DataProcessor + +A high-level interface for data processing tasks: + +```python +from adala.core.processor import DataProcessor +from pydantic import BaseModel, Field + +class NamedEntityOutput(BaseModel): + person: str = Field(..., description="Name of person mentioned") + location: str = Field(..., description="Location mentioned") + +processor = DataProcessor( + prompt_template="Extract the person and location from this text: {text}", + response_model=NamedEntityOutput, + model="gpt-4" +) + +# Process a batch +results = processor.process(my_data) + +# Process asynchronously +import asyncio +async def run(): + results = await processor.aprocess(my_data) +``` + +### Classifier + +A specialized processor for classification tasks: + +```python +from adala.core.processor import Classifier + +classifier = Classifier( + instructions="Classify the text as one of the given categories", + labels=["Sports", "Politics", "Technology", "Entertainment"], + input_field="content", + output_field="category" +) + +# Add context to be included in all prompts +classifier.add_context( + examples=["Example 1: Sports", "Example 2: Politics"] +) + +# Process data +results = classifier.process(my_data) +``` + +## Benefits + +- **Direct Data Access**: Work directly with your tabular data +- **Minimal Configuration**: Less boilerplate, more focused on your task +- **Efficient Batch Processing**: Built-in batching and parallelism +- **Structured Output**: Pydantic models ensure properly formatted results +- **Async Support**: Process data asynchronously for greater throughput + +## Example Workflow + +```python +import pandas as pd +from adala.core.processor import Classifier + +# Load your data +df = pd.DataFrame([ + "New iPhone announced with improved camera", + "Government passes new healthcare legislation", + "Lakers win championship after close game" +], columns=["text"]) + +# Create and configure classifier +classifier = Classifier( + instructions="Classify the news headline into the appropriate category.", + labels=["Technology", "Politics", "Sports", "Entertainment"], + model="gpt-3.5-turbo" +) + +# Process the data +results = classifier.process(df) +print(results) +``` \ No newline at end of file diff --git a/adala/core/__init__.py b/adala/core/__init__.py new file mode 100644 index 00000000..3bba96c0 --- /dev/null +++ b/adala/core/__init__.py @@ -0,0 +1,19 @@ +""" +Adala Core - Simplified data processing with LLMs + +This module provides a streamlined interface for batch processing tabular data +through LLMs without unnecessary abstraction layers. +""" + +from adala.core.processor import DataProcessor, Classifier +from adala.core.label_studio import LabelStudioProcessor +from adala.utils.internal_data import DataTable +from adala.runtimes.batch_llm import BatchLLMRuntime + +__all__ = [ + 'DataProcessor', + 'Classifier', + 'LabelStudioProcessor', + 'DataTable', + 'BatchLLMRuntime' +] \ No newline at end of file diff --git a/adala/core/label_studio.py b/adala/core/label_studio.py new file mode 100644 index 00000000..779d8a01 --- /dev/null +++ b/adala/core/label_studio.py @@ -0,0 +1,300 @@ +""" +Label Studio Processor + +A simplified interface for Label Studio integration with Adala. +This module provides functionality to process data through Label Studio's annotation schema +using LLMs for automated labeling. +""" + +import re +import logging +from typing import List, Optional, Dict, Any, Union, Type, Sequence +from dataclasses import dataclass +from functools import cached_property + +import pandas as pd +from pydantic import BaseModel + +from adala.core.processor import DataProcessor +from adala.utils.internal_data import DataTable +from adala.runtimes.batch_llm import BatchLLMRuntime + +# Import Label Studio SDK +from label_studio_sdk.label_interface import LabelInterface +from label_studio_sdk.label_interface.control_tags import ControlTag, ObjectTag +from label_studio_sdk._extensions.label_studio_tools.core.utils.json_schema import ( + json_schema_to_pydantic, +) + +logger = logging.getLogger(__name__) + + +@dataclass +class NERTag: + """Represents a Named Entity Recognition tag with its properties.""" + + start: Optional[int] = None + end: Optional[int] = None + text: Optional[str] = None + labels: List[str] = None + + def __post_init__(self): + self.labels = self.labels or [] + + +def extract_variable_name(input_string: str) -> List[str]: + """Extract variable names specified as $ from a string.""" + pattern = r"\$([a-zA-Z0-9_]+)" + return re.findall(pattern, input_string) + + +def validate_ner_tag_format( + df: DataTable, input_field: str, output_field: str +) -> DataTable: + """ + Validate and fix NER tag format in the output. + + Args: + df: Input DataTable containing both input and output fields + input_field: Name of the input text field + output_field: Name of the output field containing NER tags + + Returns: + DataTable with validated and fixed NER tag format + """ + for _, row in df.iterrows(): + if row.get("_error"): + continue + + input_text = row.get(input_field, "") + output_field_data = row.get(output_field) + + if not isinstance(output_field_data, list): + continue + + for item in output_field_data: + if not isinstance(item, dict): + continue + + start = item.get("start") + end = item.get("end") + + if ( + start is not None + and end is not None + and 0 <= start < end <= len(input_text) + ): + if not item.get("text"): + item["text"] = input_text[start:end] + + return df + + +def extract_missing_indices( + df: DataTable, input_field: str, output_field: str, text_field: str = "text" +) -> DataTable: + """ + Extract missing start and end indices for NER tags. + + Args: + df: Input DataTable containing both input and output fields + input_field: Name of the input text field + output_field: Name of the output field containing NER tags + text_field: Name of the field containing the text to match + + Returns: + DataTable with extracted indices where missing + """ + for _, row in df.iterrows(): + if row.get("_error"): + continue + + input_text = row.get(input_field, "") + output_field_data = row.get(output_field) + + if not isinstance(output_field_data, list): + continue + + for item in output_field_data: + if not isinstance(item, dict): + continue + + if (item.get("start") is None or item.get("end") is None) and item.get( + text_field + ): + text = item.get(text_field) + if isinstance(text, str): + start = input_text.find(text) + if start >= 0: + item["start"] = start + item["end"] = start + len(text) + + return df + + +class LabelStudioProcessor(DataProcessor): + """ + A processor for Label Studio annotations that uses LLMs for automated labeling. + + This processor takes a Label Studio XML configuration and generates appropriate + prompts and response models for annotation tasks. + """ + + def __init__( + self, + label_config: str, + allowed_control_tags: Optional[Sequence[str]] = None, + allowed_object_tags: Optional[Sequence[str]] = None, + instructions: Optional[str] = None, + runtime: Optional[BatchLLMRuntime] = None, + **runtime_kwargs, + ): + """ + Initialize a Label Studio processor. + + Args: + label_config: Label Studio XML configuration + allowed_control_tags: Optional sequence of control tag names to include + allowed_object_tags: Optional sequence of object tag names to include + instructions: Custom instructions for the LLM + runtime: BatchLLMRuntime instance (created automatically if not provided) + runtime_kwargs: Additional arguments to pass to BatchLLMRuntime if created + """ + self.label_config = label_config + self.allowed_control_tags = allowed_control_tags + self.allowed_object_tags = allowed_object_tags + + # Create and configure label interface + self._label_interface = self._create_filtered_interface() + + # Get schema from interface + self.field_schema = self._label_interface.to_json_schema() + + # Initialize the processor + super().__init__( + prompt_template=instructions + or "Annotate the input data according to the provided schema.", + response_model=BaseModel, # Placeholder, will be set dynamically + runtime=runtime, + **runtime_kwargs, + ) + + def _create_filtered_interface(self) -> LabelInterface: + """Create a filtered LabelInterface based on allowed tags.""" + if not (self.allowed_control_tags or self.allowed_object_tags): + return LabelInterface(self.label_config) + + control_tags = { + tag: self._label_interface._controls[tag] + for tag in (self.allowed_control_tags or self._label_interface._controls) + } + + object_tags = { + tag: self._label_interface._objects[tag] + for tag in (self.allowed_object_tags or self._label_interface._objects) + } + + interface = LabelInterface.create_instance(tags={**control_tags, **object_tags}) + logger.debug( + f"Filtered labeling config based on allowed tags: {interface.config}" + ) + return interface + + @property + def ner_tags(self) -> List[ControlTag]: + """Get NER tags from the label config.""" + control_tag_names = self.allowed_control_tags or list( + self._label_interface._controls.keys() + ) + return [ + tag + for tag_name in control_tag_names + if (tag := self._label_interface.get_control(tag_name)).tag.lower() + in {"labels", "hypertextlabels"} + and ( + not self.allowed_object_tags + or all( + object_tag.tag in self.allowed_object_tags + for object_tag in tag.objects + ) + ) + ] + + @property + def image_tags(self) -> List[ObjectTag]: + """Get image tags from the label config.""" + object_tag_names = self.allowed_object_tags or list( + self._label_interface._objects.keys() + ) + return [ + tag + for tag_name in object_tag_names + if (tag := self._label_interface.get_object(tag_name)).tag.lower() + == "image" + ] + + def _process_ner_tags(self, df: DataTable, result: DataTable) -> DataTable: + """Process NER tags in the result DataTable.""" + for ner_tag in self.ner_tags: + if not ner_tag.objects: + continue + + input_field = ner_tag.objects[0].value.lstrip("$") + output_field = ner_tag.name + + # Join input and output data + combined_df = pd.concat([df, result], axis=1) + + # Validate and fix NER output format + result = validate_ner_tag_format(combined_df, input_field, output_field) + + # Extract indices if missing + result = extract_missing_indices(result, input_field, output_field) + + return result + + def process( + self, + data: Union[pd.DataFrame, List[Dict], DataTable], + extra_context: Optional[Dict[str, Any]] = None, + ) -> DataTable: + """Process a batch of data through the Label Studio processor.""" + # Convert input to DataTable + df = ( + DataTable(data) + if isinstance(data, list) + else ( + DataTable.from_dataframe(data) + if isinstance(data, pd.DataFrame) and not isinstance(data, DataTable) + else data + ) + ) + + # Get dynamic response model from schema + with json_schema_to_pydantic(self.field_schema) as ResponseModel: + self.response_model = ResponseModel + result = super().process(data, extra_context) + return self._process_ner_tags(df, result) + + async def aprocess( + self, + data: Union[pd.DataFrame, List[Dict], DataTable], + extra_context: Optional[Dict[str, Any]] = None, + ) -> DataTable: + """Process a batch of data through the Label Studio processor asynchronously.""" + # Convert input to DataTable + df = ( + DataTable(data) + if isinstance(data, list) + else ( + DataTable.from_dataframe(data) + if isinstance(data, pd.DataFrame) and not isinstance(data, DataTable) + else data + ) + ) + + # Get dynamic response model from schema + with json_schema_to_pydantic(self.field_schema) as ResponseModel: + self.response_model = ResponseModel + result = await super().aprocess(data, extra_context) + return self._process_ner_tags(df, result) diff --git a/adala/core/processor.py b/adala/core/processor.py new file mode 100644 index 00000000..7be8a954 --- /dev/null +++ b/adala/core/processor.py @@ -0,0 +1,183 @@ +import logging +from typing import Any, Dict, List, Optional, Type, Union, Callable +import pandas as pd +from pydantic import BaseModel, Field + +from adala.utils.internal_data import DataTable +from adala.runtimes.batch_llm import BatchLLMRuntime + +logger = logging.getLogger(__name__) + + +class DataProcessor: + """ + A simplified processor for data labeling and transformation tasks. + + This class provides a direct interface for processing data through LLMs in batch mode, + without the complex wrapping of agents, skills, and environments in the original codebase. + """ + + def __init__( + self, + prompt_template: str, + response_model: Type[BaseModel], + runtime: Optional[BatchLLMRuntime] = None, + **runtime_kwargs + ): + """ + Initialize the data processor. + + Args: + prompt_template: Template for generating prompts to send to the LLM + response_model: Pydantic model for parsing structured LLM outputs + runtime: BatchLLMRuntime instance (created automatically if not provided) + runtime_kwargs: Additional arguments to pass to BatchLLMRuntime if created + """ + self.prompt_template = prompt_template + self.response_model = response_model + + # Create runtime if not provided + if runtime is None: + self.runtime = BatchLLMRuntime(**runtime_kwargs) + else: + self.runtime = runtime + + # Extra fields to include in prompt rendering + self.extra_fields = {} + + def add_context(self, **kwargs): + """ + Add context fields that will be included when rendering the prompt template. + + Args: + **kwargs: Key-value pairs of context variables + """ + self.extra_fields.update(kwargs) + return self + + def process( + self, + data: Union[pd.DataFrame, List[Dict], DataTable], + extra_context: Optional[Dict[str, Any]] = None + ) -> DataTable: + """ + Process a batch of data through the LLM. + + Args: + data: Input data to process (DataFrame, DataTable, or list of dicts) + extra_context: Additional context fields to include for this batch only + + Returns: + DataTable with inputs and results + """ + # Combine base extra_fields with batch-specific extra_context + context = {**self.extra_fields} + if extra_context: + context.update(extra_context) + + # Process the batch + return self.runtime.process_batch( + data=data, + prompt_template=self.prompt_template, + response_model=self.response_model, + extra_fields=context + ) + + async def aprocess( + self, + data: Union[pd.DataFrame, List[Dict], DataTable], + extra_context: Optional[Dict[str, Any]] = None + ) -> DataTable: + """ + Process a batch of data through the LLM asynchronously. + + Args: + data: Input data to process (DataFrame, DataTable, or list of dicts) + extra_context: Additional context fields to include for this batch only + + Returns: + DataTable with inputs and results + """ + # Combine base extra_fields with batch-specific extra_context + context = {**self.extra_fields} + if extra_context: + context.update(extra_context) + + # Process the batch asynchronously + return await self.runtime.aprocess_batch( + data=data, + prompt_template=self.prompt_template, + response_model=self.response_model, + extra_fields=context + ) + + +class Classifier(DataProcessor): + """ + A specialized data processor for classification tasks. + """ + + def __init__( + self, + instructions: str, + labels: List[str], + input_field: str = "text", + output_field: str = "label", + description: Optional[str] = None, + **kwargs + ): + """ + Initialize a classifier. + + Args: + instructions: Instructions for the classification task + labels: List of valid labels for classification + input_field: Name of the input field containing the text to classify + output_field: Name of the output field to store the classification result + description: Optional description of the classifier + **kwargs: Additional arguments to pass to DataProcessor + """ + # Create the classification schema + class ClassificationResult(BaseModel): + model_config = {"json_schema_extra": {"title": description or "Classification Result"}} + + # Dynamic field for classification output + _label_field: str = Field(alias=output_field, description=f"Classification label, one of: {', '.join(labels)}") + + # Validation to ensure the label is one of the allowed values + @property + def label(self) -> str: + return getattr(self, output_field) + + def model_post_init(self, __context): + label = getattr(self, output_field) + if label not in labels: + valid_labels = "', '".join(labels) + raise ValueError(f"Invalid label: '{label}'. Valid labels are: '{valid_labels}'") + + # Create the attribute dynamically + setattr(ClassificationResult, output_field, Field(..., description=f"Classification label, one of: {', '.join(labels)}")) + + # Create the prompt template + prompt_template = f""" +{instructions} + +Valid labels: {', '.join(labels)} + +Text: {{{input_field}}} + +Please classify the text and respond with only one of the valid labels. +""" + + # Initialize the data processor + super().__init__( + prompt_template=prompt_template, + response_model=ClassificationResult, + **kwargs + ) + + # Store additional properties + self.labels = labels + self.input_field = input_field + self.output_field = output_field + self.description = description \ No newline at end of file diff --git a/adala/environments/__init__.py b/adala/environments/__init__.py index 960f33ec..df49d066 100644 --- a/adala/environments/__init__.py +++ b/adala/environments/__init__.py @@ -1,6 +1,2 @@ -from .base import Environment, AsyncEnvironment, EnvironmentFeedback -from .static_env import StaticEnvironment -from .console import ConsoleEnvironment -from .web import WebStaticEnvironment -from .code_env import SimpleCodeValidationEnvironment -from .kafka import AsyncKafkaEnvironment +"""Legacy module, use adala.core instead.""" +from adala.legacy.environments import * \ No newline at end of file diff --git a/adala/legacy/__init__.py b/adala/legacy/__init__.py new file mode 100644 index 00000000..6d7d5697 --- /dev/null +++ b/adala/legacy/__init__.py @@ -0,0 +1,5 @@ +"""Legacy components of the Adala framework. + +These components are maintained for backward compatibility. +New code should use the adala.core module instead. +""" diff --git a/adala/legacy/agents/__init__.py b/adala/legacy/agents/__init__.py new file mode 100644 index 00000000..5ad4e77c --- /dev/null +++ b/adala/legacy/agents/__init__.py @@ -0,0 +1,10 @@ +import warnings + +warnings.warn( + "The adala.agents module is deprecated and will be removed in a future version. " + "Use adala.core.DataProcessor instead.", + DeprecationWarning, + stacklevel=2 +) + +from .base import Agent, create_agent_from_file, create_agent_from_dict diff --git a/adala/agents/base.py b/adala/legacy/agents/base.py similarity index 100% rename from adala/agents/base.py rename to adala/legacy/agents/base.py diff --git a/adala/legacy/environments/__init__.py b/adala/legacy/environments/__init__.py new file mode 100644 index 00000000..e5ac5251 --- /dev/null +++ b/adala/legacy/environments/__init__.py @@ -0,0 +1,15 @@ +import warnings + +warnings.warn( + "The adala.environments module is deprecated and will be removed in a future version. " + "Use pandas.DataFrame or adala.core.DataTable directly instead.", + DeprecationWarning, + stacklevel=2 +) + +from .base import Environment, AsyncEnvironment, EnvironmentFeedback +from .static_env import StaticEnvironment +from .console import ConsoleEnvironment +from .web import WebStaticEnvironment +from .code_env import SimpleCodeValidationEnvironment +from .kafka import AsyncKafkaEnvironment diff --git a/adala/environments/base.py b/adala/legacy/environments/base.py similarity index 100% rename from adala/environments/base.py rename to adala/legacy/environments/base.py diff --git a/adala/environments/code_env.py b/adala/legacy/environments/code_env.py similarity index 100% rename from adala/environments/code_env.py rename to adala/legacy/environments/code_env.py diff --git a/adala/environments/console.py b/adala/legacy/environments/console.py similarity index 100% rename from adala/environments/console.py rename to adala/legacy/environments/console.py diff --git a/adala/environments/kafka.py b/adala/legacy/environments/kafka.py similarity index 100% rename from adala/environments/kafka.py rename to adala/legacy/environments/kafka.py diff --git a/adala/environments/servers/__init__.py b/adala/legacy/environments/servers/__init__.py similarity index 100% rename from adala/environments/servers/__init__.py rename to adala/legacy/environments/servers/__init__.py diff --git a/adala/environments/servers/base.py b/adala/legacy/environments/servers/base.py similarity index 100% rename from adala/environments/servers/base.py rename to adala/legacy/environments/servers/base.py diff --git a/adala/environments/servers/discord_bot.py b/adala/legacy/environments/servers/discord_bot.py similarity index 100% rename from adala/environments/servers/discord_bot.py rename to adala/legacy/environments/servers/discord_bot.py diff --git a/adala/environments/static_env.py b/adala/legacy/environments/static_env.py similarity index 100% rename from adala/environments/static_env.py rename to adala/legacy/environments/static_env.py diff --git a/adala/environments/web.py b/adala/legacy/environments/web.py similarity index 100% rename from adala/environments/web.py rename to adala/legacy/environments/web.py diff --git a/adala/legacy/memories/__init__.py b/adala/legacy/memories/__init__.py new file mode 100644 index 00000000..d41be8af --- /dev/null +++ b/adala/legacy/memories/__init__.py @@ -0,0 +1,11 @@ +import warnings + +warnings.warn( + "The adala.memories module is deprecated and will be removed in a future version.", + DeprecationWarning, + stacklevel=2 +) + +from .file_memory import FileMemory +from .vectordb import VectorDBMemory +from .base import Memory diff --git a/adala/memories/base.py b/adala/legacy/memories/base.py similarity index 100% rename from adala/memories/base.py rename to adala/legacy/memories/base.py diff --git a/adala/memories/file_memory.py b/adala/legacy/memories/file_memory.py similarity index 100% rename from adala/memories/file_memory.py rename to adala/legacy/memories/file_memory.py diff --git a/adala/memories/vectordb.py b/adala/legacy/memories/vectordb.py similarity index 100% rename from adala/memories/vectordb.py rename to adala/legacy/memories/vectordb.py diff --git a/adala/legacy/skills/__init__.py b/adala/legacy/skills/__init__.py new file mode 100644 index 00000000..964b9ea4 --- /dev/null +++ b/adala/legacy/skills/__init__.py @@ -0,0 +1,16 @@ +import warnings + +warnings.warn( + "The adala.skills module is deprecated and will be removed in a future version. " + "Use adala.core.DataProcessor or adala.core.Classifier instead.", + DeprecationWarning, + stacklevel=2 +) + +from .skillset import SkillSet, LinearSkillSet, ParallelSkillSet +from .collection.classification import ClassificationSkill +from .collection.entity_extraction import EntityExtraction +from .collection.rag import RAGSkill +from .collection.ontology_creation import OntologyCreator, OntologyMerger +from .collection.label_studio import LabelStudioSkill +from ._base import Skill, TransformSkill, AnalysisSkill, SynthesisSkill diff --git a/adala/skills/_base.py b/adala/legacy/skills/_base.py similarity index 100% rename from adala/skills/_base.py rename to adala/legacy/skills/_base.py diff --git a/adala/skills/collection/__init__.py b/adala/legacy/skills/collection/__init__.py similarity index 100% rename from adala/skills/collection/__init__.py rename to adala/legacy/skills/collection/__init__.py diff --git a/adala/skills/collection/classification.py b/adala/legacy/skills/collection/classification.py similarity index 100% rename from adala/skills/collection/classification.py rename to adala/legacy/skills/collection/classification.py diff --git a/adala/skills/collection/entity_extraction.py b/adala/legacy/skills/collection/entity_extraction.py similarity index 100% rename from adala/skills/collection/entity_extraction.py rename to adala/legacy/skills/collection/entity_extraction.py diff --git a/adala/skills/collection/label_studio.py b/adala/legacy/skills/collection/label_studio.py similarity index 100% rename from adala/skills/collection/label_studio.py rename to adala/legacy/skills/collection/label_studio.py diff --git a/adala/skills/collection/ontology_creation.py b/adala/legacy/skills/collection/ontology_creation.py similarity index 100% rename from adala/skills/collection/ontology_creation.py rename to adala/legacy/skills/collection/ontology_creation.py diff --git a/adala/skills/collection/prompt_improvement.py b/adala/legacy/skills/collection/prompt_improvement.py similarity index 100% rename from adala/skills/collection/prompt_improvement.py rename to adala/legacy/skills/collection/prompt_improvement.py diff --git a/adala/skills/collection/qa.py b/adala/legacy/skills/collection/qa.py similarity index 100% rename from adala/skills/collection/qa.py rename to adala/legacy/skills/collection/qa.py diff --git a/adala/skills/collection/rag.py b/adala/legacy/skills/collection/rag.py similarity index 100% rename from adala/skills/collection/rag.py rename to adala/legacy/skills/collection/rag.py diff --git a/adala/skills/collection/summarization.py b/adala/legacy/skills/collection/summarization.py similarity index 100% rename from adala/skills/collection/summarization.py rename to adala/legacy/skills/collection/summarization.py diff --git a/adala/skills/collection/text_generation.py b/adala/legacy/skills/collection/text_generation.py similarity index 100% rename from adala/skills/collection/text_generation.py rename to adala/legacy/skills/collection/text_generation.py diff --git a/adala/skills/collection/translation.py b/adala/legacy/skills/collection/translation.py similarity index 100% rename from adala/skills/collection/translation.py rename to adala/legacy/skills/collection/translation.py diff --git a/adala/skills/skillset.py b/adala/legacy/skills/skillset.py similarity index 100% rename from adala/skills/skillset.py rename to adala/legacy/skills/skillset.py diff --git a/adala/memories/__init__.py b/adala/memories/__init__.py index 4e8c8242..3302fd39 100644 --- a/adala/memories/__init__.py +++ b/adala/memories/__init__.py @@ -1,3 +1,2 @@ -from .file_memory import FileMemory -from .vectordb import VectorDBMemory -from .base import Memory +"""Legacy module, use adala.core instead.""" +from adala.legacy.memories import * \ No newline at end of file diff --git a/adala/runtimes/batch_llm.py b/adala/runtimes/batch_llm.py new file mode 100644 index 00000000..3f232ecb --- /dev/null +++ b/adala/runtimes/batch_llm.py @@ -0,0 +1,323 @@ +import asyncio +import logging +from typing import Any, Dict, List, Optional, Type, Union + +import pandas as pd +from pydantic import BaseModel, Field +from tqdm import tqdm + +from adala.utils.internal_data import DataTable, InternalDataFrame +from adala.utils.parse import partial_str_format + +# Configure litellm for batch processing +import litellm +from litellm.exceptions import AuthenticationError + +logger = logging.getLogger(__name__) + + +class BatchLLMRuntime: + """A simplified runtime for batch processing with LLMs.""" + + def __init__( + self, + model: str = "gpt-4o-mini", + max_tokens: int = 1000, + temperature: float = 0.0, + batch_size: int = 10, + concurrency: int = 4, + api_key: Optional[str] = None, + base_url: Optional[str] = None, + verbose: bool = False, + **model_kwargs + ): + """ + Initialize the batch LLM runtime. + + Args: + model: The LLM model to use + max_tokens: Maximum tokens to generate + temperature: Temperature for sampling + batch_size: Number of items to process in a batch + concurrency: Number of concurrent requests + api_key: API key for model provider + base_url: Base URL for API endpoint + verbose: Whether to print verbose logs + model_kwargs: Additional model parameters + """ + self.model = model + self.max_tokens = max_tokens + self.temperature = temperature + self.batch_size = batch_size + self.concurrency = concurrency + self.api_key = api_key + self.base_url = base_url + self.verbose = verbose + self.model_kwargs = model_kwargs + + # Validate model availability + self._check_model() + + def _check_model(self): + """Verify that the model is accessible with current credentials.""" + try: + litellm.completion( + model=self.model, + messages=[{"role": "user", "content": "Test"}], + max_tokens=10 + ) + except AuthenticationError: + raise ValueError(f"Model '{self.model}' is not available with your API key and settings.") + except Exception as e: + logger.warning(f"Failed to check model availability: {e}") + + def process_batch( + self, + data: Union[pd.DataFrame, List[Dict], DataTable], + prompt_template: str, + response_model: Type[BaseModel], + extra_fields: Optional[Dict[str, Any]] = None + ) -> DataTable: + """ + Process a batch of data through the LLM. + + Args: + data: Input data to process (DataFrame, DataTable, or list of dicts) + prompt_template: Template for generating prompts + response_model: Pydantic model for structured output parsing + extra_fields: Additional fields to include in prompt rendering + + Returns: + DataTable with results + """ + # Convert input to DataTable if necessary + if isinstance(data, list): + df = DataTable(data) + elif isinstance(data, pd.DataFrame) and not isinstance(data, DataTable): + df = DataTable.from_dataframe(data) + else: + df = data + + if df.empty: + return DataTable() + + # Get batch records + records = df.to_records() + extra_fields = extra_fields or {} + + # Process synchronously in smaller batches + results = [] + for i in range(0, len(records), self.batch_size): + batch = records[i:i + self.batch_size] + batch_results = self._process_records( + batch, + prompt_template=prompt_template, + response_model=response_model, + extra_fields=extra_fields + ) + results.extend(batch_results) + + # Create result dataframe and preserve index + result_df = DataTable(results) + if not result_df.empty and len(result_df) == len(df): + result_df.index = df.index + + return result_df + + def _process_records( + self, + records: List[Dict], + prompt_template: str, + response_model: Type[BaseModel], + extra_fields: Dict[str, Any] + ) -> List[Dict]: + """Process a batch of records.""" + import instructor + from instructor import Mode + + # Initialize the instructor client + client = instructor.from_litellm(litellm.completion, mode=Mode.TOOLS) + + # Format prompts for each record + formatted_prompts = [] + for record in records: + # Combine record with extra fields + context = {**record, **extra_fields} + # Format the prompt template with record data + prompt = partial_str_format(prompt_template, **context) + formatted_prompts.append(prompt) + + if self.verbose: + for i, prompt in enumerate(formatted_prompts): + logger.info(f"Prompt {i}:\n{prompt}") + + results = [] + + # Process each prompt + for i, prompt in enumerate(tqdm(formatted_prompts, desc="Processing records", disable=not self.verbose)): + try: + # Call LLM using instructor for structured output + response = client.chat.completions.create( + model=self.model, + messages=[{"role": "user", "content": prompt}], + response_model=response_model, + max_tokens=self.max_tokens, + temperature=self.temperature, + **self.model_kwargs + ) + + # Convert response to dict and merge with original record + response_dict = response.model_dump() + result = {**records[i], **response_dict} + results.append(result) + + except Exception as e: + logger.error(f"Error processing record {i}: {e}") + # Include error information in result + result = { + **records[i], + "_error": str(e) + } + results.append(result) + + return results + + async def aprocess_batch( + self, + data: Union[pd.DataFrame, List[Dict], DataTable], + prompt_template: str, + response_model: Type[BaseModel], + extra_fields: Optional[Dict[str, Any]] = None + ) -> DataTable: + """ + Process a batch of data through the LLM asynchronously. + + Args: + data: Input data to process (DataFrame, DataTable, or list of dicts) + prompt_template: Template for generating prompts + response_model: Pydantic model for structured output parsing + extra_fields: Additional fields to include in prompt rendering + + Returns: + DataTable with results + """ + # Convert input to DataTable if necessary + if isinstance(data, list): + df = DataTable(data) + elif isinstance(data, pd.DataFrame) and not isinstance(data, DataTable): + df = DataTable.from_dataframe(data) + else: + df = data + + if df.empty: + return DataTable() + + # Get batch records + records = df.to_records() + extra_fields = extra_fields or {} + + # Process asynchronously + results = await self._aprocess_records( + records, + prompt_template=prompt_template, + response_model=response_model, + extra_fields=extra_fields + ) + + # Create result dataframe and preserve index + result_df = DataTable(results) + if not result_df.empty and len(result_df) == len(df): + result_df.index = df.index + + return result_df + + async def _aprocess_records( + self, + records: List[Dict], + prompt_template: str, + response_model: Type[BaseModel], + extra_fields: Dict[str, Any] + ) -> List[Dict]: + """Process records asynchronously.""" + import instructor + from instructor import Mode + + # Import AsyncOpenAI for async operations + from openai import AsyncOpenAI + + # Initialize the instructor client + async_client = instructor.from_openai( + AsyncOpenAI(api_key=self.api_key, base_url=self.base_url), + mode=Mode.TOOLS + ) + + # Format prompts for each record + tasks = [] + for i, record in enumerate(records): + # Combine record with extra fields + context = {**record, **extra_fields} + # Format the prompt template with record data + prompt = partial_str_format(prompt_template, **context) + + if self.verbose and i < 3: # Only log first few prompts + logger.info(f"Prompt {i}:\n{prompt}") + + # Create task + task = self._process_single_record_async( + record=record, + prompt=prompt, + response_model=response_model, + client=async_client, + index=i + ) + tasks.append(task) + + # Process in batches with concurrency limit + results = [] + for i in range(0, len(tasks), self.concurrency): + batch = tasks[i:i + self.concurrency] + batch_results = await asyncio.gather(*batch) + results.extend(batch_results) + + # Sort results by original index + results.sort(key=lambda x: x.get('_original_index', 0)) + # Remove temporary index field + for result in results: + if '_original_index' in result: + del result['_original_index'] + + return results + + async def _process_single_record_async( + self, + record: Dict, + prompt: str, + response_model: Type[BaseModel], + client: Any, + index: int + ) -> Dict: + """Process a single record asynchronously.""" + try: + # Call LLM using instructor for structured output + response = await client.chat.completions.create( + model=self.model, + messages=[{"role": "user", "content": prompt}], + response_model=response_model, + max_tokens=self.max_tokens, + temperature=self.temperature, + **self.model_kwargs + ) + + # Convert response to dict and merge with original record + response_dict = response.model_dump() + result = {**record, **response_dict, '_original_index': index} + return result + + except Exception as e: + logger.error(f"Error processing record {index}: {e}") + # Include error information in result + return { + **record, + "_error": str(e), + '_original_index': index + } \ No newline at end of file diff --git a/adala/skills/__init__.py b/adala/skills/__init__.py index 6cd7c3fa..be784972 100644 --- a/adala/skills/__init__.py +++ b/adala/skills/__init__.py @@ -1,7 +1,2 @@ -from .skillset import SkillSet, LinearSkillSet, ParallelSkillSet -from .collection.classification import ClassificationSkill -from .collection.entity_extraction import EntityExtraction -from .collection.rag import RAGSkill -from .collection.ontology_creation import OntologyCreator, OntologyMerger -from .collection.label_studio import LabelStudioSkill -from ._base import Skill, TransformSkill, AnalysisSkill, SynthesisSkill +"""Legacy module, use adala.core instead.""" +from adala.legacy.skills import * \ No newline at end of file diff --git a/adala/utils/internal_data.py b/adala/utils/internal_data.py index 254d5c27..cee4d5c1 100644 --- a/adala/utils/internal_data.py +++ b/adala/utils/internal_data.py @@ -1,16 +1,40 @@ import pandas as pd -from typing import List, Dict, Any, Union, Iterable +from typing import List, Dict, Any, Union, Iterable, Optional, Type -Record = Dict[str, str] +Record = Dict[str, Any] -# Internal data tables representation. Replace this with Dask or Polars in the future. -InternalDataFrame = pd.DataFrame +# Use pandas DataFrame for internal data representation +class DataTable(pd.DataFrame): + """ + A simple wrapper around pandas DataFrame to provide common batch processing methods. + This provides a direct interface for tabular data handling with LLMs. + """ + + @classmethod + def from_records(cls, data: List[Dict]) -> 'DataTable': + """Create a DataTable from a list of dictionaries.""" + return cls(data) + + @classmethod + def from_dataframe(cls, df: pd.DataFrame) -> 'DataTable': + """Create a DataTable from a pandas DataFrame.""" + return cls(df) + + def to_records(self) -> List[Dict]: + """Convert to a list of dictionaries.""" + return self.to_dict(orient="records") + + @classmethod + def concat(cls, dfs: Iterable['DataTable'], **kwargs) -> 'DataTable': + """Concatenate multiple DataTables.""" + return cls(pd.concat(dfs, **kwargs)) + +# For backward compatibility +InternalDataFrame = DataTable InternalSeries = pd.Series - def InternalDataFrame_encoder(df: InternalDataFrame) -> List: - return df.to_dict(orient="records") - + return df.to_records() def InternalDataFrameConcat( dfs: Iterable[InternalDataFrame], **kwargs @@ -24,4 +48,4 @@ def InternalDataFrameConcat( Returns: InternalDataFrame: The concatenated dataframe. """ - return pd.concat(dfs, **kwargs) + return DataTable.concat(dfs, **kwargs) diff --git a/examples/simplified_classification.py b/examples/simplified_classification.py new file mode 100644 index 00000000..eb7d7ecb --- /dev/null +++ b/examples/simplified_classification.py @@ -0,0 +1,107 @@ +""" +Simplified Classification Example + +This example demonstrates the simplified API for classification tasks, +which provides a more direct way to process tabular data through LLMs. +""" + +import pandas as pd +from pydantic import BaseModel, Field +from adala import Classifier, DataTable # Import directly from top-level package + +# Create sample data +train_df = pd.DataFrame([ + ["The mic is great.", "Subjective"], + ["Will order from them again!", "Subjective"], + ["Not loud enough and doesn't turn on like it should.", "Objective"], + ["The phone doesn't seem to accept anything except CBR mp3s.", "Objective"], + ["All three broke within two months of use.", "Objective"] +], columns=["text", "ground_truth"]) + +# Create test data +test_df = pd.DataFrame([ + "Doesn't hold charge.", + "Excellent bluetooth headset", + "I love this thing!", + "VERY DISAPPOINTED." +], columns=['text']) + +# Create a classifier with minimal configuration +classifier = Classifier( + instructions="Classify a product review as either expressing 'Subjective' or 'Objective' statements.", + labels=["Subjective", "Objective"], + input_field="text", + output_field="prediction", + description="Subjectivity classification for product reviews", + model="gpt-3.5-turbo", + temperature=0.0, + verbose=True +) + +# Add context about what subjective vs objective means +classifier.add_context( + definition=""" + Subjective statements express personal feelings, emotions, or preferences. + Objective statements describe factual details about product functionality or performance, + even when based on personal experiences. + """ +) + +# Process the test data +results = classifier.process(test_df) + +# Print the results +print("\nClassification Results:") +print(results[["text", "prediction"]]) + +""" +Example of using a custom response model: +""" + +# Define a custom response model +class SentimentAnalysis(BaseModel): + sentiment: str = Field(..., description="The sentiment of the text (Positive, Negative, or Neutral)") + confidence: float = Field(..., description="Confidence score between 0 and 1") + reasoning: str = Field(..., description="Reasoning behind the sentiment classification") + +# Create a data processor for sentiment analysis +from adala import DataProcessor + +sentiment_analyzer = DataProcessor( + prompt_template=""" + Analyze the sentiment of the following text and determine if it's Positive, Negative, or Neutral. + + Text: {text} + + Provide your sentiment analysis with a confidence score and reasoning. + """, + response_model=SentimentAnalysis, + model="gpt-3.5-turbo", + temperature=0.2 +) + +# Process the sentiment +sentiment_results = sentiment_analyzer.process(test_df[:2]) + +# Print the sentiment results +print("\nSentiment Analysis Results:") +for i, row in sentiment_results.iterrows(): + print(f"\nText: {row['text']}") + print(f"Sentiment: {row['sentiment']} (Confidence: {row['confidence']:.2f})") + print(f"Reasoning: {row['reasoning']}") + +""" +Example of asynchronous processing: +""" + +import asyncio + +async def run_async_example(): + print("\nRunning Async Example...") + async_results = await classifier.aprocess(test_df[2:]) + print("\nAsync Classification Results:") + print(async_results[["text", "prediction"]]) + +if __name__ == "__main__": + # Run the async example + asyncio.run(run_async_example()) \ No newline at end of file diff --git a/examples/simplified_label_studio.py b/examples/simplified_label_studio.py new file mode 100644 index 00000000..3eace259 --- /dev/null +++ b/examples/simplified_label_studio.py @@ -0,0 +1,90 @@ +""" +Simplified Label Studio Example + +This example demonstrates how to use the simplified LabelStudioProcessor +for annotation tasks using Label Studio's schema. +""" + +import pandas as pd +from adala import LabelStudioProcessor, DataTable + +# Define a simple Label Studio configuration for NER +LABEL_CONFIG = """ + + + + + +""" + +# Create sample data +data = pd.DataFrame([ + {"text": "John works at Apple in San Francisco."}, + {"text": "Microsoft has an office in Seattle where Sarah is working."}, + {"text": "The president visited New York last week."} +]) + +# Create the Label Studio processor +processor = LabelStudioProcessor( + label_config=LABEL_CONFIG, + instructions=""" + Annotate the text with named entities. + + - Person: Names of people (e.g., "John", "Sarah") + - Organization: Names of companies and organizations (e.g., "Apple", "Microsoft") + - Location: Names of cities, countries, and other locations (e.g., "San Francisco", "Seattle") + + Please annotate all instances in the text. + """, + model="gpt-3.5-turbo", + temperature=0.0, + verbose=True +) + +# Process the data +print("\nProcessing data with Label Studio schema...") +results = processor.process(data) + +# Print the results in a readable format +print("\nAnnotation Results:") +for i, row in results.iterrows(): + print(f"\nText: {row['text']}") + print("Entities:") + for entity in row.get('ner_tags', []): + if isinstance(entity, dict): + print(f" - {entity.get('text', 'N/A')} [{entity.get('labels', [''])[0]}] " + f"({entity.get('start', 'N/A')}-{entity.get('end', 'N/A')})") + +""" +Example with image input: +""" + +# Define a Label Studio configuration for image classification +IMAGE_LABEL_CONFIG = """ + + + + + + + + + + +""" + +# Create sample image data +image_data = pd.DataFrame([ + {"image_url": "https://example.com/images/cat.jpg"}, + {"image_url": "https://example.com/images/mountain.jpg"}, + {"image_url": "https://example.com/images/person.jpg"} +]) + +print("\n\nImage Classification Example:") +print("This example would work with actual image URLs and a vision model") +print(f"Label Studio Config: {IMAGE_LABEL_CONFIG}") +print(f"Sample Data: {image_data.head()}") +print("To run this example, you would need:\n1. Real image URLs\n2. A vision-capable model like gpt-4-vision") \ No newline at end of file