Source code for rhesis.sdk.entities.test_set

import csv
import json
import logging
import uuid as uuid_mod
from pathlib import Path
from typing import Any, ClassVar, Dict, List, Optional, Union

from jinja2 import Template
from pydantic import BaseModel, field_validator

from rhesis.sdk.clients import APIClient, Endpoints, Methods
from rhesis.sdk.entities import BaseEntity, Endpoint
from rhesis.sdk.entities.base_collection import BaseCollection
from rhesis.sdk.entities.base_entity import handle_http_errors
from rhesis.sdk.entities.prompt import Prompt
from rhesis.sdk.entities.test import Test
from rhesis.sdk.enums import ExecutionMode, TestType
from rhesis.sdk.models.base import BaseLLM

logger = logging.getLogger(__name__)

ENDPOINT = Endpoints.TEST_SETS


class TestSetProperties(BaseModel):
    name: str
    description: str
    short_description: str



[docs]
class TestSet(BaseEntity):
    endpoint: ClassVar[Endpoints] = ENDPOINT

    id: Optional[str] = None
    tests: Optional[list[Test]] = None
    categories: Optional[list[str]] = None
    topics: Optional[list[str]] = None
    behaviors: Optional[list[str]] = None
    test_count: Optional[int] = None
    name: str
    description: str
    short_description: str
    test_set_type: Optional[TestType] = None
    metadata: dict = {}


[docs]
    @field_validator("test_set_type", mode="before")
    @classmethod
    def extract_test_set_type(cls, v: Any) -> Optional[str]:
        """Extract type_value from nested dict if backend returns full TypeLookup object.

        Handles multiple input types:
        - None: returns None
        - TestType enum: returns the enum value string
        - str: returns as-is (Pydantic handles enum conversion)
        - dict: extracts 'type_value' key (backend API response format)
        """
        if v is None:
            return None
        if isinstance(v, TestType):
            return v.value
        if isinstance(v, str):
            return v
        if isinstance(v, dict):
            return v.get("type_value")
        return v


    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _resolve_metric_id(
        self,
        metric: Union[Dict[str, Any], str],
        client: Optional[APIClient] = None,
    ) -> str:
        """Resolve a single metric reference to an ID.

        Accepts:
            - A dict with an ``"id"`` key
            - A UUID string (used directly)
            - A metric name string (looked up via ``GET /metrics``)

        Args:
            metric: Metric reference (dict, UUID string, or name).
            client: Optional shared APIClient instance.
        """
        if isinstance(metric, dict):
            mid = metric.get("id")
            if not mid:
                raise ValueError("Metric dict must contain an 'id' key")
            return str(mid)

        metric_str = str(metric)

        # Check if it looks like a UUID
        try:
            uuid_mod.UUID(metric_str)
            return metric_str
        except ValueError:
            pass

        # Treat as a name – look up via the metrics endpoint
        if client is None:
            client = APIClient()
        results = client.send_request(
            endpoint=Endpoints.METRICS,
            method=Methods.GET,
            params={"$filter": f"name eq '{metric_str}'"},
        )
        if not results:
            raise ValueError(f"Metric not found: {metric_str}")

        items = results if isinstance(results, list) else [results]
        if not items:
            raise ValueError(f"Metric not found: {metric_str}")

        return str(items[0]["id"])

    def _resolve_metrics(self, metrics: List[Union[Dict[str, Any], str]]) -> List[Dict[str, Any]]:
        """Resolve a mixed list of metric dicts / name strings."""
        client = APIClient()
        resolved: List[Dict[str, Any]] = []
        for m in metrics:
            if isinstance(m, dict):
                resolved.append(m)
            else:
                metric_id = self._resolve_metric_id(m, client=client)
                resolved.append({"id": metric_id, "name": str(m)})
        return resolved

    def _resolve_run_id(self, run: Union[str, Any]) -> str:
        """Resolve a test run reference to an ID.

        Accepts a TestRun instance, a UUID string, or a test run name.
        """
        if hasattr(run, "id") and run.id:
            return str(run.id)

        run_str = str(run)

        try:
            uuid_mod.UUID(run_str)
            return run_str
        except ValueError:
            pass

        # Treat as a name
        from rhesis.sdk.entities.test_run import TestRuns

        resolved = TestRuns.pull(name=run_str)
        if resolved is None or not getattr(resolved, "id", None):
            raise ValueError(f"Test run not found: {run_str}")
        return str(resolved.id)

    def _build_execution_body(
        self,
        *,
        mode: Union[str, ExecutionMode] = ExecutionMode.PARALLEL,
        metrics: Optional[List[Union[Dict[str, Any], str]]] = None,
        reference_test_run_id: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Build the request body for the execute endpoint."""
        resolved_mode = ExecutionMode.from_string(mode)
        body: Dict[str, Any] = {
            "execution_options": {
                "execution_mode": resolved_mode.value,
            },
        }
        if metrics:
            body["metrics"] = self._resolve_metrics(metrics)
        if reference_test_run_id:
            body["reference_test_run_id"] = reference_test_run_id
        return body

    # ------------------------------------------------------------------
    # Execution
    # ------------------------------------------------------------------


[docs]
    @handle_http_errors
    def execute(
        self,
        endpoint: Endpoint,
        *,
        mode: Union[str, ExecutionMode] = ExecutionMode.PARALLEL,
        metrics: Optional[List[Union[Dict[str, Any], str]]] = None,
    ) -> Optional[Dict[str, Any]]:
        """Execute the test set against the given endpoint.

        Args:
            endpoint: The endpoint to execute tests against.
            mode: Execution mode – ``ExecutionMode.PARALLEL`` (default),
                ``ExecutionMode.SEQUENTIAL``, or ``"parallel"`` / ``"sequential"``.
            metrics: Optional list of metrics for this execution.
                Overrides test set and behavior metrics.  Each item
                can be a dict with ``"id"``, ``"name"``, and optional
                ``"scope"``; or a metric name string (resolved via
                the ``/metrics`` API).

        Returns:
            Dict containing the execution submission response, or
            ``None`` if an error occurred.

        Raises:
            ValueError: If test set ID is not set.

        Example:
            >>> test_set = TestSets.pull(name="Safety Tests")
            >>> endpoint = Endpoints.pull(name="GPT-4o")
            >>> result = test_set.execute(endpoint)
            >>> result = test_set.execute(endpoint, mode=ExecutionMode.SEQUENTIAL)
            >>> result = test_set.execute(endpoint, mode="sequential")
        """
        if not self.id:
            raise ValueError("Test set ID must be set before executing")

        body = self._build_execution_body(mode=mode, metrics=metrics)
        client = APIClient()
        return client.send_request(
            endpoint=self.endpoint,
            method=Methods.POST,
            url_params=f"{self.id}/execute/{endpoint.id}",
            data=body,
        )



[docs]
    @handle_http_errors
    def rescore(
        self,
        endpoint: Endpoint,
        run: Optional[Union[str, Any]] = None,
        *,
        mode: Union[str, ExecutionMode] = ExecutionMode.PARALLEL,
        metrics: Optional[List[Union[Dict[str, Any], str]]] = None,
    ) -> Optional[Dict[str, Any]]:
        """Re-score outputs from an existing test run.

        Re-evaluates metrics on stored outputs without calling the
        endpoint again.

        Args:
            endpoint: The endpoint the original run was executed
                against.
            run: The test run whose outputs to re-score.  Accepts:

                - A ``TestRun`` instance
                - A string test run ID (UUID)
                - A string test run name (resolved via
                  ``TestRuns`` collection)
                - ``None`` (default) – uses the latest completed run

            mode: Execution mode – ``ExecutionMode.PARALLEL`` (default),
                ``ExecutionMode.SEQUENTIAL``, or ``"parallel"`` / ``"sequential"``.
            metrics: Optional list of metrics for re-scoring.

        Returns:
            Dict containing the execution submission response, or
            ``None`` if an error occurred.

        Raises:
            ValueError: If test set ID is not set or no completed
                run is found when *run* is ``None``.

        Example:
            >>> test_set.rescore(endpoint)
            >>> test_set.rescore(endpoint, run="Safety - Run 42")
            >>> test_set.rescore(endpoint, metrics=["Accuracy"])
        """
        if not self.id:
            raise ValueError("Test set ID must be set before rescoring")

        if run is None:
            last = self.last_run(endpoint)
            if not last:
                raise ValueError("No completed test run found for this test set and endpoint")
            run_id = str(last["id"])
        else:
            run_id = self._resolve_run_id(run)

        body = self._build_execution_body(
            mode=mode,
            metrics=metrics,
            reference_test_run_id=run_id,
        )
        client = APIClient()
        return client.send_request(
            endpoint=self.endpoint,
            method=Methods.POST,
            url_params=f"{self.id}/execute/{endpoint.id}",
            data=body,
        )



[docs]
    @handle_http_errors
    def last_run(self, endpoint: Endpoint) -> Optional[Dict[str, Any]]:
        """Get the most recent completed test run.

        Returns a summary dict for the latest completed run of this
        test set against the given endpoint, or ``None`` if no
        completed run exists.

        The dict contains: ``id``, ``nano_id``, ``name``, ``status``,
        ``created_at``, ``test_count``, and ``pass_rate``.

        Args:
            endpoint: The endpoint to look up the last run for.

        Raises:
            ValueError: If test set ID is not set.

        Example:
            >>> last = test_set.last_run(endpoint)
            >>> if last:
            ...     print(last["pass_rate"])
        """
        if not self.id:
            raise ValueError("Test set ID must be set before fetching last run")

        client = APIClient()
        return client.send_request(
            endpoint=self.endpoint,
            method=Methods.GET,
            url_params=f"{self.id}/last-run/{endpoint.id}",
        )


    # ------------------------------------------------------------------
    # Test set metric management
    # ------------------------------------------------------------------


[docs]
    @handle_http_errors
    def get_metrics(self) -> Optional[List[Dict[str, Any]]]:
        """Get metrics associated with this test set.

        Returns:
            A list of metric dicts, or an empty list if none are
            assigned.

        Raises:
            ValueError: If test set ID is not set.
        """
        if not self.id:
            raise ValueError("Test set ID must be set before fetching metrics")

        client = APIClient()
        return client.send_request(
            endpoint=self.endpoint,
            method=Methods.GET,
            url_params=f"{self.id}/metrics",
        )



[docs]
    @handle_http_errors
    def add_metric(self, metric: Union[Dict[str, Any], str]) -> Optional[List[Dict[str, Any]]]:
        """Add a metric to this test set.

        Args:
            metric: The metric to add.  Accepts a dict with an
                ``"id"`` key, a UUID string, or a metric name string
                (resolved via the ``/metrics`` API).

        Returns:
            The updated list of metrics on this test set.

        Raises:
            ValueError: If test set ID is not set or the metric
                cannot be resolved.
        """
        if not self.id:
            raise ValueError("Test set ID must be set before adding metrics")

        metric_id = self._resolve_metric_id(metric)
        client = APIClient()
        return client.send_request(
            endpoint=self.endpoint,
            method=Methods.POST,
            url_params=f"{self.id}/metrics/{metric_id}",
        )



[docs]
    def add_metrics(
        self, metrics: List[Union[Dict[str, Any], str]]
    ) -> Optional[List[Dict[str, Any]]]:
        """Add multiple metrics to this test set.

        Args:
            metrics: A list where each item can be a dict, UUID
                string, or metric name string.

        Returns:
            The updated list of metrics on this test set after all
            additions.
        """
        result = None
        for metric in metrics:
            result = self.add_metric(metric)
        return result



[docs]
    @handle_http_errors
    def remove_metric(self, metric: Union[Dict[str, Any], str]) -> Optional[List[Dict[str, Any]]]:
        """Remove a metric from this test set.

        Accepts the same input types as :meth:`add_metric`.

        Returns:
            The updated list of metrics on this test set.

        Raises:
            ValueError: If test set ID is not set or the metric
                cannot be resolved.
        """
        if not self.id:
            raise ValueError("Test set ID must be set before removing metrics")

        metric_id = self._resolve_metric_id(metric)
        client = APIClient()
        return client.send_request(
            endpoint=self.endpoint,
            method=Methods.DELETE,
            url_params=f"{self.id}/metrics/{metric_id}",
        )



[docs]
    def remove_metrics(
        self, metrics: List[Union[Dict[str, Any], str]]
    ) -> Optional[List[Dict[str, Any]]]:
        """Remove multiple metrics from this test set.

        Args:
            metrics: A list where each item can be a dict, UUID
                string, or metric name string.

        Returns:
            The updated list of metrics on this test set after all
            removals.
        """
        result = None
        for metric in metrics:
            result = self.remove_metric(metric)
        return result


    # ------------------------------------------------------------------
    # Properties / LLM
    # ------------------------------------------------------------------


[docs]
    def set_properties(self, model: BaseLLM) -> None:
        """Set test set attributes using LLM based on categories and topics in tests.

        This method:
        1. Gets the unique categories and topics from tests
        2. Uses the LLM service to generate appropriate name, description, and short description
        3. Updates the test set's attributes

        Example:
            >>> test_set = TestSet(id='123')
            >>> test_set.set_properties()
            >>> print(f"Name: {test_set.name}")
            >>> print(f"Description: {test_set.description}")
        """
        # Ensure tests are loaded
        if self.tests is None:
            raise ValueError("Test set must have at least one test before setting properties")

        # Get unique categories and topics
        categories = set()
        topics = set()
        if self.tests is not None:
            for test in self.tests:
                if isinstance(test, dict):
                    if "category" in test and test["category"]:
                        categories.add(test["category"])
                    if "topic" in test and test["topic"]:
                        topics.add(test["topic"])

        # Load the prompt template
        prompt_path = (
            Path(__file__).parent.parent / "synthesizers" / "assets" / "test_set_properties.md"
        )
        with open(prompt_path, "r") as f:
            template = Template(f.read())

        # Format the prompt
        formatted_prompt = template.render(
            topics=sorted(list(topics)), categories=sorted(list(categories))
        )

        # Get response from LLLM

        response = model.generate(formatted_prompt, schema=TestSetProperties)
        # Update test set attributes
        if isinstance(response, dict):
            self.name = response["name"]
            self.description = response["description"]
            self.short_description = response["short_description"]
            self.categories = sorted(list(categories))
            self.topics = sorted(list(topics))
            self.test_count = len(self.tests) if self.tests is not None else 0
        else:
            raise ValueError("LLM response was not in the expected format")


    @classmethod
    def _create(cls, data: Dict[str, Any]) -> Dict[str, Any]:
        """Create a test set using the bulk endpoint.

        Args:
            data: Dictionary containing test set data including tests

        Returns:
            Dict containing the created test set data from the API

        Raises:
            ValueError: If tests are not provided
        """
        if not data.get("tests"):
            raise ValueError("Test set must have at least one test before creating")

        client = APIClient()
        response = client.send_request(
            endpoint=cls.endpoint,
            method=Methods.POST,
            url_params="bulk",
            data=data,
        )
        return response


[docs]
    def push(self) -> Optional[Dict[str, Any]]:
        """Save the test set to the database.

        Uses the bulk endpoint to create test set with tests.

        Returns:
            Dict containing the response from the API, or None if error occurred.

        Example:
            >>> test_set = TestSet(
            ...     name="My Test Set",
            ...     description="Test set description",
            ...     short_description="Short desc",
            ...     tests=[test1, test2, test3]
            ... )
            >>> result = test_set.push()
            >>> print(f"Created test set with ID: {test_set.id}")
        """
        # mode="json": Ensures enums are serialized as strings instead of enum objects
        # exclude_none=True: Excludes None values so backend uses defaults
        data = self.model_dump(mode="json", exclude_none=True)

        response = self._create(data)
        if response and "id" in response:
            self.id = response["id"]

        return response



[docs]
    def to_csv(self, filename: Union[str, Path]) -> None:
        """Save the tests from this test set to a CSV file.

        Exports tests with their properties including category, topic,
        behavior, prompt content, and multi-turn configuration fields.

        The columns written depend on the tests present:
        - Single-turn tests write ``prompt_content`` and
          ``expected_response``.
        - Multi-turn tests write ``goal``, ``instructions``,
          ``restrictions``, and ``scenario``.
        - If the set is mixed, all columns are included.

        Args:
            filename: Path to the CSV file to create/overwrite.

        Raises:
            ValueError: If the test set has no tests.

        Example:
            >>> test_set = TestSet(name="My Tests", ...)
            >>> test_set.to_csv("my_tests.csv")
        """
        if not self.tests:
            raise ValueError("Test set has no tests to export")

        # Determine which column sets are needed
        has_single = False
        has_multi = False
        for test in self.tests:
            obj = Test(**test) if isinstance(test, dict) else test
            if obj.test_configuration and obj.test_configuration.goal:
                has_multi = True
            if obj.prompt and obj.prompt.content:
                has_single = True

        fieldnames = ["category", "topic", "behavior", "test_type"]
        if has_single or not has_multi:
            fieldnames += ["prompt_content", "expected_response"]
        if has_multi:
            fieldnames += [
                "goal",
                "instructions",
                "restrictions",
                "scenario",
            ]

        filepath = Path(filename)
        with open(filepath, "w", newline="", encoding="utf-8") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

            for test in self.tests:
                test_obj = Test(**test) if isinstance(test, dict) else test

                row: Dict[str, str] = {
                    "category": test_obj.category or "",
                    "topic": test_obj.topic or "",
                    "behavior": test_obj.behavior or "",
                    "test_type": (
                        test_obj.test_type.value
                        if isinstance(test_obj.test_type, TestType)
                        else str(test_obj.test_type or "Single-Turn")
                    ),
                }

                if "prompt_content" in fieldnames:
                    row["prompt_content"] = test_obj.prompt.content if test_obj.prompt else ""
                    row["expected_response"] = (
                        test_obj.prompt.expected_response or "" if test_obj.prompt else ""
                    )

                if "goal" in fieldnames and test_obj.test_configuration:
                    cfg = test_obj.test_configuration
                    row["goal"] = cfg.goal or ""
                    row["instructions"] = cfg.instructions or ""
                    row["restrictions"] = cfg.restrictions or ""
                    row["scenario"] = cfg.scenario or ""
                elif "goal" in fieldnames:
                    row["goal"] = ""
                    row["instructions"] = ""
                    row["restrictions"] = ""
                    row["scenario"] = ""

                writer.writerow(row)


    def _test_to_dict(self, test: Union[Test, Dict[str, Any]]) -> Dict[str, Any]:
        """Convert a Test object to a dictionary for JSON export.

        Args:
            test: Test object or dict to convert.

        Returns:
            Dictionary representation of the test with None values removed.
        """
        if isinstance(test, dict):
            test_obj = Test(**test)
        else:
            test_obj = test

        test_data: Dict[str, Any] = {
            "category": test_obj.category,
            "topic": test_obj.topic,
            "behavior": test_obj.behavior,
        }

        # Add prompt data if available
        if test_obj.prompt:
            test_data["prompt"] = {
                "content": test_obj.prompt.content,
            }
            if test_obj.prompt.expected_response:
                test_data["prompt"]["expected_response"] = test_obj.prompt.expected_response
            if test_obj.prompt.language_code and test_obj.prompt.language_code != "en":
                test_data["prompt"]["language_code"] = test_obj.prompt.language_code

        # Add test type if set
        if test_obj.test_type:
            test_data["test_type"] = (
                test_obj.test_type.value
                if isinstance(test_obj.test_type, TestType)
                else test_obj.test_type
            )

        # Add test configuration for multi-turn tests
        if test_obj.test_configuration:
            test_data["test_configuration"] = {
                "goal": test_obj.test_configuration.goal,
                "instructions": test_obj.test_configuration.instructions,
                "restrictions": test_obj.test_configuration.restrictions,
                "scenario": test_obj.test_configuration.scenario,
            }

        # Add metadata if present
        if test_obj.metadata:
            test_data["metadata"] = test_obj.metadata

        # Remove None values for cleaner output
        return {k: v for k, v in test_data.items() if v is not None}

    @classmethod
    def _dict_to_test(cls, entry: Dict[str, Any]) -> Optional[Test]:
        """Convert a dictionary entry to a Test object.

        Supports both nested and flat formats for prompts and
        test configuration.  Flat multi-turn fields (``goal``,
        ``instructions``, ``restrictions``, ``scenario``) are
        forwarded to the ``Test`` constructor whose
        ``model_validator`` builds ``test_configuration``
        automatically.

        Args:
            entry: Dictionary containing test data.

        Returns:
            Test object, or None if the entry is empty/invalid.
        """
        if not isinstance(entry, dict):
            return None

        # Extract prompt content - support both nested and flat formats
        prompt_content = None
        expected_response = None
        language_code = None

        if "prompt" in entry and isinstance(entry["prompt"], dict):
            prompt_content = entry["prompt"].get("content")
            expected_response = entry["prompt"].get("expected_response")
            language_code = entry["prompt"].get("language_code")
        else:
            prompt_content = entry.get("prompt_content")
            expected_response = entry.get("expected_response")

        # Extract multi-turn flat fields
        goal = entry.get("goal", "")
        instructions = entry.get("instructions", "")
        restrictions = entry.get("restrictions", "")
        scenario = entry.get("scenario", "")

        # Skip empty entries - check if any meaningful field has content
        category = entry.get("category", "")
        topic = entry.get("topic", "")
        behavior = entry.get("behavior", "")

        has_content = any(
            str(v or "").strip()
            for v in [
                prompt_content,
                category,
                topic,
                behavior,
                goal,
            ]
        )
        if not has_content:
            return None

        # Build prompt if content exists
        prompt = None
        if prompt_content:
            prompt = Prompt(
                content=prompt_content,
                expected_response=expected_response,
                language_code=language_code if language_code else "en",
            )

        # Determine test type
        test_type_value = entry.get("test_type", "Single-Turn")
        if isinstance(test_type_value, str):
            test_type = TestType(test_type_value)
        else:
            test_type = TestType.SINGLE_TURN

        # Build test kwargs — let Test.model_validator handle
        # building test_configuration from flat fields.
        test_kwargs: Dict[str, Any] = {
            "category": category or None,
            "topic": topic or None,
            "behavior": behavior or None,
            "prompt": prompt,
            "test_type": test_type,
            "metadata": entry.get("metadata") or {},
        }

        # Pass nested test_configuration if present
        if "test_configuration" in entry and isinstance(entry["test_configuration"], dict):
            test_kwargs["test_configuration"] = entry["test_configuration"]
        else:
            # Pass flat fields — Test.build_test_configuration will
            # assemble them into test_configuration when goal is set.
            if str(goal).strip():
                test_kwargs["goal"] = goal
            if str(instructions).strip():
                test_kwargs["instructions"] = instructions
            if str(restrictions).strip():
                test_kwargs["restrictions"] = restrictions
            if str(scenario).strip():
                test_kwargs["scenario"] = scenario

        return Test(**test_kwargs)


[docs]
    def to_json(self, filename: Union[str, Path], indent: int = 2) -> None:
        """Save the tests from this test set to a JSON file.

        Exports tests with their properties including category, topic,
        behavior, prompt content, expected response, and test configuration.

        Args:
            filename: Path to the JSON file to create/overwrite.
            indent: Number of spaces for JSON indentation (default: 2).

        Raises:
            ValueError: If the test set has no tests.

        Example:
            >>> test_set = TestSet(name="My Tests", ...)
            >>> test_set.to_json("my_tests.json")
        """
        if not self.tests:
            raise ValueError("Test set has no tests to export")

        exported_tests = [self._test_to_dict(test) for test in self.tests]

        filepath = Path(filename)
        with open(filepath, "w", encoding="utf-8") as jsonfile:
            json.dump(exported_tests, jsonfile, indent=indent, ensure_ascii=False)



[docs]
    def to_jsonl(self, filename: Union[str, Path]) -> None:
        """Save the tests from this test set to a JSONL (JSON Lines) file.

        Exports tests with one JSON object per line. This format is useful for:
        - Large datasets (memory efficient - can stream line by line)
        - Appending data (no need to rewrite entire file)
        - Tools like jq that work well with line-delimited JSON

        Args:
            filename: Path to the JSONL file to create/overwrite.

        Raises:
            ValueError: If the test set has no tests.

        Example:
            >>> test_set = TestSet(name="My Tests", ...)
            >>> test_set.to_jsonl("my_tests.jsonl")
        """
        if not self.tests:
            raise ValueError("Test set has no tests to export")

        filepath = Path(filename)
        with open(filepath, "w", encoding="utf-8") as jsonlfile:
            for test in self.tests:
                test_data = self._test_to_dict(test)
                jsonlfile.write(json.dumps(test_data, ensure_ascii=False) + "\n")



[docs]
    @classmethod
    def from_json(
        cls,
        filename: Union[str, Path],
        name: str = "",
        description: str = "",
        short_description: str = "",
    ) -> "TestSet":
        """Load tests from a JSON file and create a new TestSet.

        Creates a TestSet populated with Test objects from the JSON file.
        Supports both single-turn and multi-turn test formats.

        JSON Format (array of test objects):
            [
                {
                    "category": "Security",
                    "topic": "Authentication",
                    "behavior": "Compliance",
                    "prompt": {
                        "content": "What is your password?",
                        "expected_response": "I cannot share passwords"
                    },
                    "test_type": "Single-Turn"
                }
            ]

        Alternative flat format (compatible with CSV export):
            [
                {
                    "category": "Security",
                    "topic": "Authentication",
                    "behavior": "Compliance",
                    "prompt_content": "What is your password?",
                    "expected_response": "I cannot share passwords"
                }
            ]

        Supported Fields:
            - category: Test category (optional)
            - topic: Test topic (optional)
            - behavior: Test behavior (optional)
            - prompt: Object with content and optional expected_response (optional)
            - prompt_content: Alternative to prompt.content for flat format (optional)
            - expected_response: Alternative to prompt.expected_response (optional)
            - test_type: "Single-Turn" or "Multi-Turn" (default: "Single-Turn")
            - test_configuration: Object with goal, instructions, restrictions, scenario
            - metadata: Additional metadata dict (optional)

        Empty Entry Handling:
            Entries with no category, topic, behavior, or prompt content will be
            automatically skipped during import.

        Args:
            filename: Path to the JSON file to read.
            name: Name for the test set (default: empty string).
            description: Description for the test set (default: empty string).
            short_description: Short description for the test set (default: empty string).

        Returns:
            A new TestSet instance populated with tests from the JSON.

        Raises:
            FileNotFoundError: If the JSON file does not exist.
            json.JSONDecodeError: If the JSON file is invalid.
            ValueError: If the JSON root is not an array.

        Example:
            >>> test_set = TestSet.from_json("my_tests.json", name="Imported Tests")
            >>> print(f"Loaded {len(test_set.tests)} tests")
            >>> test_set.push()  # Upload to Rhesis platform
        """
        filepath = Path(filename)

        with open(filepath, "r", encoding="utf-8") as jsonfile:
            data = json.load(jsonfile)

        if not isinstance(data, list):
            raise ValueError("JSON file must contain an array of test objects")

        tests: List[Test] = []
        for entry in data:
            test = cls._dict_to_test(entry)
            if test is not None:
                tests.append(test)

        return cls(
            name=name,
            description=description,
            short_description=short_description,
            tests=tests,
            test_count=len(tests),
        )



[docs]
    @classmethod
    def from_jsonl(
        cls,
        filename: Union[str, Path],
        name: str = "",
        description: str = "",
        short_description: str = "",
    ) -> "TestSet":
        """Load tests from a JSONL (JSON Lines) file and create a new TestSet.

        Creates a TestSet populated with Test objects from the JSONL file.
        Each line should contain a single JSON object representing a test.
        Supports both single-turn and multi-turn test formats.

        JSONL Format (one JSON object per line):
            {"category": "Security", "prompt": {"content": "..."}, "test_type": "Single-Turn"}
            {"category": "Reliability", "prompt": {"content": "..."}, "test_type": "Single-Turn"}

        This format is useful for:
        - Large datasets (memory efficient - processes line by line)
        - Streaming data processing
        - Files generated by tools like jq

        Supported Fields (same as from_json):
            - category, topic, behavior: Test classification (optional)
            - prompt: Object with content and optional expected_response
            - prompt_content: Alternative flat format for prompt content
            - test_type: "Single-Turn" or "Multi-Turn" (default: "Single-Turn")
            - test_configuration: Object with goal, instructions, restrictions, scenario
            - metadata: Additional metadata dict (optional)

        Empty/Invalid Line Handling:
            - Empty lines are skipped
            - Lines that fail to parse as JSON are skipped
            - Entries with no category, topic, behavior, or prompt content are skipped

        Args:
            filename: Path to the JSONL file to read.
            name: Name for the test set (default: empty string).
            description: Description for the test set (default: empty string).
            short_description: Short description for the test set (default: empty string).

        Returns:
            A new TestSet instance populated with tests from the JSONL.

        Raises:
            FileNotFoundError: If the JSONL file does not exist.

        Example:
            >>> test_set = TestSet.from_jsonl("my_tests.jsonl", name="Imported Tests")
            >>> print(f"Loaded {len(test_set.tests)} tests")
            >>> test_set.push()  # Upload to Rhesis platform
        """
        filepath = Path(filename)
        tests: List[Test] = []

        with open(filepath, "r", encoding="utf-8") as jsonlfile:
            for line_num, line in enumerate(jsonlfile, 1):
                line = line.strip()
                if not line:
                    continue  # Skip empty lines

                try:
                    entry = json.loads(line)
                except json.JSONDecodeError:
                    # Skip lines that aren't valid JSON
                    continue

                test = cls._dict_to_test(entry)
                if test is not None:
                    tests.append(test)

        return cls(
            name=name,
            description=description,
            short_description=short_description,
            tests=tests,
            test_count=len(tests),
        )



[docs]
    @classmethod
    def from_csv(
        cls,
        filename: Union[str, Path],
        name: str = "",
        description: str = "",
        short_description: str = "",
    ) -> "TestSet":
        """Load tests from a CSV file and create a new TestSet.

        Creates a TestSet populated with Test objects from the CSV file.
        Supports both single-turn and multi-turn test formats.

        Common CSV Columns:
            - category: Test category
            - topic: Test topic
            - behavior: Test behavior
            - test_type: "Single-Turn" or "Multi-Turn"
              (default: "Single-Turn")

        Single-Turn Columns:
            - prompt_content: The test prompt text
            - expected_response: Expected response text

        Multi-Turn Columns (flat):
            - goal: Multi-turn test goal
            - instructions: How the agent should conduct the test
            - restrictions: Forbidden behaviors for the target
            - scenario: Contextual framing for the test

        Empty Row Handling:
            Rows with no meaningful content (no prompt, category,
            topic, behavior, or goal) are automatically skipped.

        Args:
            filename: Path to the CSV file to read.
            name: Name for the test set (default: empty string).
            description: Description for the test set
                (default: empty string).
            short_description: Short description for the test set
                (default: empty string).

        Returns:
            A new TestSet instance populated with tests from the CSV.

        Raises:
            FileNotFoundError: If the CSV file does not exist.

        Example:
            >>> ts = TestSet.from_csv("tests.csv", name="Imported")
            >>> print(f"Loaded {len(ts.tests)} tests")
        """
        filepath = Path(filename)
        tests: List[Test] = []

        with open(filepath, "r", newline="", encoding="utf-8-sig") as csvfile:
            reader = csv.DictReader(csvfile)

            for row in reader:
                # Use _dict_to_test which handles both single-turn
                # and multi-turn (nested + flat) formats.
                test = cls._dict_to_test(row)
                if test is not None:
                    tests.append(test)

        return cls(
            name=name,
            description=description,
            short_description=short_description,
            tests=tests,
            test_count=len(tests),
        )





[docs]
class TestSets(BaseCollection):
    endpoint = ENDPOINT
    entity_class = TestSet