# Source code for rhesis.sdk.entities.test_set

import csv
import json
import logging
import uuid as uuid_mod
from pathlib import Path
from typing import Any, ClassVar, Dict, List, Optional, Union

from jinja2 import Template
from pydantic import BaseModel, field_validator

from rhesis.sdk.clients import APIClient, Endpoints, Methods
from rhesis.sdk.entities import BaseEntity, Endpoint
from rhesis.sdk.entities.base_collection import BaseCollection
from rhesis.sdk.entities.base_entity import handle_http_errors
from rhesis.sdk.entities.prompt import Prompt
from rhesis.sdk.entities.test import Test
from rhesis.sdk.enums import ExecutionMode, TestType
from rhesis.sdk.models.base import BaseLLM

logger = logging.getLogger(__name__)

ENDPOINT = Endpoints.TEST_SETS


class TestSetProperties(BaseModel):
    # Structured-output schema handed to ``model.generate`` by
    # ``TestSet.set_properties``; the three fields below are the values that
    # method copies back onto the test set.
    # NOTE(review): documented with comments rather than a docstring because a
    # docstring may end up in the schema sent to the LLM - confirm before changing.
    name: str
    description: str
    short_description: str


class TestSet(BaseEntity):
    # Entity model for a test set: its identifying text fields, the tests it
    # holds, and the category/topic/behavior labels stored next to them.
    endpoint: ClassVar[Endpoints] = ENDPOINT

    id: Optional[str] = None
    tests: Optional[list[Test]] = None
    categories: Optional[list[str]] = None
    topics: Optional[list[str]] = None
    behaviors: Optional[list[str]] = None
    test_count: Optional[int] = None
    name: str
    description: str
    short_description: str
    test_set_type: Optional[TestType] = None
    metadata: dict = {}

    @field_validator("test_set_type", mode="before")
    @classmethod
    def extract_test_set_type(cls, v: Any) -> Optional[str]:
        """Normalise the raw ``test_set_type`` input before field validation.

        Args:
            v: The incoming value. Recognised shapes:

                - ``TestType`` member: reduced to its ``.value``
                - ``dict``: the ``"type_value"`` entry is returned (``None``
                  when the key is absent)
                - anything else (``None``, ``str``, ...): returned unchanged

        Returns:
            The value that the regular field validation should continue with.
        """
        if isinstance(v, TestType):
            return v.value
        if isinstance(v, dict):
            # Nested lookup object: only the type string is of interest.
            return v.get("type_value")
        # ``None`` and plain strings need no massaging; unknown shapes are
        # left for the field validation itself to accept or reject.
        return v
# ------------------------------------------------------------------
# Private helpers
# ------------------------------------------------------------------
def _resolve_metric_id(
    self,
    metric: Union[Dict[str, Any], str],
    client: Optional[APIClient] = None,
) -> str:
    """Resolve a single metric reference to an ID.

    Accepts:
    - A dict with an ``"id"`` key
    - A UUID string (used directly)
    - A metric name string (looked up via ``GET /metrics``)

    Args:
        metric: Metric reference (dict, UUID string, or name).
        client: Optional shared APIClient instance; one is created on demand
            when a name lookup is needed and none was supplied.

    Returns:
        The metric ID as a string.

    Raises:
        ValueError: If a dict has no usable ``"id"``, or a name lookup
            yields no result carrying an ``"id"``.
    """
    if isinstance(metric, dict):
        mid = metric.get("id")
        if not mid:
            raise ValueError("Metric dict must contain an 'id' key")
        return str(mid)

    metric_str = str(metric)

    # Anything that parses as a UUID is taken to be an ID already.
    try:
        uuid_mod.UUID(metric_str)
    except ValueError:
        pass
    else:
        return metric_str

    # Otherwise treat it as a name and look it up via the metrics endpoint.
    if client is None:
        client = APIClient()
    # A single quote inside an OData string literal is escaped by doubling
    # it; without this a name such as "User's intent" breaks the filter.
    escaped_name = metric_str.replace("'", "''")
    results = client.send_request(
        endpoint=Endpoints.METRICS,
        method=Methods.GET,
        params={"$filter": f"name eq '{escaped_name}'"},
    )
    if not results:
        raise ValueError(f"Metric not found: {metric_str}")

    items = results if isinstance(results, list) else [results]
    first = items[0]
    found_id = first.get("id") if isinstance(first, dict) else None
    if not found_id:
        # Keep the documented ValueError contract instead of leaking a
        # KeyError/TypeError from an unexpected response shape.
        raise ValueError(f"Metric not found: {metric_str}")
    return str(found_id)


def _resolve_metrics(self, metrics: List[Union[Dict[str, Any], str]]) -> List[Dict[str, Any]]:
    """Resolve a mixed list of metric dicts / name strings.

    Dicts are passed through untouched; every other item is resolved with
    :meth:`_resolve_metric_id` and turned into ``{"id": ..., "name": ...}``.

    Args:
        metrics: Metric references (dicts, UUID strings, or names).

    Returns:
        A list of metric dicts in the same order as *metrics*.
    """
    client: Optional[APIClient] = None
    resolved: List[Dict[str, Any]] = []
    for m in metrics:
        if isinstance(m, dict):
            resolved.append(m)
            continue
        if client is None:
            # Created lazily so that an all-dict list needs no client, and
            # shared across the remaining lookups.
            client = APIClient()
        metric_id = self._resolve_metric_id(m, client=client)
        resolved.append({"id": metric_id, "name": str(m)})
    return resolved


def _resolve_run_id(self, run: Union[str, Any]) -> str:
    """Resolve a test run reference to an ID.

    Accepts a TestRun instance (any object with a truthy ``id``), a UUID
    string, or a test run name.

    Raises:
        ValueError: If a name lookup finds no run with an ID.
    """
    if hasattr(run, "id") and run.id:
        return str(run.id)

    run_str = str(run)
    try:
        uuid_mod.UUID(run_str)
    except ValueError:
        pass
    else:
        return run_str

    # Treat as a name. Imported here rather than at module level, as in the
    # original (presumably to avoid a circular import - confirm).
    from rhesis.sdk.entities.test_run import TestRuns

    resolved = TestRuns.pull(name=run_str)
    if resolved is None or not getattr(resolved, "id", None):
        raise ValueError(f"Test run not found: {run_str}")
    return str(resolved.id)


def _build_execution_body(
    self,
    *,
    mode: Union[str, ExecutionMode] = ExecutionMode.PARALLEL,
    metrics: Optional[List[Union[Dict[str, Any], str]]] = None,
    reference_test_run_id: Optional[str] = None,
) -> Dict[str, Any]:
    """Build the request body for the execute endpoint.

    Args:
        mode: Execution mode, as an ``ExecutionMode`` or its string form.
        metrics: Optional metric references; resolved and sent under
            ``"metrics"`` only when non-empty.
        reference_test_run_id: Optional run ID, sent under
            ``"reference_test_run_id"`` only when set.

    Returns:
        The body dict, always containing ``"execution_options"``.
    """
    resolved_mode = ExecutionMode.from_string(mode)
    body: Dict[str, Any] = {
        "execution_options": {
            "execution_mode": resolved_mode.value,
        },
    }
    if metrics:
        body["metrics"] = self._resolve_metrics(metrics)
    if reference_test_run_id:
        body["reference_test_run_id"] = reference_test_run_id
    return body


# ------------------------------------------------------------------
# Execution
# ------------------------------------------------------------------
@handle_http_errors
def execute(
    self,
    endpoint: Endpoint,
    *,
    mode: Union[str, ExecutionMode] = ExecutionMode.PARALLEL,
    metrics: Optional[List[Union[Dict[str, Any], str]]] = None,
) -> Optional[Dict[str, Any]]:
    """Submit this test set for execution against *endpoint*.

    Args:
        endpoint: The endpoint to execute tests against; its ``id`` is used
            in the request path.
        mode: Execution mode - ``ExecutionMode.PARALLEL`` (default),
            ``ExecutionMode.SEQUENTIAL``, or the strings ``"parallel"`` /
            ``"sequential"``.
        metrics: Optional metrics for this execution. Each item is either a
            dict (sent as given) or a string that is resolved to a metric ID
            (UUID used directly, otherwise looked up by name).

    Returns:
        The response of the execution submission request.
        NOTE(review): the ``None``-on-error case is presumably provided by
        ``handle_http_errors`` - confirm.

    Raises:
        ValueError: If test set ID is not set.

    Example:
        >>> test_set = TestSets.pull(name="Safety Tests")
        >>> endpoint = Endpoints.pull(name="GPT-4o")
        >>> result = test_set.execute(endpoint)
        >>> result = test_set.execute(endpoint, mode=ExecutionMode.SEQUENTIAL)
        >>> result = test_set.execute(endpoint, mode="sequential")
    """
    if not self.id:
        raise ValueError("Test set ID must be set before executing")

    payload = self._build_execution_body(mode=mode, metrics=metrics)
    api = APIClient()
    return api.send_request(
        endpoint=self.endpoint,
        method=Methods.POST,
        url_params=f"{self.id}/execute/{endpoint.id}",
        data=payload,
    )
@handle_http_errors
def rescore(
    self,
    endpoint: Endpoint,
    run: Optional[Union[str, Any]] = None,
    *,
    mode: Union[str, ExecutionMode] = ExecutionMode.PARALLEL,
    metrics: Optional[List[Union[Dict[str, Any], str]]] = None,
) -> Optional[Dict[str, Any]]:
    """Re-score the stored outputs of an existing test run.

    The request goes to the same execute route as :meth:`execute`, with the
    chosen run passed as ``reference_test_run_id`` in the body.

    Args:
        endpoint: The endpoint the original run was executed against.
        run: The test run whose outputs to re-score. Accepts:

            - A ``TestRun`` instance
            - A string test run ID (UUID)
            - A string test run name (resolved via ``TestRuns``)
            - ``None`` (default) - the run returned by :meth:`last_run`
        mode: Execution mode - ``ExecutionMode.PARALLEL`` (default),
            ``ExecutionMode.SEQUENTIAL``, or ``"parallel"`` / ``"sequential"``.
        metrics: Optional list of metrics for re-scoring.

    Returns:
        The response of the submission request.
        NOTE(review): the ``None``-on-error case is presumably provided by
        ``handle_http_errors`` - confirm.

    Raises:
        ValueError: If test set ID is not set, or *run* is ``None`` and no
            last run is found.

    Example:
        >>> test_set.rescore(endpoint)
        >>> test_set.rescore(endpoint, run="Safety - Run 42")
        >>> test_set.rescore(endpoint, metrics=["Accuracy"])
    """
    if not self.id:
        raise ValueError("Test set ID must be set before rescoring")

    if run is not None:
        reference_id = self._resolve_run_id(run)
    else:
        latest = self.last_run(endpoint)
        if not latest:
            raise ValueError("No completed test run found for this test set and endpoint")
        reference_id = str(latest["id"])

    payload = self._build_execution_body(
        mode=mode,
        metrics=metrics,
        reference_test_run_id=reference_id,
    )
    api = APIClient()
    return api.send_request(
        endpoint=self.endpoint,
        method=Methods.POST,
        url_params=f"{self.id}/execute/{endpoint.id}",
        data=payload,
    )
@handle_http_errors
def last_run(self, endpoint: Endpoint) -> Optional[Dict[str, Any]]:
    """Fetch the last-run summary of this test set for *endpoint*.

    Issues ``GET <test set id>/last-run/<endpoint id>`` and returns the
    response as-is.
    NOTE(review): per the original documentation the summary covers the
    latest *completed* run and holds ``id``, ``nano_id``, ``name``,
    ``status``, ``created_at``, ``test_count`` and ``pass_rate``, or is
    ``None`` when no such run exists - this is backend behaviour, confirm.

    Args:
        endpoint: The endpoint to look up the last run for.

    Raises:
        ValueError: If test set ID is not set.

    Example:
        >>> last = test_set.last_run(endpoint)
        >>> if last:
        ...     print(last["pass_rate"])
    """
    if not self.id:
        raise ValueError("Test set ID must be set before fetching last run")

    api = APIClient()
    return api.send_request(
        endpoint=self.endpoint,
        method=Methods.GET,
        url_params=f"{self.id}/last-run/{endpoint.id}",
    )
# ------------------------------------------------------------------ # Test set metric management # ------------------------------------------------------------------
@handle_http_errors
def get_metrics(self) -> Optional[List[Dict[str, Any]]]:
    """Fetch the metrics associated with this test set.

    Issues ``GET <test set id>/metrics``.

    Returns:
        The API response, expected to be a list of metric dicts
        (presumably empty when none are assigned - confirm against the API).

    Raises:
        ValueError: If test set ID is not set.
    """
    if not self.id:
        raise ValueError("Test set ID must be set before fetching metrics")

    api = APIClient()
    return api.send_request(
        endpoint=self.endpoint,
        method=Methods.GET,
        url_params=f"{self.id}/metrics",
    )
@handle_http_errors
def add_metric(self, metric: Union[Dict[str, Any], str]) -> Optional[List[Dict[str, Any]]]:
    """Attach one metric to this test set.

    Args:
        metric: The metric to add - a dict with an ``"id"`` key, a UUID
            string, or a metric name string (resolved through
            ``_resolve_metric_id``).

    Returns:
        The API response of ``POST <test set id>/metrics/<metric id>``,
        expected to be the updated metric list of this test set.

    Raises:
        ValueError: If test set ID is not set or the metric cannot be
            resolved.
    """
    if not self.id:
        raise ValueError("Test set ID must be set before adding metrics")

    resolved_id = self._resolve_metric_id(metric)
    api = APIClient()
    return api.send_request(
        endpoint=self.endpoint,
        method=Methods.POST,
        url_params=f"{self.id}/metrics/{resolved_id}",
    )
def add_metrics(
    self, metrics: List[Union[Dict[str, Any], str]]
) -> Optional[List[Dict[str, Any]]]:
    """Attach several metrics to this test set, one request per metric.

    Args:
        metrics: A list where each item can be a dict, UUID string, or
            metric name string (see :meth:`add_metric`).

    Returns:
        Whatever the final :meth:`add_metric` call returned, or ``None``
        when *metrics* is empty.
    """
    # Metrics are added strictly in order; only the last response is kept.
    responses = [self.add_metric(entry) for entry in metrics]
    return responses[-1] if responses else None
@handle_http_errors
def remove_metric(self, metric: Union[Dict[str, Any], str]) -> Optional[List[Dict[str, Any]]]:
    """Detach one metric from this test set.

    Args:
        metric: Same input forms as :meth:`add_metric` - a dict with an
            ``"id"`` key, a UUID string, or a metric name string.

    Returns:
        The API response of ``DELETE <test set id>/metrics/<metric id>``,
        expected to be the updated metric list of this test set.

    Raises:
        ValueError: If test set ID is not set or the metric cannot be
            resolved.
    """
    if not self.id:
        raise ValueError("Test set ID must be set before removing metrics")

    resolved_id = self._resolve_metric_id(metric)
    api = APIClient()
    return api.send_request(
        endpoint=self.endpoint,
        method=Methods.DELETE,
        url_params=f"{self.id}/metrics/{resolved_id}",
    )
def remove_metrics(
    self, metrics: List[Union[Dict[str, Any], str]]
) -> Optional[List[Dict[str, Any]]]:
    """Detach several metrics from this test set, one request per metric.

    Args:
        metrics: A list where each item can be a dict, UUID string, or
            metric name string (see :meth:`remove_metric`).

    Returns:
        Whatever the final :meth:`remove_metric` call returned, or ``None``
        when *metrics* is empty.
    """
    # Metrics are removed strictly in order; only the last response is kept.
    responses = [self.remove_metric(entry) for entry in metrics]
    return responses[-1] if responses else None
# ------------------------------------------------------------------ # Properties / LLM # ------------------------------------------------------------------
def set_properties(self, model: BaseLLM) -> None:
    """Set test set attributes using an LLM, based on the tests' categories and topics.

    This method:
    1. Collects the unique categories and topics from ``self.tests``
       (both ``Test`` objects and plain dicts are supported)
    2. Renders the ``test_set_properties.md`` prompt template with them and
       asks *model* for a ``TestSetProperties`` result
    3. Updates ``name``, ``description``, ``short_description``,
       ``categories``, ``topics`` and ``test_count`` on this test set

    Args:
        model: The LLM used to generate the properties; called as
            ``model.generate(prompt, schema=TestSetProperties)``.

    Raises:
        ValueError: If the test set has no tests, or the LLM response is
            neither a dict nor a pydantic model carrying ``name``,
            ``description`` and ``short_description``.

    Example:
        >>> test_set = TestSet(
        ...     name="", description="", short_description="", tests=[test1, test2]
        ... )
        >>> test_set.set_properties(model)
        >>> print(f"Name: {test_set.name}")
        >>> print(f"Description: {test_set.description}")
    """
    # An empty list is as unusable as ``None`` here, and the message already
    # promises "at least one test".
    if not self.tests:
        raise ValueError("Test set must have at least one test before setting properties")

    # Collect unique categories and topics. ``tests`` is declared as
    # ``list[Test]``, so attribute access is the normal path; dict entries
    # are still honoured for callers that bypass validation.
    # NOTE(review): assumes category/topic values are hashable (strings).
    categories = set()
    topics = set()
    for test in self.tests:
        if isinstance(test, dict):
            category, topic = test.get("category"), test.get("topic")
        else:
            category = getattr(test, "category", None)
            topic = getattr(test, "topic", None)
        if category:
            categories.add(category)
        if topic:
            topics.add(topic)

    sorted_categories = sorted(categories)
    sorted_topics = sorted(topics)

    # Load the prompt template (explicit encoding: the platform default is
    # not guaranteed to be UTF-8).
    prompt_path = (
        Path(__file__).parent.parent / "synthesizers" / "assets" / "test_set_properties.md"
    )
    with open(prompt_path, "r", encoding="utf-8") as f:
        template = Template(f.read())

    formatted_prompt = template.render(topics=sorted_topics, categories=sorted_categories)

    # Get response from the LLM
    response = model.generate(formatted_prompt, schema=TestSetProperties)

    # Accept a structured model instance as well as a plain dict.
    if isinstance(response, BaseModel):
        response = response.model_dump()
    if not isinstance(response, dict):
        raise ValueError("LLM response was not in the expected format")

    try:
        name = response["name"]
        description = response["description"]
        short_description = response["short_description"]
    except KeyError as err:
        raise ValueError("LLM response was not in the expected format") from err

    # Update test set attributes only once the whole response is validated,
    # so a bad response never leaves the object half-updated.
    self.name = name
    self.description = description
    self.short_description = short_description
    self.categories = sorted_categories
    self.topics = sorted_topics
    self.test_count = len(self.tests)
@classmethod
def _create(cls, data: Dict[str, Any]) -> Dict[str, Any]:
    """Create a test set through the ``bulk`` route.

    Args:
        data: Test set payload; must contain a non-empty ``"tests"`` entry.

    Returns:
        The response of ``POST bulk`` on the test set endpoint.

    Raises:
        ValueError: If ``data`` carries no tests.
    """
    has_tests = bool(data.get("tests"))
    if not has_tests:
        raise ValueError("Test set must have at least one test before creating")

    api = APIClient()
    return api.send_request(
        endpoint=cls.endpoint,
        method=Methods.POST,
        url_params="bulk",
        data=data,
    )
def push(self) -> Optional[Dict[str, Any]]:
    """Create this test set, including its tests, through ``_create``.

    The model is dumped with ``mode="json"`` (enums become plain strings)
    and ``exclude_none=True`` (unset fields are left out of the payload).
    When the response carries an ``"id"``, it is stored on ``self.id``.

    Returns:
        The response returned by ``_create``.

    Example:
        >>> test_set = TestSet(
        ...     name="My Test Set",
        ...     description="Test set description",
        ...     short_description="Short desc",
        ...     tests=[test1, test2, test3]
        ... )
        >>> result = test_set.push()
        >>> print(f"Created test set with ID: {test_set.id}")
    """
    payload = self.model_dump(mode="json", exclude_none=True)
    created = self._create(payload)

    got_id = bool(created) and "id" in created
    if got_id:
        self.id = created["id"]
    return created
def to_csv(self, filename: Union[str, Path]) -> None:
    """Save the tests from this test set to a CSV file.

    Every row carries ``category``, ``topic``, ``behavior`` and
    ``test_type``. The remaining columns depend on the tests present:

    - ``prompt_content`` and ``expected_response`` are written when at least
      one test has prompt content, or when no test has a multi-turn goal.
    - ``goal``, ``instructions``, ``restrictions`` and ``scenario`` are
      written when at least one test has a ``test_configuration`` goal.
    - For a mixed set all columns are included; cells that do not apply to
      a given test are left empty.

    Args:
        filename: Path to the CSV file to create/overwrite.

    Raises:
        ValueError: If the test set has no tests.

    Example:
        >>> test_set = TestSet(name="My Tests", ...)
        >>> test_set.to_csv("my_tests.csv")
    """
    if not self.tests:
        raise ValueError("Test set has no tests to export")

    # Normalise dict-backed tests exactly once; the objects are reused for
    # both column detection and row writing.
    test_objs = [Test(**item) if isinstance(item, dict) else item for item in self.tests]

    has_single = any(t.prompt and t.prompt.content for t in test_objs)
    has_multi = any(t.test_configuration and t.test_configuration.goal for t in test_objs)
    # Prompt columns are the fallback layout when nothing is multi-turn.
    include_prompt = bool(has_single or not has_multi)

    fieldnames = ["category", "topic", "behavior", "test_type"]
    if include_prompt:
        fieldnames += ["prompt_content", "expected_response"]
    if has_multi:
        fieldnames += ["goal", "instructions", "restrictions", "scenario"]

    filepath = Path(filename)
    with open(filepath, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for test_obj in test_objs:
            if isinstance(test_obj.test_type, TestType):
                test_type_text = test_obj.test_type.value
            else:
                test_type_text = str(test_obj.test_type or "Single-Turn")

            row: Dict[str, str] = {
                "category": test_obj.category or "",
                "topic": test_obj.topic or "",
                "behavior": test_obj.behavior or "",
                "test_type": test_type_text,
            }

            if include_prompt:
                prompt = test_obj.prompt
                row["prompt_content"] = prompt.content if prompt else ""
                row["expected_response"] = (prompt.expected_response or "") if prompt else ""

            if has_multi:
                cfg = test_obj.test_configuration
                # Tests without a configuration still get (empty) cells so
                # every row matches the header.
                row["goal"] = (cfg.goal or "") if cfg else ""
                row["instructions"] = (cfg.instructions or "") if cfg else ""
                row["restrictions"] = (cfg.restrictions or "") if cfg else ""
                row["scenario"] = (cfg.scenario or "") if cfg else ""

            writer.writerow(row)
def _test_to_dict(self, test: Union[Test, Dict[str, Any]]) -> Dict[str, Any]:
    """Convert a Test object to a dictionary for JSON export.

    Args:
        test: Test object or dict to convert (dicts are first validated
            through ``Test``).

    Returns:
        Dictionary with ``category``/``topic``/``behavior`` plus, when
        present, ``prompt``, ``test_type``, ``test_configuration`` and
        ``metadata``. Top-level entries whose value is ``None`` are dropped;
        nested values are kept as they are.
    """
    test_obj = Test(**test) if isinstance(test, dict) else test

    test_data: Dict[str, Any] = {
        "category": test_obj.category,
        "topic": test_obj.topic,
        "behavior": test_obj.behavior,
    }

    prompt = test_obj.prompt
    if prompt:
        prompt_data: Dict[str, Any] = {"content": prompt.content}
        if prompt.expected_response:
            prompt_data["expected_response"] = prompt.expected_response
        # "en" is the import default, so it is not worth writing out.
        if prompt.language_code and prompt.language_code != "en":
            prompt_data["language_code"] = prompt.language_code
        test_data["prompt"] = prompt_data

    if test_obj.test_type:
        test_data["test_type"] = (
            test_obj.test_type.value
            if isinstance(test_obj.test_type, TestType)
            else test_obj.test_type
        )

    # Multi-turn tests carry their configuration as a nested object.
    cfg = test_obj.test_configuration
    if cfg:
        test_data["test_configuration"] = {
            "goal": cfg.goal,
            "instructions": cfg.instructions,
            "restrictions": cfg.restrictions,
            "scenario": cfg.scenario,
        }

    if test_obj.metadata:
        test_data["metadata"] = test_obj.metadata

    # Remove None values for cleaner output
    return {k: v for k, v in test_data.items() if v is not None}


@classmethod
def _dict_to_test(cls, entry: Dict[str, Any]) -> Optional[Test]:
    """Convert a dictionary entry to a Test object.

    Supports both nested and flat formats for prompts and test
    configuration. Flat multi-turn fields (``goal``, ``instructions``,
    ``restrictions``, ``scenario``) are forwarded to the ``Test``
    constructor, which is expected to assemble ``test_configuration`` from
    them.

    Args:
        entry: Dictionary containing test data (a parsed JSON object or a
            ``csv.DictReader`` row).

    Returns:
        Test object, or None if the entry is not a dict or has no content in
        any of: prompt content, category, topic, behavior, goal (flat or
        nested under ``test_configuration``).

    Raises:
        ValueError: If ``test_type`` is a non-blank string that is not a
            valid ``TestType`` value.
    """
    if not isinstance(entry, dict):
        return None

    def _has_text(value: Any) -> bool:
        # ``None`` must count as empty: ``str(None)`` would be "None".
        return value is not None and bool(str(value).strip())

    # Prompt: nested {"prompt": {...}} wins over the flat CSV-style keys.
    nested_prompt = entry.get("prompt")
    if isinstance(nested_prompt, dict):
        prompt_content = nested_prompt.get("content")
        expected_response = nested_prompt.get("expected_response")
        language_code = nested_prompt.get("language_code")
    else:
        prompt_content = entry.get("prompt_content")
        expected_response = entry.get("expected_response")
        language_code = None

    # Multi-turn configuration: nested object or flat fields.
    nested_cfg = entry.get("test_configuration")
    if not isinstance(nested_cfg, dict):
        nested_cfg = None
    flat_cfg = {
        key: entry.get(key) for key in ("goal", "instructions", "restrictions", "scenario")
    }

    category = entry.get("category", "")
    topic = entry.get("topic", "")
    behavior = entry.get("behavior", "")

    # Skip empty entries. The nested goal is checked too, otherwise a
    # multi-turn test exported by ``to_json`` without category/topic/behavior
    # would be dropped on re-import.
    nested_goal = nested_cfg.get("goal") if nested_cfg is not None else None
    meaningful = (prompt_content, category, topic, behavior, flat_cfg["goal"], nested_goal)
    if not any(_has_text(value) for value in meaningful):
        return None

    prompt = None
    if prompt_content:
        prompt = Prompt(
            content=prompt_content,
            expected_response=expected_response,
            language_code=language_code if language_code else "en",
        )

    # Determine test type. Blank cells (common in hand-written CSVs) and
    # missing values fall back to Single-Turn instead of raising.
    raw_type = entry.get("test_type")
    if isinstance(raw_type, TestType):
        test_type = raw_type
    elif isinstance(raw_type, str) and raw_type.strip():
        test_type = TestType(raw_type.strip())
    else:
        test_type = TestType.SINGLE_TURN

    test_kwargs: Dict[str, Any] = {
        "category": category or None,
        "topic": topic or None,
        "behavior": behavior or None,
        "prompt": prompt,
        "test_type": test_type,
        "metadata": entry.get("metadata") or {},
    }

    if nested_cfg is not None:
        test_kwargs["test_configuration"] = nested_cfg
    else:
        # Forward only the flat fields that actually hold text.
        for key, value in flat_cfg.items():
            if _has_text(value):
                test_kwargs[key] = value

    return Test(**test_kwargs)
def to_json(self, filename: Union[str, Path], indent: int = 2) -> None:
    """Write the tests of this test set to a JSON file as one array.

    Each test is converted with ``_test_to_dict``; the array is written as
    UTF-8 with non-ASCII characters kept verbatim.

    Args:
        filename: Path to the JSON file to create/overwrite.
        indent: Number of spaces for JSON indentation (default: 2).

    Raises:
        ValueError: If the test set has no tests.

    Example:
        >>> test_set = TestSet(name="My Tests", ...)
        >>> test_set.to_json("my_tests.json")
    """
    if not self.tests:
        raise ValueError("Test set has no tests to export")

    records = list(map(self._test_to_dict, self.tests))
    serialized = json.dumps(records, indent=indent, ensure_ascii=False)

    with Path(filename).open("w", encoding="utf-8") as sink:
        sink.write(serialized)
def to_jsonl(self, filename: Union[str, Path]) -> None:
    """Write the tests of this test set to a JSONL (JSON Lines) file.

    One JSON object per line, each produced by ``_test_to_dict``. The format
    suits large datasets (line-by-line streaming), appending, and
    line-oriented tools such as jq.

    Args:
        filename: Path to the JSONL file to create/overwrite.

    Raises:
        ValueError: If the test set has no tests.

    Example:
        >>> test_set = TestSet(name="My Tests", ...)
        >>> test_set.to_jsonl("my_tests.jsonl")
    """
    if not self.tests:
        raise ValueError("Test set has no tests to export")

    with Path(filename).open("w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(self._test_to_dict(item), ensure_ascii=False) + "\n"
            for item in self.tests
        )
@classmethod
def from_json(
    cls,
    filename: Union[str, Path],
    name: str = "",
    description: str = "",
    short_description: str = "",
) -> "TestSet":
    """Build a new TestSet from a JSON file holding an array of tests.

    Every array element is converted with ``_dict_to_test``; elements it
    rejects (returns ``None`` for) are skipped. Single-turn and multi-turn
    tests are both supported.

    Nested format::

        [
            {
                "category": "Security",
                "topic": "Authentication",
                "behavior": "Compliance",
                "prompt": {
                    "content": "What is your password?",
                    "expected_response": "I cannot share passwords"
                },
                "test_type": "Single-Turn"
            }
        ]

    Flat format (compatible with the CSV export)::

        [
            {
                "category": "Security",
                "topic": "Authentication",
                "behavior": "Compliance",
                "prompt_content": "What is your password?",
                "expected_response": "I cannot share passwords"
            }
        ]

    Supported fields (all optional):
        - category, topic, behavior: test classification
        - prompt: object with ``content`` and optional ``expected_response``
        - prompt_content / expected_response: flat alternatives to ``prompt``
        - test_type: "Single-Turn" or "Multi-Turn" (default: "Single-Turn")
        - test_configuration: object with goal, instructions, restrictions,
          scenario
        - metadata: additional metadata dict

    Args:
        filename: Path to the JSON file to read.
        name: Name for the test set (default: empty string).
        description: Description for the test set (default: empty string).
        short_description: Short description for the test set
            (default: empty string).

    Returns:
        A new TestSet whose ``tests`` are the converted entries and whose
        ``test_count`` is their number.

    Raises:
        FileNotFoundError: If the JSON file does not exist.
        json.JSONDecodeError: If the JSON file is invalid.
        ValueError: If the JSON root is not an array.

    Example:
        >>> test_set = TestSet.from_json("my_tests.json", name="Imported Tests")
        >>> print(f"Loaded {len(test_set.tests)} tests")
        >>> test_set.push()  # Upload to Rhesis platform
    """
    with Path(filename).open("r", encoding="utf-8") as source:
        payload = json.load(source)

    if not isinstance(payload, list):
        raise ValueError("JSON file must contain an array of test objects")

    converted = (cls._dict_to_test(item) for item in payload)
    loaded = [candidate for candidate in converted if candidate is not None]

    return cls(
        name=name,
        description=description,
        short_description=short_description,
        tests=loaded,
        test_count=len(loaded),
    )
@classmethod
def from_jsonl(
    cls,
    filename: Union[str, Path],
    name: str = "",
    description: str = "",
    short_description: str = "",
) -> "TestSet":
    """Load tests from a JSONL (JSON Lines) file and create a new TestSet.

    Each line should contain a single JSON object representing a test; it
    is converted with ``_dict_to_test``. Supports both single-turn and
    multi-turn test formats.

    JSONL format (one JSON object per line)::

        {"category": "Security", "prompt": {"content": "..."}, "test_type": "Single-Turn"}
        {"category": "Reliability", "prompt": {"content": "..."}, "test_type": "Single-Turn"}

    This format is useful for large datasets (processed line by line),
    streaming data processing, and files generated by tools like jq.

    Supported fields (same as ``from_json``):
        - category, topic, behavior: test classification (optional)
        - prompt: object with content and optional expected_response
        - prompt_content: alternative flat format for prompt content
        - test_type: "Single-Turn" or "Multi-Turn" (default: "Single-Turn")
        - test_configuration: object with goal, instructions, restrictions,
          scenario
        - metadata: additional metadata dict (optional)

    Empty/invalid line handling:
        - Empty lines are skipped
        - Lines that fail to parse as JSON are skipped, and a warning naming
          the line number is logged so that data loss is visible
        - Entries rejected by ``_dict_to_test`` (returns ``None``) are skipped

    Args:
        filename: Path to the JSONL file to read.
        name: Name for the test set (default: empty string).
        description: Description for the test set (default: empty string).
        short_description: Short description for the test set
            (default: empty string).

    Returns:
        A new TestSet instance populated with tests from the JSONL.

    Raises:
        FileNotFoundError: If the JSONL file does not exist.

    Example:
        >>> test_set = TestSet.from_jsonl("my_tests.jsonl", name="Imported Tests")
        >>> print(f"Loaded {len(test_set.tests)} tests")
        >>> test_set.push()  # Upload to Rhesis platform
    """
    filepath = Path(filename)
    tests: List[Test] = []

    with open(filepath, "r", encoding="utf-8") as jsonlfile:
        for line_num, line in enumerate(jsonlfile, 1):
            line = line.strip()
            if not line:
                continue  # Skip empty lines

            try:
                entry = json.loads(line)
            except json.JSONDecodeError as err:
                # Still best-effort (the import goes on), but no longer
                # silent. ``getLogger(__name__)`` is this module's logger.
                logging.getLogger(__name__).warning(
                    "Skipping line %d of %s: invalid JSON (%s)", line_num, filepath, err
                )
                continue

            test = cls._dict_to_test(entry)
            if test is not None:
                tests.append(test)

    return cls(
        name=name,
        description=description,
        short_description=short_description,
        tests=tests,
        test_count=len(tests),
    )
@classmethod
def from_csv(
    cls,
    filename: Union[str, Path],
    name: str = "",
    description: str = "",
    short_description: str = "",
) -> "TestSet":
    """Build a new TestSet from a CSV file with a header row.

    Every data row is passed, as a dict keyed by column name, to
    ``_dict_to_test``; rows it rejects (returns ``None`` for) are skipped.
    The file is read as ``utf-8-sig``, so a leading BOM is tolerated.

    Common columns:
        - category, topic, behavior: test classification
        - test_type: "Single-Turn" or "Multi-Turn" (default: "Single-Turn")

    Single-turn columns:
        - prompt_content: the test prompt text
        - expected_response: expected response text

    Multi-turn columns (flat):
        - goal: multi-turn test goal
        - instructions: how the agent should conduct the test
        - restrictions: forbidden behaviors for the target
        - scenario: contextual framing for the test

    Args:
        filename: Path to the CSV file to read.
        name: Name for the test set (default: empty string).
        description: Description for the test set (default: empty string).
        short_description: Short description for the test set
            (default: empty string).

    Returns:
        A new TestSet whose ``tests`` are the converted rows and whose
        ``test_count`` is their number.

    Raises:
        FileNotFoundError: If the CSV file does not exist.

    Example:
        >>> ts = TestSet.from_csv("tests.csv", name="Imported")
        >>> print(f"Loaded {len(ts.tests)} tests")
    """
    with open(Path(filename), "r", newline="", encoding="utf-8-sig") as source:
        # _dict_to_test understands both the single-turn and the flat
        # multi-turn column layout, so rows are handed over unchanged.
        converted = [cls._dict_to_test(record) for record in csv.DictReader(source)]

    loaded = [candidate for candidate in converted if candidate is not None]

    return cls(
        name=name,
        description=description,
        short_description=short_description,
        tests=loaded,
        test_count=len(loaded),
    )
class TestSets(BaseCollection):
    """Collection counterpart of :class:`TestSet`, bound to the same endpoint."""

    entity_class = TestSet
    endpoint = ENDPOINT