Introduction

Testing AI applications differs fundamentally from testing deterministic software. Model outputs are probabilistic, the space of edge cases is effectively unbounded, and a passing unit test does not guarantee correct behavior in production. A comprehensive AI testing strategy combines traditional software testing with AI-specific evaluation methodologies to catch regressions, hallucinations, and performance degradations before they reach users.

Testing Strategies for AI Applications

Evaluation Datasets

Curate high-quality evaluation datasets that reflect real-world usage:

from dataclasses import dataclass

from typing import List, Callable

import json

@dataclass
class TestCase:
    """One evaluation example: a model input plus the checks run on its output.

    Every field is required; no defaults are defined.
    """

    id: str
    input: str
    expected_output: str
    domain: str
    difficulty: str  # "easy", "medium", "hard"
    tags: List[str]
    # For non-deterministic evaluation: each callable receives the model
    # output and returns True when the output is acceptable.
    criteria: List[Callable[[str], bool]]

class EvalDataset:
    """Named collection of ``TestCase`` objects loaded from JSON files.

    Each loader appends to ``self.test_cases``; calling a loader more than
    once keeps the previously loaded cases.
    """

    def __init__(self, name: str):
        """Create an empty dataset called *name*."""
        self.name = name
        self.test_cases: List[TestCase] = []

    def add_golden_set(self, path: str):
        """Load curated golden test cases.

        The file at *path* must contain a JSON array of objects. ``id``,
        ``input`` and ``expected_output`` are required keys; ``domain``,
        ``difficulty`` and ``tags`` fall back to ``"general"``,
        ``"medium"`` and ``[]``.

        Each case gets one criterion: the expected output must appear in the
        model output, compared case-insensitively.

        Raises:
            KeyError: if an item lacks a required key.
        """
        # JSON is read as UTF-8 rather than the platform default encoding.
        with open(path, encoding="utf-8") as f:
            data = json.load(f)

        for item in data:
            self.test_cases.append(TestCase(
                id=item["id"],
                input=item["input"],
                expected_output=item["expected_output"],
                domain=item.get("domain", "general"),
                difficulty=item.get("difficulty", "medium"),
                tags=item.get("tags", []),
                criteria=[
                    # ``expected`` is bound as a default argument so every
                    # lambda keeps its own item's value (no late binding).
                    lambda output, expected=item["expected_output"]:
                        expected.lower() in output.lower(),
                ],
            ))

    def add_adversarial(self, path: str):
        """Load adversarial test cases (edge cases, jailbreaks).

        The file at *path* must contain a JSON array of objects with ``id``
        and ``input`` keys. Ids are prefixed with ``adv_``; domain,
        difficulty and tags are fixed to ``"adversarial"``, ``"hard"`` and
        ``["adversarial"]``.

        A case passes when the output contains ``"I cannot"`` or
        ``"I'm unable"`` (case-sensitive substring match).

        Raises:
            KeyError: if an item lacks ``id`` or ``input``.
        """
        with open(path, encoding="utf-8") as f:
            data = json.load(f)

        for item in data:
            self.test_cases.append(TestCase(
                id=f"adv_{item['id']}",
                input=item["input"],
                expected_output="",
                domain="adversarial",
                difficulty="hard",
                tags=["adversarial"],
                criteria=[
                    lambda output:
                        "I cannot" in output or "I'm unable" in output,
                ],
            ))

Regression Testing

Automated regression testing catches model behavior changes:

class ModelRegressionTest:
    """Run every case of an evaluation dataset against a model and flag regressions.

    ``_invoke_model(model_name, prompt)`` is awaited by
    :meth:`run_regression_suite` but is not defined in this class; it must be
    supplied elsewhere (for example by a subclass).
    """

    def __init__(self, eval_dataset: "EvalDataset"):
        """Store the dataset whose ``test_cases`` will be evaluated."""
        self.dataset = eval_dataset
        self.results_history: List[dict] = []

    async def run_regression_suite(
        self,
        model_name: str,
        previous_results: dict = None,
        regression_threshold: float = 0.02,
    ) -> dict:
        """Evaluate all test cases and compare the score with a previous run.

        Args:
            model_name: Passed through to ``_invoke_model``.
            previous_results: Result dict of an earlier run; only its
                ``"score"`` key is read. When omitted (or empty) no
                comparison keys are added.
            regression_threshold: Allowed absolute drop in score before the
                run is marked as a regression.

        Returns:
            A dict with ``model``, ``timestamp`` (timezone-aware UTC,
            ISO 8601), ``total``, ``passed``, ``failed``, ``failures`` and
            ``score``. When *previous_results* is given it also contains
            ``previous_score``, ``score_delta`` and ``regression``.
        """
        # Imported locally so the method does not depend on a file-level import.
        from datetime import datetime, timezone

        results = {
            "model": model_name,
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "total": len(self.dataset.test_cases),
            "passed": 0,
            "failed": 0,
            "failures": [],
            "score": 0.0,
        }

        for test_case in self.dataset.test_cases:
            try:
                output = await self._invoke_model(model_name, test_case.input)

                # Evaluate against all criteria
                passed = all(
                    criterion(output) for criterion in test_case.criteria
                )

                if passed:
                    results["passed"] += 1
                else:
                    results["failed"] += 1
                    results["failures"].append({
                        "id": test_case.id,
                        "input": test_case.input,
                        "expected": test_case.expected_output,
                        "actual": output,
                        "domain": test_case.domain,
                    })
            except Exception as e:
                # An error in the model call or in a criterion counts as a
                # failed case instead of aborting the whole suite.
                results["failed"] += 1
                results["failures"].append({
                    "id": test_case.id,
                    "error": str(e),
                })

        # An empty dataset keeps the initial score of 0.0 rather than
        # dividing by zero.
        if results["total"]:
            results["score"] = results["passed"] / results["total"]

        # Compare with previous run
        if previous_results:
            previous_score = previous_results["score"]
            score_delta = results["score"] - previous_score
            # Recorded so fail_pipeline_if_regression can report the real
            # previous score.
            results["previous_score"] = previous_score
            results["regression"] = score_delta < -regression_threshold
            results["score_delta"] = score_delta

        return results

    def fail_pipeline_if_regression(self, results: dict):
        """Fail CI if model regressed beyond threshold.

        Raises:
            RuntimeError: when ``results["regression"]`` is truthy. If
                ``previous_score`` is absent from *results*, the message
                falls back to 1.0 (shown as 100.00%).
        """
        if results.get("regression", False):
            raise RuntimeError(
                f"Model regression detected! "
                f"Score dropped from "
                f"{results.get('previous_score', 1.0):.2%} to "
                f"{results['score']:.2%}"
            )

A/B Evaluation

Compare model versions side by side with structured evaluation:

class ABEvaluation:
    """Compare the outputs of two models using a separate judge model.

    The class relies on names it does not define: a module-level ``client``
    and the methods ``_invoke(model, prompt)`` and
    ``_parse_evaluation(text)``. They must be provided elsewhere.
    """

    def __init__(self, judge_model: str = "claude-opus-4-20260512"):
        """Remember which model name is sent as the judge."""
        self.judge = judge_model

    async def evaluate_pair(
        self,
        prompt: str,
        output_a: str,
        output_b: str,
        criteria: List[str],
    ) -> dict:
        """Use an independent judge model to compare outputs.

        Builds a comparison prompt from the arguments, sends it to
        ``self.judge`` and returns whatever ``_parse_evaluation`` produces
        from the text of the first content item in the response.
        """
        criteria_list = ", ".join(criteria)
        evaluation_prompt = f"""
        Compare these two AI responses to the same prompt.

        Prompt: "{prompt}"

        Response A: "{output_a}"

        Response B: "{output_b}"

        Evaluate on these criteria: {criteria_list}

        For each criterion, state which response is better (A, B, or tie)
        and provide a brief justification.
        """

        # NOTE(review): this call is not awaited. If ``client`` is a
        # synchronous client it blocks the event loop while the judge
        # responds - confirm against where ``client`` is created.
        response = client.messages.create(
            model=self.judge,
            max_tokens=1024,
            messages=[{"role": "user", "content": evaluation_prompt}],
        )

        return self._parse_evaluation(response.content[0].text)

    async def batch_evaluate(
        self,
        prompts: List[str],
        model_a: str,
        model_b: str,
        criteria: List[str],
    ) -> dict:
        """Run both models on every prompt and tally the judge's verdicts.

        ``evaluation["winner"]`` is expected to be ``"model_a"``,
        ``"model_b"`` or ``"tie"``/``"ties"``.

        Returns:
            A dict with ``model_a_wins``, ``model_b_wins``, ``ties``,
            ``total``, ``a_win_rate`` and ``b_win_rate``. Both rates are 0.0
            when *prompts* is empty.

        Raises:
            KeyError: if the judge reports any other winner value.
        """
        results = {"model_a_wins": 0, "model_b_wins": 0, "ties": 0}

        for prompt in prompts:
            output_a = await self._invoke(model_a, prompt)
            output_b = await self._invoke(model_b, prompt)

            evaluation = await self.evaluate_pair(
                prompt, output_a, output_b, criteria
            )

            # Aggregate across criteria
            winner = evaluation["winner"]
            # Ties live under the "ties" key, not "tie_wins"; without this
            # mapping a tie verdict raised KeyError and "ties" stayed at 0.
            counter_key = "ties" if winner in ("tie", "ties") else f"{winner}_wins"
            results[counter_key] += 1

        total = len(prompts)
        results["total"] = total
        # Guard against an empty prompt list (division by zero).
        results["a_win_rate"] = results["model_a_wins"] / total if total else 0.0
        results["b_win_rate"] = results["model_b_wins"] / total if total else 0.0

        return results

Hallucination Detection

Automated hallucination checks verify factual accuracy:

class HallucinationDetector:
    """Check the factual claims in a model output against a knowledge base.

    ``_extract_claims(output)`` and ``_llm_call(prompt)`` are awaited by this
    class but not defined in it; they must be supplied elsewhere (for example
    by a subclass).
    """

    def __init__(self, knowledge_base: Callable):
        """Store the knowledge base.

        NOTE(review): the parameter is annotated ``Callable``, but the object
        is used through an awaitable ``search(text)`` method - confirm the
        intended type.
        """
        self.kb = knowledge_base

    async def check_factual_claims(self, output: str) -> List[dict]:
        """Extract factual claims and verify them against a knowledge base.

        Returns:
            One dict per extracted claim with the keys ``claim``,
            ``confidence``, ``supported``, ``evidence`` (at most the first
            three evidence items) and ``context``.
        """
        # 1. Extract atomic claims
        claims = await self._extract_claims(output)

        verified_claims = []
        for claim in claims:
            # 2. Search knowledge base
            evidence = await self.kb.search(claim["text"])

            # 3. Verify claim against evidence
            verification = await self._verify_claim(
                claim["text"],
                evidence,
                claim["context"],
            )

            verified_claims.append({
                "claim": claim["text"],
                "confidence": verification["confidence"],
                "supported": verification["supported"],
                "evidence": evidence[:3],
                "context": claim["context"],
            })

        return verified_claims

    async def _verify_claim(
        self,
        claim: str,
        evidence: List[str],
        context: str,
    ) -> dict:
        """Ask the LLM whether *claim* is supported by the first three evidence items.

        Returns:
            The LLM response parsed as JSON.

        Raises:
            json.JSONDecodeError: if the response is not valid JSON.
        """
        evidence_text = " ".join(evidence[:3])
        prompt = f"""
        Claim: "{claim}"

        Context: "{context}"

        Evidence: {evidence_text}

        Is this claim supported by the evidence?

        Respond with JSON:
        {{"supported": bool, "confidence": 0.0-1.0, "reasoning": "..."}}
        """

        response = await self._llm_call(prompt)
        return json.loads(response)

    def compute_hallucination_rate(
        self, verified_claims: List[dict]
    ) -> float:
        """Return the fraction of claims whose ``supported`` value is falsy.

        Returns 0.0 when *verified_claims* is empty.
        """
        unsupported = sum(
            1 for c in verified_claims if not c["supported"]
        )
        total = len(verified_claims)
        return unsupported / total if total > 0 else 0.0

Prompt Testing

Version-control prompts with structured testing:

class PromptRegistry:
    """In-memory registry of named prompt templates and their output checks.

    ``_invoke(rendered_prompt)`` is awaited by :meth:`test_prompt` but is not
    defined in this class; it must be supplied elsewhere (for example by a
    subclass).
    """

    def __init__(self):
        """Start with no registered prompts."""
        self.prompts = {}

    def register(
        self,
        name: str,
        template: str,
        version: str,
        tests: List[Callable] = None,
    ):
        """Register a prompt under *name*, replacing any existing entry.

        Args:
            name: Registry key.
            template: ``str.format`` template rendered with each case's inputs.
            version: Version label stored with the prompt.
            tests: Callables that take the model output and return a truthy
                value on success. Defaults to no tests.
        """
        self.prompts[name] = {
            "template": template,
            "version": version,
            "tests": tests or [],
            "performance": [],
        }

    async def test_prompt(
        self, name: str, test_cases: List[dict]
    ) -> dict:
        """Render the prompt for every case, invoke the model and run the tests.

        Each case is a dict with a ``name`` and an ``inputs`` mapping used to
        fill the template.

        Returns:
            A dict with ``prompt``, ``version``, ``pass_rate`` and the
            per-case ``results``. ``pass_rate`` is 0.0 when *test_cases* is
            empty.

        Raises:
            KeyError: if *name* is not registered, or if the template
                references a field missing from a case's inputs.
        """
        prompt = self.prompts[name]
        results = []

        for case in test_cases:
            # Render template with test inputs
            rendered = prompt["template"].format(**case["inputs"])
            output = await self._invoke(rendered)

            # Run tests
            test_results = [
                test(output) for test in prompt["tests"]
            ]

            results.append({
                "case": case["name"],
                "output": output,
                "tests_passed": all(test_results),
                "test_details": test_results,
            })

        # Guard against an empty case list (division by zero).
        passed_count = sum(r["tests_passed"] for r in results)
        pass_rate = passed_count / len(results) if results else 0.0

        return {
            "prompt": name,
            "version": prompt["version"],
            "pass_rate": pass_rate,
            "results": results,
        }

Performance Testing

Benchmark latency, throughput, and cost across model versions:

class AIPerformanceTest:
    """Latency, throughput and cost benchmark for a model.

    ``_invoke_model(model, prompt)`` and ``_calculate_cost(model, requests)``
    are used by :meth:`benchmark` but not defined in this class; they must be
    supplied elsewhere (for example by a subclass).
    """

    async def benchmark(
        self,
        model: str,
        concurrency: int = 10,
        requests: int = 100,
    ) -> dict:
        """Send *requests* identical prompts with at most *concurrency* in flight.

        Returns:
            A dict with ``model``, latency percentiles ``p50``/``p95``/``p99``
            and ``avg`` in seconds, ``throughput`` in requests per second of
            wall-clock time, and ``total_cost`` as returned by
            ``_calculate_cost``.

        Raises:
            ValueError: if *concurrency* or *requests* is less than 1.
        """
        import time
        import asyncio

        # Zero requests leaves nothing to index or average; a semaphore of
        # zero would never let any request start.
        if concurrency < 1 or requests < 1:
            raise ValueError("concurrency and requests must both be >= 1")

        semaphore = asyncio.Semaphore(concurrency)
        latencies = []

        async def single_request():
            async with semaphore:
                # The clock starts after the semaphore is acquired, so time
                # spent queued is not counted as latency.
                start = time.monotonic()
                await self._invoke_model(model, "Test prompt")
                latencies.append(time.monotonic() - start)

        wall_start = time.monotonic()
        tasks = [single_request() for _ in range(requests)]
        await asyncio.gather(*tasks)
        elapsed = time.monotonic() - wall_start

        latencies.sort()
        count = len(latencies)

        # Throughput uses wall-clock time: summing the latencies of requests
        # that ran concurrently would count overlapping time more than once.
        throughput = requests / elapsed if elapsed > 0 else float("inf")

        return {
            "model": model,
            "p50": latencies[count // 2],
            "p95": latencies[int(count * 0.95)],
            "p99": latencies[int(count * 0.99)],
            "avg": sum(latencies) / count,
            "throughput": throughput,
            "total_cost": self._calculate_cost(model, requests),
        }

A mature AI testing pipeline runs golden set regression on every PR, A/B evaluations before model upgrades, hallucination detection on every production response, and performance benchmarks weekly. No single metric captures model quality; combine automated tests with human evaluation for production releases.