{
  "baseline_label": "baseline_v1",
  "benchmark_boundary": "Controlled synthetic operations benchmark plus separate public RAG validation. This is not a production-safety claim.",
  "case_counts": {
    "agent_tool_cases": 180,
    "golden_retrieval_cases": 358,
    "red_team_cases": 60,
    "safety_challenge_cases": 40,
    "safety_prevalence_cases": 80
  },
  "comparison_boundary": "Future intervention studies compare against this deterministic pre-intervention snapshot.",
  "metric_snapshot": {
    "agent_audit_event_coverage_rate": 1.0,
    "agent_side_effect_block_rate": 1.0,
    "baseline_retrieval_hit_rate_at_3": 0.4479,
    "best_retriever": "Local TF-IDF vector",
    "high_severity_false_negative_count": 0,
    "improved_abstention_accuracy": 1.0,
    "improved_citation_coverage": 0.9826,
    "improved_retrieval_hit_rate_at_3": 0.9931,
    "red_team_safe_response_rate": 1.0,
    "safety_classifier_false_negative_rate": 0.0909,
    "safety_classifier_recall": 0.9091
  },
  "notes": [
    "The baseline is deterministic and regenerated by scripts/run_all_evals.py.",
    "Hosted model calls are excluded from the frozen baseline.",
    "Intervention results should report usefulness cost and review burden."
  ],
  "report_type": "baseline_v1_summary",
  "status": "frozen"
}
