{
  "benchmark_profile": {
    "article_type_counts": {
      "article": 206,
      "feature_request": 3
    },
    "average_grounding_documents_per_case": 1.3062,
    "benchmark_track": "external_public_enterprise_rag",
    "dataset": "Wix/WixQA expert-written",
    "failed_case_count": 62,
    "failure_rate": 0.3875,
    "license": "MIT",
    "multi_article_case_count": 44,
    "multi_article_case_share": 0.275,
    "provider_backed_embedding_result_published": true,
    "retrieval_hit_rate_at_3": 0.775,
    "sample_case_count": 160,
    "sample_path": "data/public/wixqa_public_rag_sample.jsonl",
    "sample_scope": "tracked_compact_public_sample",
    "sample_selection_policy": "first_expertwritten_cases_with_all_articles_present",
    "source_url": "https://huggingface.co/datasets/Wix/WixQA",
    "status": "evaluated",
    "top1_citation_accuracy": 0.6125,
    "tracked_sample_default_limit": 160,
    "unique_document_count": 173
  },
  "benchmark_track": "external_public_enterprise_rag",
  "case_count": 160,
  "dataset": "Wix/WixQA expert-written",
  "document_count": 173,
  "failure_examples": [
    {
      "case_id": "WIXQA-EXPERT-0001",
      "expected_citation_ids": [
        "49d9e88fadbf11fa4e685c847590078ff9394c2fe7566094f504f53ca4aca465"
      ],
      "expected_rank": 2,
      "failure_reasons": [
        "expected_document_retrieved_but_not_top1"
      ],
      "retrieved_citation_ids": [
        "51aca93604e8d423b3a87c0126335c5864f343205d7441aa23171523bb26c7aa",
        "49d9e88fadbf11fa4e685c847590078ff9394c2fe7566094f504f53ca4aca465",
        "88f6b5848f6f662d7fe9d69f2708f00c75067894320261c58578ad0ed732baa9"
      ],
      "taxonomy_labels": [
        "wrong_citation"
      ],
      "top_score": 28.7125
    },
    {
      "case_id": "WIXQA-EXPERT-0004",
      "expected_citation_ids": [
        "a513a215c1798f329293330cb5f5e975a18978954b5ee27d84441e029f900d51",
        "b9ba8c318ee75f49eaa89c1d5b8d461fa2b4c27167381b869aeb31027d7187f9"
      ],
      "expected_rank": 3,
      "failure_reasons": [
        "expected_document_retrieved_but_not_top1"
      ],
      "retrieved_citation_ids": [
        "9b71e0c1c1101d626c050e2209b382430c36eb91644c25dc9016802164ea769f",
        "040613a5d53923efc2955eb8cfaf4573d7543dd0236bf90be63e40613359e4cf",
        "a513a215c1798f329293330cb5f5e975a18978954b5ee27d84441e029f900d51"
      ],
      "taxonomy_labels": [
        "wrong_citation"
      ],
      "top_score": 7.7307
    },
    {
      "case_id": "WIXQA-EXPERT-0005",
      "expected_citation_ids": [
        "8cc75fbcc571336d1ef1768e7727bd9d1e6c1333f22eeb5366232b9a1a066418"
      ],
      "expected_rank": null,
      "failure_reasons": [
        "expected_document_not_retrieved_at_3"
      ],
      "retrieved_citation_ids": [
        "fff4ccf96f0bf0a2c3b82d0734f384299bd9a621212087f9268941acd38f8d36",
        "3407d197c71652bee58835ba148794a8be7e70cf9e25c599044b823c82db29fd",
        "6ac11c077e31c980c3de33be613fa8af2af6f78ed5da96ede5644a7195a1e866"
      ],
      "taxonomy_labels": [
        "retrieval_miss"
      ],
      "top_score": 19.6014
    },
    {
      "case_id": "WIXQA-EXPERT-0019",
      "expected_citation_ids": [
        "8c2e56296ddb7b441238ff06cfa9b38204aca9e27371386689b1fa4643cdfbff"
      ],
      "expected_rank": 3,
      "failure_reasons": [
        "expected_document_retrieved_but_not_top1"
      ],
      "retrieved_citation_ids": [
        "bb528ecfbf28534c702f7e30e97e1709741bb421299f376de6cb2bc8d8226fc4",
        "e9366441d1e037a1470c9bad57455dbadc24db0ca931791edd0f3b026ea6a58f",
        "8c2e56296ddb7b441238ff06cfa9b38204aca9e27371386689b1fa4643cdfbff"
      ],
      "taxonomy_labels": [
        "wrong_citation"
      ],
      "top_score": 25.6557
    },
    {
      "case_id": "WIXQA-EXPERT-0024",
      "expected_citation_ids": [
        "49d9e88fadbf11fa4e685c847590078ff9394c2fe7566094f504f53ca4aca465"
      ],
      "expected_rank": 2,
      "failure_reasons": [
        "expected_document_retrieved_but_not_top1"
      ],
      "retrieved_citation_ids": [
        "50d02c7ce566adb08742f7cc810bfceab55f9a7c43fcd7fe73f932c2cd8c88b1",
        "49d9e88fadbf11fa4e685c847590078ff9394c2fe7566094f504f53ca4aca465",
        "ad7a6f1f5851bf2c4bc738d6223bb1546e78c7b7f3317d721b67181cd675c5a8"
      ],
      "taxonomy_labels": [
        "wrong_citation"
      ],
      "top_score": 19.6322
    },
    {
      "case_id": "WIXQA-EXPERT-0025",
      "expected_citation_ids": [
        "49d9e88fadbf11fa4e685c847590078ff9394c2fe7566094f504f53ca4aca465"
      ],
      "expected_rank": null,
      "failure_reasons": [
        "expected_document_not_retrieved_at_3"
      ],
      "retrieved_citation_ids": [
        "73962a6c69a291d8a5d108b0a2116dd29341275b37c155084612574eb649c8d7",
        "50d02c7ce566adb08742f7cc810bfceab55f9a7c43fcd7fe73f932c2cd8c88b1",
        "7a49619f637ed0508bca64c0995e2726276d8faba2907d97350e0f01384709af"
      ],
      "taxonomy_labels": [
        "retrieval_miss"
      ],
      "top_score": 25.0169
    },
    {
      "case_id": "WIXQA-EXPERT-0026",
      "expected_citation_ids": [
        "42494683352ccb36b16ef52dc0b32ff1aab64f5f3a6fe90827c33e0252a59098"
      ],
      "expected_rank": null,
      "failure_reasons": [
        "expected_document_not_retrieved_at_3"
      ],
      "retrieved_citation_ids": [
        "89e39fdcca5874eb36d1ac6e359aad2cd93b601cac1cf7b8743449ff7893ead8",
        "5641574289f2768293891471cfcc69b935623ef2723abd28fcb9dddf430ce5e8",
        "fd36cb06f2de70a197a60c3d2e28ec709fb884cad11153f4bb6086f82755bd5a"
      ],
      "taxonomy_labels": [
        "retrieval_miss"
      ],
      "top_score": 38.5301
    },
    {
      "case_id": "WIXQA-EXPERT-0028",
      "expected_citation_ids": [
        "13ae20110f373b7614a4594a0b47cd190eaecba13c74138551ee24f682b43f08",
        "0d6bacf2646098a55162336b12fc8eeb56c6618febc7ad94a9c7d5d9bb71ba3f"
      ],
      "expected_rank": 2,
      "failure_reasons": [
        "expected_document_retrieved_but_not_top1"
      ],
      "retrieved_citation_ids": [
        "f346781bc6cf23c95342ce003927de9d985a99eeb7561c69f03e65da7c10f2fb",
        "0d6bacf2646098a55162336b12fc8eeb56c6618febc7ad94a9c7d5d9bb71ba3f",
        "2df694686b70b35ed389ff590354cfaf19aafc07659b12b02078f2ebcf015ac8"
      ],
      "taxonomy_labels": [
        "wrong_citation"
      ],
      "top_score": 18.3012
    },
    {
      "case_id": "WIXQA-EXPERT-0036",
      "expected_citation_ids": [
        "b4717557702f3383d3d4d7bf071c2ab5558b34fc62860d2879f9d48f86ec2de8"
      ],
      "expected_rank": null,
      "failure_reasons": [
        "expected_document_not_retrieved_at_3"
      ],
      "retrieved_citation_ids": [
        "e4b87cd9f38d652e0a018619be5ab7f6065f07e1bd2556d629fb34e693feff91",
        "13b7554664bd39da1825c1d396cefbb0afd2d9989c1a15659dd158b0a46b916c",
        "0b6d939e9e0d8599e895cad834a336ccec3f5e4c149645def2ea4d4d6091b356"
      ],
      "taxonomy_labels": [
        "retrieval_miss"
      ],
      "top_score": 53.3709
    },
    {
      "case_id": "WIXQA-EXPERT-0041",
      "expected_citation_ids": [
        "7ccd1a9278fc1868865563acc0217c1d235e1ce76037eafcde2c68cef64a8134"
      ],
      "expected_rank": 3,
      "failure_reasons": [
        "expected_document_retrieved_but_not_top1"
      ],
      "retrieved_citation_ids": [
        "085e4d36ca87e3caa5403c4a9fd6e2e147826bf93f1bb909f23e8f2f0167aab4",
        "d487a621f0535cd8d7c717d614c18d36b49841066c8dd641967d05be5205b6c9",
        "7ccd1a9278fc1868865563acc0217c1d235e1ce76037eafcde2c68cef64a8134"
      ],
      "taxonomy_labels": [
        "wrong_citation"
      ],
      "top_score": 17.2553
    }
  ],
  "failure_reasons": {
    "expected_document_not_retrieved_at_3": 36,
    "expected_document_retrieved_but_not_top1": 26
  },
  "license": "MIT",
  "metrics": {
    "mean_reciprocal_rank_at_3": 0.6844,
    "multi_article_retrieval_hit_rate_at_3": 0.8864,
    "retrieval_hit_rate_at_3": 0.775,
    "top1_citation_accuracy": 0.6125
  },
  "multi_article_case_count": 44,
  "notes": [
    "This is a second external public-data benchmark track using real enterprise-support questions and public Wix Help Center articles.",
    "It complements TechQA and the controlled synthetic benchmark; it does not replace safety red-team cases or synthetic tool-use cases.",
    "The report is deterministic and uses no paid API calls."
  ],
  "primary_retriever": {
    "case_count": 160,
    "description": "Local TF-IDF retrieval with title boosts and exact phrase scoring.",
    "failed_case_count": 62,
    "label": "Local TF-IDF WixQA retriever",
    "metrics": {
      "mean_reciprocal_rank_at_3": 0.6844,
      "multi_article_retrieval_hit_rate_at_3": 0.8864,
      "retrieval_hit_rate_at_3": 0.775,
      "top1_citation_accuracy": 0.6125
    },
    "system_id": "local_tfidf_wixqa_retriever"
  },
  "retriever_comparison": {
    "baseline_system_id": "keyword_title_baseline",
    "primary_system_id": "local_tfidf_wixqa_retriever",
    "retrieval_hit_rate_at_3_lift": 0.2438,
    "system_count": 2,
    "top1_citation_accuracy_lift": 0.2437
  },
  "retriever_systems": [
    {
      "case_count": 160,
      "description": "Simple title-token overlap baseline over Wix public KB articles.",
      "failed_case_count": 101,
      "label": "Keyword title baseline",
      "metrics": {
        "mean_reciprocal_rank_at_3": 0.4375,
        "multi_article_retrieval_hit_rate_at_3": 0.6591,
        "retrieval_hit_rate_at_3": 0.5312,
        "top1_citation_accuracy": 0.3688
      },
      "system_id": "keyword_title_baseline"
    },
    {
      "case_count": 160,
      "description": "Local TF-IDF retrieval with title boosts and exact phrase scoring.",
      "failed_case_count": 62,
      "label": "Local TF-IDF WixQA retriever",
      "metrics": {
        "mean_reciprocal_rank_at_3": 0.6844,
        "multi_article_retrieval_hit_rate_at_3": 0.8864,
        "retrieval_hit_rate_at_3": 0.775,
        "top1_citation_accuracy": 0.6125
      },
      "system_id": "local_tfidf_wixqa_retriever"
    }
  ],
  "sample_path": "data/public/wixqa_public_rag_sample.jsonl",
  "source_url": "https://huggingface.co/datasets/Wix/WixQA",
  "status": "evaluated",
  "taxonomy_labels": {
    "retrieval_miss": 36,
    "wrong_citation": 26
  }
}
