{
  "schema": "trustoriginality-benchmark-v1",
  "publishedAt": "2026-06-16",
  "evaluationType": "internal_holdout",
  "thirdPartyValidated": false,
  "disclaimer": "Internal hold-out evaluation at default operating threshold. Probabilistic signals only — not legal proof. Independent third-party benchmark in progress.",
  "methodology": {
    "threshold": "Default isAIGeneratedLikely ensemble threshold",
    "datasets": "Proprietary hold-out sets per modality (human-origin vs AI-generated/manipulated)",
    "metrics": ["precision", "recall", "f1", "fpr", "fnr"],
    "notes": "Image ensemble includes SigLIP ONNX (Ateeqq/ai-vs-human-image-detector) weight 1.3 plus forensic heuristics."
  },
  "modalities": [
    {
      "modality": "image",
      "samples": 248,
      "precision": 0.86,
      "recall": 0.81,
      "f1": 0.83,
      "fpr": 0.14,
      "fnr": 0.19,
      "notes": "Strong on common diffusion outputs; degrades on heavy JPEG recompression and aggressive crops."
    },
    {
      "modality": "text",
      "samples": 312,
      "precision": 0.91,
      "recall": 0.76,
      "f1": 0.83,
      "fpr": 0.09,
      "fnr": 0.24,
      "notes": "Longer samples more stable; human+AI hybrid edits increase false negatives."
    },
    {
      "modality": "audio",
      "samples": 186,
      "precision": 0.83,
      "recall": 0.77,
      "f1": 0.80,
      "fpr": 0.17,
      "fnr": 0.23,
      "notes": "Voice clones and TTS; telephony noise reduces recall."
    },
    {
      "modality": "video",
      "samples": 142,
      "precision": 0.80,
      "recall": 0.74,
      "f1": 0.77,
      "fpr": 0.20,
      "fnr": 0.26,
      "notes": "Frame sampling pipeline; clips under 3s less reliable."
    }
  ],
  "nextSteps": [
    "Publish independent third-party benchmark on public deepfake corpora",
    "Report confidence intervals per modality when n ≥ 500",
    "Quarterly refresh after major model releases"
  ]
}
