#!/usr/bin/env python3
"""Discrepancy hunt over a scan JSONL. Surfaces:
  1. Scanner robustness failures (timeout/no_output/crash/bad_json) -> real bugs.
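  2. False-positive candidates: benign real-world files (pdf.js / clean_docs /
     pdf20) scoring high on the THREAT axis.
  3. Score distribution per corpus category.
  4. False-negative candidates: malware samples scoring low on the THREAT axis,
     plus the overall malware detection rate.
  Candidate listings show each file's exec-vector flag and high indicators
  for review."""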
import json, sys, collections
# Input JSONL path: the first command-line argument when one is given,
# otherwise the default dump location.
JL = "/tmp/full.jsonl" if len(sys.argv) <= 1 else sys.argv[1]

def cat(path):
    """Map a scanned file path to its corpus category name.

    The path is tested against known corpus directory fragments in a fixed
    order; the first fragment found anywhere in the path decides the
    category. A path containing none of them is reported as "other".
    """
    markers = (
        ("/corpus_ext/pdfjs/", "pdfjs_realworld"),
        ("/corpus_ext/verapdf/", "pdf20"),
        ("/corpus_ext/malware/", "malware"),
        ("/corpus/adversarial/", "adversarial"),
        ("/corpus/clean_docs/", "clean_docs"),
        ("/corpus/gov_forms/", "gov_forms"),
        ("/vap_tests/", "vap_tests"),
        ("/hidden_js/", "hidden_js"),
    )
    # Order matters: a path may contain several fragments, first hit wins.
    return next((label for fragment, label in markers if fragment in path),
                "other")

# Load the scan results: one JSON object per non-blank line. The context
# manager closes the file as soon as parsing is done instead of leaving the
# handle open, and the encoding is pinned to UTF-8 (the encoding JSON text is
# specified to use) rather than depending on the locale default.
with open(JL, encoding="utf-8") as fh:
    rows = [json.loads(line) for line in fh if line.strip()]
print(f"total scanned: {len(rows)}\n")

# Tally rows by their "status" value and print the counts, most common first.
st = collections.Counter(r.get("status") for r in rows)
print("=== STATUS ===")
for k, v in st.most_common():
    # !s: a row without a "status" key is counted under None, and None
    # rejects the width format spec with a TypeError. String statuses render
    # exactly as before.
    print(f"  {k!s:14} {v}")

# Every row whose status is anything other than "ok" is listed as a scanner
# robustness failure; the listing is capped at the first 60 rows.
fails = [r for r in rows if r.get("status") != "ok"]
print(f"\n=== ROBUSTNESS FAILURES ({len(fails)}) — scanner bugs to fix ===")
for r in fails[:60]:
    # !s: rows with no "status" key always end up in this list (None != "ok"),
    # and None rejects the width format spec with a TypeError.
    line = f"  {r.get('status')!s:10} {cat(r['file']):16} {r['file'].split('/')[-1][:50]}"
    # Optional diagnostics are appended only when the row carries them.
    if r.get("rc") is not None:
        line += f"  rc={r.get('rc')}"
    if r.get("stderr"):
        # Keep only the tail of stderr so each entry stays short.
        line += f"  {r.get('stderr', '')[-120:]}"
    if r.get("err"):
        line += f"  {r.get('err', '')}"
    print(line)

# Keep only the successfully scanned rows, then bucket them by the corpus
# category derived from each row's file path.
ok = [row for row in rows if row.get("status") == "ok"]
print(f"\n=== THREAT DISTRIBUTION by category (ok={len(ok)}) ===")
bycat = collections.defaultdict(list)
for row in ok:
    category = cat(row["file"])
    bycat[category].append(row)
def band(t):
    """Return the threat band name for score ``t``.

    A falsy score (``None`` or 0) is "clean". Otherwise the band is chosen by
    exclusive upper bound: below 30 is "low", below 150 "suspicious", below
    350 "high-risk", and anything from 350 upward is "dangerous".
    """
    score = t or 0
    if score == 0:
        return "clean"
    for upper, label in ((30, "low"), (150, "suspicious"), (350, "high-risk")):
        if score < upper:
            return label
    return "dangerous"
# One line per category (alphabetical): the sample count followed by the
# non-empty threat bands in ascending severity order.
for category in sorted(bycat):
    members = bycat[category]
    tally = collections.Counter(band(m.get("threat_score")) for m in members)
    shown = [f"{name}:{tally[name]}"
             for name in ("clean", "low", "suspicious", "high-risk", "dangerous")
             if tally[name]]
    print(f"  {category:16} n={len(members):4}  " + "  ".join(shown))

# False-positive candidates: ok rows from the corpora treated as benign whose
# threat score is 150 or more, listed highest score first (top 40 only).
fp = []
for row in ok:
    if cat(row["file"]) not in ("pdfjs_realworld", "clean_docs", "pdf20"):
        continue
    if (row.get("threat_score") or 0) >= 150:
        fp.append(row)
fp.sort(key=lambda row: -(row.get("threat_score") or 0))
print(f"\n=== FALSE-POSITIVE CANDIDATES (benign file, threat>=150): {len(fp)} ===")
for row in fp[:40]:
    score = row.get("threat_score")
    label = band(score)
    # Basename only, truncated and padded to a fixed 46-character column.
    name = row["file"].split("/")[-1][:46]
    print(f"  threat={score:4} {label:10} {name:46} "
          f"exec={row.get('has_exec_vector')} {row.get('high_inds')}")

# FALSE NEGATIVES: live malware scoring low/clean on THREAT axis.
# A missing or None threat_score is treated as 0 throughout this section.
mal = [r for r in ok if cat(r["file"]) == "malware"]
mal_miss = [r for r in mal if (r.get("threat_score") or 0) < 30]
mal_miss.sort(key=lambda r: (r.get("threat_score") or 0))
print(f"\n=== MALWARE FALSE-NEGATIVE CANDIDATES (threat<30 of {len(mal)} live samples): {len(mal_miss)} ===")
for r in mal_miss[:60]:
    # Rows with no score are admitted by the filter above (None counts as 0),
    # but None rejects the ":4" width spec with a TypeError, so show it as 0
    # here as well. Real scores are printed unchanged.
    score = r.get("threat_score")
    if score is None:
        score = 0
    print(f"  threat={score:4} {band(score):10} "
          f"{r['file'].split('/')[-1][:30]:30} exec={r.get('has_exec_vector')} "
          f"nind={r.get('n_indicators')} {r.get('high_inds')}")
if mal:
    # Summary is skipped when there are no malware samples, which also avoids
    # dividing by zero and taking the median of an empty list.
    import statistics
    ts = [r.get('threat_score') or 0 for r in mal]
    detected = sum(1 for t in ts if t >= 30)
    print(f"\n  MALWARE DETECTION: {detected}/{len(mal)} >= threat 30 ({100*detected/len(mal):.1f}%); "
          f"median threat={statistics.median(ts):.0f}; >=150: {sum(1 for t in ts if t>=150)}")
