#!/usr/bin/env python3
"""Bulk forensic-scan harness. Resource-limited, timeout-resilient.
A single file timing out, crashing, OOM-ing, or returning garbage NEVER stops
the run — it is recorded and the pool moves to the next file. Results stream to
a JSONL file (flushed per line) so partial progress always survives."""
import json, os, sys, subprocess, tempfile, shutil, time
from concurrent.futures import ThreadPoolExecutor, as_completed

# Command line: <root> [<root> ...] <out.jsonl>
# Every argument except the last is a root handed to find_pdfs(); the last one
# is the JSONL file that results are appended to.
ROOTS = sys.argv[1:-1]
OUT   = sys.argv[-1]
# Size of the scan thread pool, taken from $SCAN_WORKERS (default 2).  An empty
# value counts as unset and anything below 1 is raised to 1, because
# ThreadPoolExecutor rejects max_workers <= 0 with ValueError.
WORKERS = max(1, int(os.environ.get('SCAN_WORKERS') or '2'))
WALL = 75           # seconds; python-side backstop; wrapper kills at 60

def find_pdfs(roots):
    """Return the sorted, de-duplicated paths of all PDFs reachable from *roots*.

    Args:
        roots: Iterable of paths.  A directory is walked recursively; a regular
            file is taken as-is.  Roots that do not exist contribute nothing.

    Returns:
        A sorted list of paths whose file name ends in ``.pdf``
        (case-insensitive).  Directory hits are ``os.path.join(dirpath, name)``
        as produced by ``os.walk``; a file root is returned exactly as given.
    """
    found = set()
    for root in roots:
        if os.path.isfile(root):
            # os.walk() yields nothing for a non-directory, so without this
            # branch a PDF named directly as a root would be silently dropped.
            if root.lower().endswith('.pdf'):
                found.add(root)
            continue
        for dirpath, _, names in os.walk(root):
            found.update(os.path.join(dirpath, name)
                         for name in names
                         if name.lower().endswith('.pdf'))
    return sorted(found)

def scan(path, scanner=("/tmp/scan_one.sh",), wall=None):
    """Run the external scanner on one file and return a flat summary record.

    The scanner is invoked as ``<scanner...> path r.json p.json``, where both
    JSON paths live in a private temp directory that is removed before this
    function returns.  Only ``r.json`` is read here.

    Args:
        path: File to scan.
        scanner: Command prefix placed before the three positional arguments;
            either a single string or a sequence of argv items.
        wall: Subprocess timeout in seconds.  ``None`` means the module-level
            ``WALL``.

    Returns:
        A dict that always carries ``file``, ``size`` (``None`` when the file
        could not be stat'ed) and ``status``.  ``status`` is one of:

        * ``"ok"`` - ``r.json`` parsed; selected score fields, ``n_indicators``,
          ``high_inds`` (at most 8 keys) and ``engines_run`` are copied over.
        * ``"timeout"`` - the subprocess exceeded the timeout or exited 124/137.
        * ``"no_output"`` - no ``r.json`` was produced; ``rc`` and the last 400
          characters of stderr are attached.
        * ``"bad_json"`` - ``r.json`` is unreadable, not valid JSON, or not a
          JSON object; ``err`` holds a short reason.
        * ``"harness_error"`` - any other exception; ``err`` holds its repr.

    Apart from a failure to create the temp directory, errors are reported
    through ``status`` rather than raised.
    """
    rec = {"file": path, "size": None, "status": None}
    try:
        rec["size"] = os.path.getsize(path)
    except (OSError, ValueError):
        pass  # size is informational only; leave it as None
    cmd = [scanner] if isinstance(scanner, str) else list(scanner)
    d = tempfile.mkdtemp(prefix="scan_")
    rj = os.path.join(d, "r.json")
    pj = os.path.join(d, "p.json")
    try:
        try:
            p = subprocess.run(cmd + [path, rj, pj], capture_output=True,
                               timeout=WALL if wall is None else wall)
        except subprocess.TimeoutExpired:
            rec["status"] = "timeout"
            return rec
        rc = p.returncode
        # NOTE(review): 124 is what timeout(1) exits with and 137 is 128+SIGKILL
        # as reported by a shell - presumably how the wrapper signals its own
        # kill; confirm against the wrapper script.
        if rc in (124, 137):
            rec["status"] = "timeout"
            return rec
        if not os.path.exists(rj):
            rec["status"] = "no_output"
            rec["rc"] = rc
            rec["stderr"] = (p.stderr.decode("utf-8", "replace")[-400:] if p.stderr else "")
            return rec
        try:
            # Binary mode: json detects UTF-8/16/32 itself, so the result does
            # not depend on the locale encoding, and the handle is closed.
            with open(rj, "rb") as fh:
                r = json.load(fh)
        except Exception as e:
            rec["status"] = "bad_json"
            rec["err"] = str(e)[:200]
            return rec
        if not isinstance(r, dict):
            # Valid JSON but not an object: report it as bad scanner output
            # instead of tripping over r.get() below.
            rec["status"] = "bad_json"
            rec["err"] = f"top-level JSON value is {type(r).__name__}, expected object"
            return rec
        rec["status"] = "ok"
        for k in ("threat_score", "risk_score", "deception_score", "structural_score",
                  "risk_level", "verdict_driver", "has_exec_vector", "aggregate_score"):
            rec[k] = r.get(k)
        inds = r.get("indicators", []) or []
        rec["n_indicators"] = len(inds)
        # Keep the record small: only the first 8 high/critical indicator keys.
        rec["high_inds"] = [i.get("key") for i in inds
                            if isinstance(i, dict) and i.get("risk") in ("high", "critical")][:8]
        rec["engines_run"] = r.get("engines_run")
        return rec
    except Exception as e:
        # Last-resort net so that one bad file never takes the worker down.
        rec["status"] = "harness_error"
        rec["err"] = repr(e)[:200]
        return rec
    finally:
        shutil.rmtree(d, ignore_errors=True)

def main():
    """Scan every not-yet-recorded PDF under ROOTS, appending one JSON line each to OUT.

    Files whose path already appears as ``"file"`` in a parseable line of OUT
    are skipped, so an interrupted run can simply be restarted.  Each result is
    written and flushed as soon as its scan finishes; a progress line is
    printed every 50 files and at the end.
    """
    files = find_pdfs(ROOTS)

    # Resume: collect the paths already recorded in OUT.
    seen = set()
    torn_tail = False
    if os.path.exists(OUT):
        with open(OUT) as prev:
            for line in prev:
                # True only if the final line lacks its newline, i.e. the
                # previous run died in the middle of a write.
                torn_tail = not line.endswith("\n")
                try:
                    seen.add(json.loads(line).get("file"))
                except Exception:
                    # Deliberately broad: a torn or foreign line is skipped,
                    # which just means that file gets scanned again.
                    pass
    files = [f for f in files if f not in seen]
    total = len(files)
    print(f"[harness] {total} pdfs to do ({len(seen)} already done) -> {OUT}", flush=True)

    done = 0
    t0 = time.time()
    counts = {}
    with open(OUT, "a") as fh, ThreadPoolExecutor(max_workers=WORKERS) as ex:
        if torn_tail:
            # Terminate the torn fragment so the first new record starts on
            # its own line instead of being glued to it and lost.
            fh.write("\n")
            fh.flush()
        futs = {ex.submit(scan, f): f for f in files}
        try:
            for fut in as_completed(futs):
                try:
                    rec = fut.result()
                except Exception as e:
                    rec = {"file": futs[fut], "status": "future_error", "err": repr(e)[:200]}
                fh.write(json.dumps(rec) + "\n")
                fh.flush()
                counts[rec.get("status")] = counts.get(rec.get("status"), 0) + 1
                done += 1
                if done % 50 == 0 or done == total:
                    el = time.time() - t0
                    print(f"[{done}/{total}] {el:.0f}s {dict(sorted(counts.items()))}", flush=True)
        finally:
            # No-op after a normal run.  On Ctrl-C or a write failure this drops
            # the queued scans, which the executor would otherwise still run
            # (and discard) while the `with` block waits for shutdown.
            for fut in futs:
                fut.cancel()
    print(f"[harness] DONE {done}/{total} in {time.time()-t0:.0f}s :: {dict(sorted(counts.items()))}", flush=True)

# Start the scan only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
