#!/usr/bin/env python3
"""Generate candidate HackerOne codebase targets from bounty-targets-data.

Reads: data/repos/bounty-targets-data/data/hackerone_data.json
Outputs: JSON list of SOURCE_CODE GitHub repos that are in-scope and bounty-eligible.

Usage:
  python3 scripts/hackerone_scout.py list --limit 50
  python3 scripts/hackerone_scout.py export --out data/hackerone_codebases.json
  python3 scripts/hackerone_scout.py add-to-targets --targets data/targets.json --max-add 10 --keyword ai,llm,agent

Notes:
- HackerOne scopes are defined per program, but this script focuses only on GitHub repos
  explicitly listed as SOURCE_CODE targets.
"""

from __future__ import annotations

import argparse
import json
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Any

# Directory one level above the folder containing this script. The usage
# examples invoke it as scripts/hackerone_scout.py, which would make this the
# repository root (assumption -- confirm against the actual checkout layout).
ROOT = Path(__file__).resolve().parents[1]
# Expected location of the HackerOne program dump inside a local clone of
# bounty-targets-data placed under data/repos/.
BTTD = ROOT / "data" / "repos" / "bounty-targets-data" / "data" / "hackerone_data.json"


@dataclass(frozen=True)
class Codebase:
    """One in-scope source-code asset taken from a HackerOne program entry.

    Instances are immutable (``frozen=True``); they are built by
    ``extract_codebases`` from the program/target dictionaries in the dump.
    """

    # Program "handle" value; empty string when the program has none.
    program_handle: str
    # Program "name" value; empty string when the program has none.
    program_name: str
    # Asset identifier after normalisation by ``_norm_repo``.
    repo_url: str
    # Target "asset_type" value (only "SOURCE_CODE" targets are extracted).
    asset_type: str
    # Truthiness of the target's "eligible_for_bounty" flag.
    eligible_for_bounty: bool
    # Target "max_severity" value, passed through unchanged; may be None.
    max_severity: str | None


def _load_h1() -> list[dict[str, Any]]:
    """Read and decode the HackerOne dump stored at ``BTTD``.

    Returns:
        The decoded JSON document. It is annotated as a list of program
        dictionaries, but the shape is not validated here.

    Raises:
        SystemExit: if ``BTTD`` does not exist.
    """
    if BTTD.exists():
        # Undecodable bytes are replaced rather than aborting the whole run.
        raw = BTTD.read_text("utf-8", errors="replace")
        return json.loads(raw)
    raise SystemExit(f"Missing {BTTD} (clone bounty-targets-data first)")


def _norm_repo(url: str) -> str:
    url = url.strip()
    url = url.rstrip("/")
    # Normalize github URLs like https://github.com/org/repo/tree/main/... -> https://github.com/org/repo
    m = re.match(r"^(https?://github\.com/[^/]+/[^/]+)(?:/.*)?$", url, flags=re.I)
    if m:
        return m.group(1)
    return url


def extract_codebases(*, offers_bounties_only: bool = True) -> list[Codebase]:
    """Collect the GitHub ``SOURCE_CODE`` assets listed in scope by HackerOne programs.

    Args:
        offers_bounties_only: When true (the default), keep only programs whose
            ``offers_bounties`` flag is set and only assets whose
            ``eligible_for_bounty`` flag is set. When false, neither filter is
            applied, so assets of non-bounty (VDP) programs are returned too;
            ``Codebase.eligible_for_bounty`` tells them apart.

    Returns:
        One ``Codebase`` per distinct normalised repo URL, in dump order. When
        a repo appears more than once, the first occurrence wins.

    Raises:
        SystemExit: propagated from ``_load_h1`` when the dump is missing.
    """
    seen: set[str] = set()
    out: list[Codebase] = []

    for p in _load_h1():
        # Tolerate malformed entries instead of crashing on attribute access.
        if not isinstance(p, dict):
            continue
        if offers_bounties_only and not p.get("offers_bounties"):
            continue

        scope = p.get("targets")
        targets = scope.get("in_scope") if isinstance(scope, dict) else None
        if not isinstance(targets, list):
            continue

        for t in targets:
            if not isinstance(t, dict) or t.get("asset_type") != "SOURCE_CODE":
                continue
            ident = t.get("asset_identifier")
            if not isinstance(ident, str) or "github.com" not in ident.lower():
                continue

            eligible = bool(t.get("eligible_for_bounty"))
            # The eligibility filter only applies in bounty-only mode. Applying
            # it unconditionally would discard the assets of the non-bounty
            # programs that the caller explicitly asked to include.
            if offers_bounties_only and not eligible:
                continue

            repo_url = _norm_repo(ident)
            # De-duplicate on the normalised URL, keeping the first occurrence.
            if repo_url in seen:
                continue
            seen.add(repo_url)

            out.append(
                Codebase(
                    program_handle=p.get("handle") or "",
                    program_name=p.get("name") or "",
                    repo_url=repo_url,
                    asset_type=t.get("asset_type") or "",
                    eligible_for_bounty=eligible,
                    max_severity=t.get("max_severity"),
                )
            )

    return out


def _load_targets_json(path: Path) -> dict[str, Any]:
    obj = json.loads(path.read_text("utf-8", errors="replace"))
    if not isinstance(obj, dict) or "targets" not in obj or not isinstance(obj["targets"], list):
        raise SystemExit(f"Unexpected targets.json shape: {path}")
    return obj


def _guess_clone_path(repo_url: str) -> str:
    """Return the absolute path under ``ROOT/data/repos`` for cloning ``repo_url``.

    Only the last path segment of the URL is used as the directory name.
    NOTE: repositories with the same name under different owners therefore
    map to the same path.
    """
    repo_name = repo_url.rstrip("/").rsplit("/", 1)[-1]
    clone_dir = ROOT / "data" / "repos" / repo_name
    return str(clone_dir.resolve())


def cmd_list(args: argparse.Namespace) -> int:
    """Print extracted codebases as ``handle<TAB>repo_url`` lines.

    Reads ``args.include_vdp``, ``args.keyword`` and ``args.limit``. When
    ``args.keyword`` is set, it is a case-insensitive regular expression
    searched in the repo URL, program handle and program name. At most
    ``args.limit`` rows are printed, followed by the total number of matches.

    Returns:
        0 on success.

    Raises:
        SystemExit: if ``args.keyword`` is not a valid regular expression.
    """
    codebases = extract_codebases(offers_bounties_only=not args.include_vdp)
    if args.keyword:
        try:
            kw = re.compile(args.keyword, re.I)
        except re.error as exc:
            # A bad user-supplied pattern is reported like the file's other
            # input errors instead of surfacing as a traceback.
            raise SystemExit(f"Invalid --keyword regex {args.keyword!r}: {exc}") from None
        codebases = [
            c
            for c in codebases
            if kw.search(c.repo_url) or kw.search(c.program_handle) or kw.search(c.program_name)
        ]

    for c in codebases[: args.limit]:
        print(f"{c.program_handle}\t{c.repo_url}")
    print(f"\n(total: {len(codebases)})")
    return 0


def cmd_export(args: argparse.Namespace) -> int:
    """Write the extracted codebases to ``args.out`` as a JSON array.

    Each element carries the program handle and name, the fixed platform tag
    ``"hackerone"``, the repo URL and the max severity. Keys are sorted, the
    output is indented by two spaces and ends with a newline.

    Returns:
        0 on success.
    """
    rows = []
    for cb in extract_codebases(offers_bounties_only=not args.include_vdp):
        rows.append(
            {
                "program_handle": cb.program_handle,
                "program_name": cb.program_name,
                "platform": "hackerone",
                "repo_url": cb.repo_url,
                "max_severity": cb.max_severity,
            }
        )

    destination = Path(args.out)
    serialized = json.dumps(rows, indent=2, sort_keys=True)
    destination.write_text(serialized + "\n")
    print(f"wrote {destination} ({len(rows)} repos)")
    return 0


def cmd_add_to_targets(args: argparse.Namespace) -> int:
    """Append not-yet-tracked codebases to the targets file at ``args.targets``.

    Reads ``args.targets``, ``args.include_vdp``, ``args.keyword`` and
    ``args.max_add``. ``args.keyword`` is a comma-separated list of literal
    substrings, OR-ed together and matched case-insensitively against the repo
    URL, program handle and program name. At most ``args.max_add`` entries are
    appended; a value of zero or less appends nothing. The file is rewritten
    even when nothing was added.

    Returns:
        0 on success.

    Raises:
        SystemExit: propagated from ``_load_targets_json`` when the targets
            file does not have the expected shape.
    """
    targets_path = Path(args.targets)
    obj = _load_targets_json(targets_path)

    def _dedupe_key(url: str) -> str:
        # Compare in normalised, lower-cased form so that a stored URL with a
        # trailing slash, a sub-path, or different letter case still counts
        # as the same repository.
        return _norm_repo(url).lower()

    existing = {
        _dedupe_key(t["repo_url"])
        for t in obj["targets"]
        if isinstance(t, dict) and isinstance(t.get("repo_url"), str)
    }

    codebases = extract_codebases(offers_bounties_only=not args.include_vdp)

    # Keyword filter is a comma-separated list, treated as OR.
    if args.keyword:
        pats = [p.strip() for p in args.keyword.split(",") if p.strip()]
        if pats:
            kw = re.compile("|".join(re.escape(p) for p in pats), re.I)
            codebases = [
                c
                for c in codebases
                if kw.search(c.repo_url) or kw.search(c.program_handle) or kw.search(c.program_name)
            ]

    added = 0
    for c in codebases:
        # Check the cap before appending so that --max-add 0 really adds nothing.
        if added >= args.max_add:
            break
        key = _dedupe_key(c.repo_url)
        if key in existing:
            continue
        obj["targets"].append(
            {
                "program": c.program_name or c.program_handle,
                "platform": "hackerone",
                "repo_url": c.repo_url,
                "clone_path": _guess_clone_path(c.repo_url),
                "language": None,
                "bounty_min": None,
                "bounty_max": None,
                "focus": ["auth", "injection", "ssrf", "rce", "idor"],
                "notes": f"Imported from bounty-targets-data (handle={c.program_handle}, max_severity={c.max_severity})",
            }
        )
        existing.add(key)
        added += 1

    # Write with the same encoding the file was read with.
    targets_path.write_text(json.dumps(obj, indent=2, sort_keys=False) + "\n", encoding="utf-8")
    print(f"updated {targets_path}: added {added}")
    return 0


def main() -> int:
    """Parse the command line and dispatch to the selected sub-command.

    Three sub-commands are registered: ``list``, ``export`` and
    ``add-to-targets``. Each stores its handler under ``func``; the handler is
    called with the parsed namespace.

    Returns:
        Whatever the selected handler returns.
    """
    parser = argparse.ArgumentParser()
    commands = parser.add_subparsers(dest="cmd", required=True)

    # list: print matching codebases to stdout.
    list_cmd = commands.add_parser("list")
    list_cmd.set_defaults(func=cmd_list)
    list_cmd.add_argument("--limit", type=int, default=50)
    list_cmd.add_argument("--keyword", type=str, default=None, help="regex")
    list_cmd.add_argument("--include-vdp", action="store_true", help="include non-bounty programs")

    # export: dump all codebases to a JSON file.
    export_cmd = commands.add_parser("export")
    export_cmd.set_defaults(func=cmd_export)
    export_cmd.add_argument("--out", type=str, required=True)
    export_cmd.add_argument("--include-vdp", action="store_true")

    # add-to-targets: merge new codebases into a targets file.
    add_cmd = commands.add_parser("add-to-targets")
    add_cmd.set_defaults(func=cmd_add_to_targets)
    add_cmd.add_argument("--targets", type=str, default="data/targets.json")
    add_cmd.add_argument("--max-add", type=int, default=10)
    add_cmd.add_argument("--keyword", type=str, default=None, help="comma-separated substrings")
    add_cmd.add_argument("--include-vdp", action="store_true")

    namespace = parser.parse_args()
    handler = namespace.func
    return handler(namespace)


# Run the CLI only when executed as a script; main()'s return value is passed
# to SystemExit and so becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
