#!/usr/bin/env python3
"""Extract top-paying HackerOne programs from a scraped export and map to code repos.

Input file is a semi-structured JSON-ish dump (may be invalid JSON). We parse it
by scanning for program blocks and extracting a small set of fields.

Usage:
  python3 scripts/hackerone_top_paying.py list --file hackerone-network-top-paying.jsonl --top 25
  python3 scripts/hackerone_top_paying.py repos --file hackerone-network-top-paying.jsonl --top 25 --max-repos 20
  python3 scripts/hackerone_top_paying.py add-to-targets --file hackerone-network-top-paying.jsonl --top 10 --max-add 20

Requires bounty-targets-data clone at:
  data/repos/bounty-targets-data/data/hackerone_data.json
"""

from __future__ import annotations

import argparse
import json
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Any

# Directory one level above the directory that contains this file. The usage
# lines in the module docstring invoke the script as scripts/<name>.py, in
# which case this is the project root.
ROOT = Path(__file__).resolve().parents[1]

# Expected location of the HackerOne dataset inside a bounty-targets-data clone.
BTTD_H1 = ROOT / "data" / "repos" / "bounty-targets-data" / "data" / "hackerone_data.json"


@dataclass
class Program:
    """A single HackerOne program as recovered from the scraped export.

    Every field except ``handle`` may be ``None`` when the corresponding key
    could not be found in the raw chunk.
    """

    # Program handle; used as the unique key when merging duplicate entries.
    handle: str
    # Display name, if one was found.
    name: str | None
    # hackerone.com URL of the program, if one was found.
    url: str | None
    # Three-letter currency code.
    currency: str | None
    # Lower and upper bounds of the bounty table.
    min_bounty: int | None
    max_bounty: int | None
    # Whether the program pays bounties (None when unknown).
    offers_bounties: bool | None
    # Campaign multiplier applied to critical findings (None when absent).
    critical_multiplier: float | None

    @property
    def effective_max_critical(self) -> float:
        """Return ``max_bounty`` scaled by the critical multiplier.

        A missing (or zero) ``max_bounty`` counts as 0 and a missing (or zero)
        ``critical_multiplier`` counts as 1.0.
        """
        base = float(self.max_bounty or 0)
        if self.critical_multiplier:
            return base * self.critical_multiplier
        return base * 1.0


def _read_text(path: Path) -> str:
    return path.read_text("utf-8", errors="replace")


def _extract_program_chunks(text: str) -> list[str]:
    """Return raw chunks that look like a program entry.

    The input file isn't reliable JSON, so we can't json.load() it. Instead we
    locate likely program objects and do brace-balanced extraction.
    """
    def extract_object(start: int) -> str | None:
        # Brace-balanced object extraction with basic JSON string handling.
        if start >= len(text) or text[start] != "{":
            return None
        depth = 0
        in_str = False
        esc = False
        for j in range(start, len(text)):
            ch = text[j]
            if in_str:
                if esc:
                    esc = False
                elif ch == "\\":
                    esc = True
                elif ch == '"':
                    in_str = False
                continue

            if ch == '"':
                in_str = True
                continue
            if ch == "{":
                depth += 1
                continue
            if ch == "}":
                depth -= 1
                if depth == 0:
                    return text[start : j + 1]
                continue
        return None

    chunks: list[str] = []

    # Pretty-printed objects (line-started).
    for m in re.finditer(r'(?m)^[ \t]*\{[ \t]*"id"\s*:', text):
        obj = extract_object(m.start())
        if not obj:
            continue
        if '"handle"' not in obj:
            continue
        if '"minimum_bounty_table_value"' not in obj or '"maximum_bounty_table_value"' not in obj:
            continue
        chunks.append(obj)

    # Compact objects (inline).
    for m in re.finditer(r'\{\s*"id"\s*:\s*"', text):
        obj = extract_object(m.start())
        if not obj:
            continue
        if '"handle"' not in obj:
            continue
        if '"minimum_bounty_table_value"' not in obj or '"maximum_bounty_table_value"' not in obj:
            continue
        chunks.append(obj)

    # Deduplicate identical chunks.
    seen: set[str] = set()
    out: list[str] = []
    for c in chunks:
        if c in seen:
            continue
        seen.add(c)
        out.append(c)
    return out


def _g(chunk: str, pat: str) -> str | None:
    m = re.search(pat, chunk)
    return m.group(1) if m else None


def parse_programs(text: str) -> list[Program]:
    """Parse the raw export text into programs, highest-paying first.

    Each chunk found by ``_extract_program_chunks`` is mined with regexes for
    a handful of scalar fields. Chunks without a handle are skipped. When a
    handle occurs more than once, the entry with the strictly larger
    ``max_bounty`` wins (the earlier one is kept on ties). The result is
    ordered by ``effective_max_critical``, descending.
    """
    best_by_handle: dict[str, Program] = {}

    for chunk in _extract_program_chunks(text):
        handle = _g(chunk, r'"handle"\s*:\s*"([^"]+)"')
        if not handle:
            continue

        # Scalar fields; each regex takes the first occurrence in the chunk.
        name = _g(chunk, r'"name"\s*:\s*"([^"]+)"')
        url = _g(chunk, r'"url"\s*:\s*"(https://hackerone\.com/[^"]+)"')
        currency = _g(chunk, r'"currency"\s*:\s*"([a-zA-Z]{3})"')
        offers_raw = _g(chunk, r'"offers_bounties"\s*:\s*(true|false)')
        min_raw = _g(chunk, r'"minimum_bounty_table_value"\s*:\s*(\d+)')
        max_raw = _g(chunk, r'"maximum_bounty_table_value"\s*:\s*(\d+)')
        # The campaign object may carry a multiplier for critical findings.
        crit_raw = _g(chunk, r'"campaign"\s*:\s*\{[^}]*?"critical"\s*:\s*([0-9.]+)')

        candidate = Program(
            handle=handle,
            name=name,
            url=url,
            currency=currency.lower() if currency else None,
            min_bounty=int(min_raw) if min_raw else None,
            max_bounty=int(max_raw) if max_raw else None,
            offers_bounties=(offers_raw == "true") if offers_raw else None,
            critical_multiplier=float(crit_raw) if crit_raw else None,
        )

        # Only replace an existing entry when the new one pays strictly more.
        current = best_by_handle.get(handle)
        if current and (candidate.max_bounty or 0) <= (current.max_bounty or 0):
            continue
        best_by_handle[handle] = candidate

    return sorted(
        best_by_handle.values(),
        key=lambda prog: prog.effective_max_critical,
        reverse=True,
    )


def load_bttd_h1() -> dict[str, dict[str, Any]]:
    """Load the bounty-targets-data HackerOne file and index it by handle.

    Non-dict entries are ignored. If a handle appears more than once, the
    last entry wins.

    Raises:
        SystemExit: if the data file does not exist.
    """
    if not BTTD_H1.exists():
        raise SystemExit(f"Missing {BTTD_H1} (clone bounty-targets-data first)")

    raw = BTTD_H1.read_text("utf-8", errors="replace")
    by_handle: dict[str, dict[str, Any]] = {}
    for entry in json.loads(raw):
        if isinstance(entry, dict):
            by_handle[entry.get("handle")] = entry
    return by_handle

def normalize_github_repo_url(asset_identifier: str) -> str | None:
    """Return normalized https://github.com/<owner>/<repo> or None if not a concrete repo.

    Accepted inputs (host and scheme matched case-insensitively):

    * ``git@github.com:owner/repo`` with an optional ``.git`` suffix;
    * ``github.com/owner/repo`` with an optional ``http://`` / ``https://``
      scheme and an optional ``www.`` prefix, followed by anything (extra path
      segments, ``?query``, ``#fragment``, trailing text).

    The result always uses the ``https`` scheme and the bare ``github.com``
    host, and never carries a ``.git`` suffix, so that different spellings of
    the same repository compare equal. Owner and repo keep their original
    casing.

    Returns ``None`` for empty/``None`` input, for anything that is not a
    GitHub owner/repo reference, and for wildcard identifiers containing ``*``.
    """
    s = (asset_identifier or "").strip()
    if not s:
        return None

    # git@github.com:owner/repo(.git)
    m = re.match(r"^git@github\.com:([^/\s]+)/([^/\s#?]+?)(?:\.git)?$", s, flags=re.I)
    if m is None:
        # [scheme://][www.]github.com/owner/repo(/...); owner and repo stop at
        # the first slash, whitespace, '?' or '#' so that query strings,
        # fragments and trailing notes do not end up in the repo name.
        m = re.match(
            r"^(?:https?://)?(?:www\.)?github\.com/([^/\s#?]+)/([^/\s#?]+)",
            s,
            flags=re.I,
        )
    if m is None:
        return None

    owner, repo = m.group(1), m.group(2)
    # Clone URLs end in ".git"; the canonical web URL does not.
    if repo.lower().endswith(".git"):
        repo = repo[: -len(".git")]
    if not repo or "*" in owner or "*" in repo:
        return None
    return f"https://github.com/{owner}/{repo}"


def source_code_repos_for(handle: str, by_handle: dict[str, dict[str, Any]]) -> list[str]:
    """Return the bounty-eligible GitHub repos in scope for ``handle``.

    Looks the program up in ``by_handle`` and walks its ``targets.in_scope``
    list, keeping entries whose ``asset_type`` is ``SOURCE_CODE``, that are
    ``eligible_for_bounty`` and whose identifier normalizes to a GitHub repo
    URL. Duplicates are dropped; first-seen order is preserved. An unknown
    handle or a program without in-scope targets yields an empty list.
    """
    program = by_handle.get(handle)
    if not program:
        return []

    in_scope = (program.get("targets") or {}).get("in_scope") or []

    # Dict keys act as an insertion-ordered set.
    unique: dict[str, None] = {}
    for target in in_scope:
        if not isinstance(target, dict):
            continue
        is_source = target.get("asset_type") == "SOURCE_CODE"
        if not (is_source and target.get("eligible_for_bounty")):
            continue
        identifier = (target.get("asset_identifier") or "").strip()
        repo_url = normalize_github_repo_url(identifier)
        if repo_url:
            unique.setdefault(repo_url, None)
    return list(unique)


def cmd_list(args: argparse.Namespace) -> int:
    """Handle the ``list`` subcommand.

    Parses ``args.file``, optionally keeps only programs that offer bounties,
    and prints one tab-separated line for each of the first ``args.top``
    programs, followed by the number of programs that remained after
    filtering. Returns 0 as the process exit status.
    """
    programs = parse_programs(_read_text(Path(args.file)))
    if args.offers_bounties_only:
        programs = list(filter(lambda prog: prog.offers_bounties, programs))

    for p in programs[: args.top]:
        # Missing values are shown as 0 / empty / x1.0 rather than "None".
        mx = p.max_bounty or 0
        cur = p.currency or ""
        mult = p.critical_multiplier or 1.0
        eff = p.effective_max_critical
        print(f"{p.handle}\tmax={mx}{cur}\tcrit_x{mult}\teff_max={eff:.0f}\t{p.name or ''}")

    print(f"\n(total parsed: {len(programs)})")
    return 0


def cmd_repos(args: argparse.Namespace) -> int:
    """Handle the ``repos`` subcommand.

    Walks the parsed programs in ranking order and, for each one that has at
    least one source-code repo in the bounty-targets-data file, prints a
    summary line followed by up to ``args.max_repos`` repo URLs. Stops once
    ``args.top`` programs have been printed. Returns 0 as the exit status.
    """
    programs = parse_programs(_read_text(Path(args.file)))
    if args.offers_bounties_only:
        programs = list(filter(lambda prog: prog.offers_bounties, programs))

    by_handle = load_bttd_h1()

    shown = 0
    for p in programs:
        repos = source_code_repos_for(p.handle, by_handle)
        if repos:
            mx = p.max_bounty or 0
            cur = p.currency or ""
            mult = p.critical_multiplier or 1.0
            print(f"{p.handle}\tmax={mx}{cur}\tcrit_x{mult}\tcode_repos={len(repos)}\t{p.name or ''}")
            for r in repos[: args.max_repos]:
                print(f"  {r}")
            shown += 1
            # Programs without repos do not count towards --top.
            if shown >= args.top:
                break

    if not shown:
        print("(no code repos found for the extracted programs)")
    return 0


def cmd_add_to_targets(args: argparse.Namespace) -> int:
    """Handle the ``add-to-targets`` subcommand.

    Takes the first ``args.top`` bounty-offering programs from ``args.file``,
    looks up their source-code repos in the bounty-targets-data file and
    appends one target entry per repo to ``data/targets.json`` under ROOT,
    skipping repos whose URL is already listed. At most ``args.max_add``
    entries are added (zero or a negative value adds nothing). The targets
    file is rewritten even when nothing was added. Returns 0 as exit status.

    Raises:
        SystemExit: from ``load_bttd_h1`` when the bounty-targets-data file
            is missing.
        FileNotFoundError: when ``data/targets.json`` does not exist.
    """
    text = _read_text(Path(args.file))
    programs = [p for p in parse_programs(text) if p.offers_bounties]

    by_handle = load_bttd_h1()

    targets_path = ROOT / "data" / "targets.json"
    obj = json.loads(targets_path.read_text("utf-8", errors="replace"))
    # Create the list if the file has no "targets" key yet, so appending
    # below cannot raise KeyError.
    targets = obj.setdefault("targets", [])
    existing = {t.get("repo_url") for t in targets if isinstance(t, dict)}

    added = 0
    for p in programs[: args.top]:
        # Check the cap before adding so that --max-add 0 really adds nothing.
        if added >= args.max_add:
            break
        for r in source_code_repos_for(p.handle, by_handle):
            if added >= args.max_add:
                break
            if r in existing:
                continue
            # NOTE(review): the clone directory is derived from the repo name
            # only, so two repos with the same name under different owners
            # would share a clone_path - confirm whether that can occur.
            repo_name = r.rstrip("/").split("/")[-1]
            targets.append(
                {
                    "program": p.name or p.handle,
                    "platform": "hackerone",
                    "repo_url": r,
                    "clone_path": str((ROOT / "data" / "repos" / repo_name).resolve()),
                    "language": None,
                    "bounty_min": p.min_bounty,
                    "bounty_max": p.max_bounty,
                    "focus": ["auth", "injection", "ssrf", "rce", "idor"],
                    "notes": f"Top-paying program scrape ({Path(args.file).name}): handle={p.handle}, max={p.max_bounty}{p.currency or ''}, crit_x{p.critical_multiplier or 1.0}",
                }
            )
            existing.add(r)
            added += 1

    # Write with the same encoding the file is read with.
    targets_path.write_text(json.dumps(obj, indent=2, sort_keys=False) + "\n", encoding="utf-8")
    print(f"updated {targets_path}: added {added}")
    return 0


def main() -> int:
    """Parse the command line and dispatch to the selected subcommand.

    Three subcommands are available: ``list``, ``repos`` and
    ``add-to-targets``. Each one stores its handler under ``func``; the
    handler's return value is returned as the exit status.
    """
    parser = argparse.ArgumentParser()
    commands = parser.add_subparsers(dest="cmd", required=True)

    # list: print the ranking only.
    list_cmd = commands.add_parser("list")
    list_cmd.set_defaults(func=cmd_list)
    list_cmd.add_argument("--file", required=True)
    list_cmd.add_argument("--top", type=int, default=25)
    list_cmd.add_argument("--offers-bounties-only", action="store_true")

    # repos: print the ranking together with each program's code repos.
    repos_cmd = commands.add_parser("repos")
    repos_cmd.set_defaults(func=cmd_repos)
    repos_cmd.add_argument("--file", required=True)
    repos_cmd.add_argument("--top", type=int, default=25, help="number of programs with code repos to print")
    repos_cmd.add_argument("--max-repos", type=int, default=20)
    repos_cmd.add_argument("--offers-bounties-only", action="store_true")

    # add-to-targets: append the repos to the targets file.
    add_cmd = commands.add_parser("add-to-targets")
    add_cmd.set_defaults(func=cmd_add_to_targets)
    add_cmd.add_argument("--file", required=True)
    add_cmd.add_argument("--top", type=int, default=10)
    add_cmd.add_argument("--max-add", type=int, default=20)

    parsed = parser.parse_args()
    handler = parsed.func
    return handler(parsed)


# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())