#!/usr/bin/env python3
"""
Extract a huntr submission "paste bundle" from a report markdown file in this repo.

Usage:
  python3 scripts/huntr_paste.py reports/MLflow-FileRead-2026-02-12.md
"""

from __future__ import annotations

import argparse
import re
from pathlib import Path


def _extract_h1(text: str) -> str | None:
    for line in text.splitlines():
        if line.startswith("# "):
            return line[2:].strip()
    return None


def _extract_meta(text: str) -> dict[str, str]:
    # Parse lines like "key: value" under "## meta" until the next "## " header.
    m = re.search(r"(?m)^## meta\s*$", text)
    if not m:
        return {}
    rest = text[m.end() :]
    # Stop at the next section header OR the start of the copy-paste fenced bundle.
    end = re.search(r"(?m)^(##\s+\S|```+)", rest)
    block = rest[: end.start()] if end else rest

    meta: dict[str, str] = {}
    for line in block.splitlines():
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        if ":" not in line:
            continue
        k, v = line.split(":", 1)
        meta[k.strip()] = v.strip()
    return meta


def _extract_inline_meta(text: str) -> dict[str, str]:
    """
    Support the newer lightweight report header format used in this repo:

      STATUS: DRAFT
      program: X | platform: huntr | repo: org/name | commit: abc123
    """
    head = "\n".join(text.splitlines()[:20])
    out: dict[str, str] = {}

    m = re.search(r"(?mi)^STATUS:\s*(.+?)\s*$", head)
    if m:
        out["status"] = m.group(1).strip()

    m = re.search(r"(?mi)^program:\s*(.+?)\s*$", head)
    if m:
        parts = [p.strip() for p in m.group(1).split("|")]
        # First component is typically the program name; remaining are key/value pairs.
        if parts:
            out["program"] = parts[0]
        for p in parts[1:]:
            if ":" not in p:
                continue
            k, v = p.split(":", 1)
            out[k.strip().lower()] = v.strip()
    return out


def _extract_first_fenced_block(after_header: str) -> str | None:
    # Find the first backtick-fenced block after a header. Supports both ``` and ````.
    start = re.search(r"(?m)^(```+)[^\n]*\n", after_header)
    if not start:
        return None
    fence = start.group(1)
    rest = after_header[start.end() :]
    end = re.search(rf"(?m)^{re.escape(fence)}\s*$", rest)
    if not end:
        return None
    return rest[: end.start()]


def _extract_section_fence(text: str, header_re: str) -> str | None:
    """
    Return the first fenced block that follows the header matched by `header_re`.

    `header_re` is searched in multi-line mode. Returns None when the header is
    absent or when no complete fenced block follows it.
    """
    header = re.compile(header_re, re.MULTILINE).search(text)
    if header is None:
        return None
    following = text[header.end() :]
    return _extract_first_fenced_block(following)


def _extract_section_body(text: str, header_re: str, *, stop_re: str = r"(?m)^#{1,6}\s+\S") -> str | None:
    """
    Extract the plain (non-fenced) body text after a header until the next header of same/higher level.
    Used for newer report templates where Description/Impact are not inside fenced blocks.
    """
    m = re.search(header_re, text, flags=re.MULTILINE)
    if not m:
        return None
    rest = text[m.end() :]
    # Stop at the next header line (by default, any header). For the newer
    # report template, we often want to stop at the next same-level `### ...`
    # section so that inner "# Description" / "# Proof of Concept" headings
    # remain part of the extracted body.
    end = re.search(stop_re, rest)
    body = rest[: end.start()] if end else rest
    return body.strip() or None


def _parse_colon_fields(block: str) -> dict[str, str]:
    # Parse "Field: value" lines. For multi-line fields (CVSS/Occurrences/References),
    # keep them as raw by letting the caller print the full block.
    out: dict[str, str] = {}
    for line in block.splitlines():
        if not line.strip():
            continue
        if ":" not in line:
            continue
        k, v = line.split(":", 1)
        k = k.strip()
        v = v.strip()
        if k and v and k not in out:
            out[k] = v
    return out


def _iter_fenced_blocks(text: str) -> list[str]:
    """
    Return all backtick-fenced blocks (```...``` or ````...````), in file order.
    """
    blocks: list[str] = []
    for m in re.finditer(r"(?m)^(```+)[^\n]*\n", text):
        fence = m.group(1)
        end = re.search(rf"(?m)^{re.escape(fence)}\s*$", text[m.end() :])
        if not end:
            continue
        blocks.append(text[m.end() : m.end() + end.start()])
    return blocks


def _extract_paste_bundle(text: str) -> str | None:
    """
    Locate the copy-pasteable huntr bundle in a new-format report.

    The bundle is the first fenced block containing all of the core huntr
    field labels ("Package Manager:", "Title:", "Description:"). It is returned
    with trailing whitespace removed, or None when no block qualifies.
    """
    required_labels = ("Package Manager:", "Title:", "Description:")
    candidates = (
        candidate.rstrip()
        for candidate in _iter_fenced_blocks(text)
        if all(label in candidate for label in required_labels)
    )
    return next(candidates, None)


def _parse_huntr_bundle(block: str) -> dict[str, str]:
    """
    Parse a copy-paste bundle and extract known huntr form fields reliably,
    including multi-line sections like CVSS/Occurrences/References and the
    Description markdown body.
    """
    known = {
        "Repository URL",
        "Repository",
        "Package Manager",
        "Version Affected",
        "Vulnerability Type",
        "CVSS",
        "Title",
        "Impact",
        "Description",
        "Occurrences",
        "References",
    }

    lines = block.splitlines()
    starts: list[tuple[str, int, str]] = []
    for i, line in enumerate(lines):
        if not line or line.startswith(" "):
            continue
        if ":" not in line:
            continue
        k, v = line.split(":", 1)
        k = k.strip()
        if k in known:
            starts.append((k, i, v.lstrip()))

    if not starts:
        return {}

    # Turn starts into slices.
    out: dict[str, str] = {}
    for idx, (k, i, v0) in enumerate(starts):
        j = starts[idx + 1][1] if idx + 1 < len(starts) else len(lines)
        body_lines = lines[i + 1 : j]

        if k == "Description":
            out["Description Markdown"] = "\n".join(body_lines).strip()
            out["Description"] = v0.strip()  # usually empty
            continue

        if k in {"CVSS", "Occurrences", "References"}:
            raw = [f"{k}: {v0}".rstrip()]
            raw.extend(body_lines)
            out[f"{k} (Raw)"] = "\n".join(raw).rstrip()
            continue

        out[k] = v0.strip()

    # Normalize repo key
    if "Repository URL" in out and "Repository" not in out:
        out["Repository"] = out["Repository URL"]

    return out


def _extract_markdown_section(md: str, heading: str) -> str | None:
    """
    Extract a markdown section body by heading name (case-insensitive).
    Supports headings like:
      ## Impact
      # Impact
      ### Impact
    Returns the body until the next heading of same or higher level.
    """
    if not md:
        return None

    target = heading.strip().lower()
    lines = md.splitlines()

    start_idx = None
    level = None
    for i, line in enumerate(lines):
        s = line.lstrip()
        if not s.startswith("#"):
            continue
        # Count heading level and normalize title.
        hashes = len(s) - len(s.lstrip("#"))
        title = s[hashes:].strip().lower()
        if title == target:
            start_idx = i + 1
            level = hashes
            break

    if start_idx is None or level is None:
        return None

    end_idx = len(lines)
    for j in range(start_idx, len(lines)):
        s = lines[j].lstrip()
        if not s.startswith("#"):
            continue
        hashes = len(s) - len(s.lstrip("#"))
        if hashes <= level:
            end_idx = j
            break

    body = "\n".join(lines[start_idx:end_idx]).strip()
    return body or None


def main() -> int:
    """
    Command-line entry point: print the huntr form fields found in one report.

    Reads the markdown file given as the single positional argument, then
    prints labelled "=== ... ===" sections to stdout: the report path, title,
    meta, and each huntr field. Three report layouts are tried in order: the
    copy-paste bundle, the `## huntr_submission` fenced block, and the
    `## huntr Form Fields` fenced block. If none is present a "missing"
    notice is printed instead of the field sections.

    Returns 0 in every case, including when no huntr data is found.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("report_md", type=Path)
    args = ap.parse_args()

    path: Path = args.report_md
    # Undecodable bytes are replaced rather than raising, so odd files still print.
    text = path.read_text("utf-8", errors="replace")

    title = _extract_h1(text) or "(missing title)"
    # Prefer the `## meta` section; fall back to the inline header when it yields nothing.
    meta = _extract_meta(text) or _extract_inline_meta(text)

    # Probe for every supported layout up front; the branch below picks one.
    bundle_block = _extract_paste_bundle(text)
    huntr_block = _extract_section_fence(text, r"(?m)^## huntr_submission\s*$")
    # Legacy layout pieces: a `## huntr Form Fields` fenced block and a fenced
    # block under a `### Description field...` header.
    huntr_form_fields = _extract_section_fence(text, r"(?m)^## huntr Form Fields\s*$")
    desc_md = _extract_section_fence(text, r"(?m)^### Description field.*\s*$")

    print(f"REPORT: {path}")
    print()
    print("=== Title ===")
    print(title)
    print()

    if meta:
        print("=== Meta ===")
        # Well-known keys first, in a fixed order; then any remaining keys alphabetically.
        for k in ("platform", "program", "asset", "date", "status"):
            if k in meta:
                print(f"{k}: {meta[k]}")
        for k in sorted(set(meta) - {"platform", "program", "asset", "date", "status"}):
            print(f"{k}: {meta[k]}")
        print()

    # Preferred (current): copy-paste bundle fenced block after meta.
    # Legacy A: `## huntr_submission` fenced block contains everything.
    # Legacy B: `## huntr Form Fields` fenced block + separate sections.
    # Each branch sets the same names (fields, *_fallback, cvss_source) that
    # the printing code below reads.
    if bundle_block:
        fields = _parse_huntr_bundle(bundle_block)
        desc_fallback = fields.get("Description Markdown") or None
        # Impact/Occurrences/References may live as headings inside the
        # description markdown instead of as top-level bundle fields.
        impact_fallback = _extract_markdown_section(desc_fallback or "", "Impact")
        occurrences_fallback = fields.get("Occurrences (Raw)") or None
        if not occurrences_fallback:
            occ = _extract_markdown_section(desc_fallback or "", "Occurrences")
            if occ:
                occurrences_fallback = "Occurrences:\n" + occ

        references_fallback = fields.get("References (Raw)") or None
        if not references_fallback:
            refs = _extract_markdown_section(desc_fallback or "", "References")
            if refs:
                references_fallback = "References:\n" + refs
        cvss_source = fields.get("CVSS (Raw)") or ""
    elif huntr_block:
        fields = _parse_colon_fields(huntr_block)
        impact_fallback = None
        # Prefer the legacy fenced markdown, but allow the newer `### Description`
        # section body as fallback (many reports use that even with huntr_submission).
        desc_fallback = desc_md or _extract_section_body(
            text, r"(?m)^### Description\s*$", stop_re=r"(?m)^###\s+\S"
        )
        # Left unset: print_raw_section() scans huntr_block directly for these.
        occurrences_fallback = None
        references_fallback = None
        cvss_source = huntr_block
    elif huntr_form_fields:
        fields = _parse_colon_fields(huntr_form_fields)
        # Bodies stop only at the next `### ` header so inner `#` headings stay included.
        impact_fallback = _extract_section_body(text, r"(?m)^### Impact\s*$", stop_re=r"(?m)^###\s+\S")
        desc_fallback = _extract_section_body(text, r"(?m)^### Description\s*$", stop_re=r"(?m)^###\s+\S")
        occurrences_fallback = _extract_section_fence(text, r"(?m)^### Occurrences\s*$")
        references_fallback = _extract_section_fence(text, r"(?m)^### References\s*$")
        cvss_source = huntr_form_fields
    else:
        print("=== huntr_submission ===")
        print("(missing huntr paste bundle / `## huntr_submission` / `## huntr Form Fields` in this report)")
        return 0

    def show(label: str, key: str) -> None:
        """Print one single-line field under `label`, or "(missing)" if `key` is absent."""
        print(f"=== {label} ===")
        print(fields.get(key, "(missing)"))
        print()

    show("Repository URL", "Repository")
    show("Package Manager", "Package Manager")
    show("Version Affected", "Version Affected")
    show("Vulnerability Type", "Vulnerability Type")

    # CVSS is usually multi-line; print the raw subsection.
    print("=== CVSS (Raw) ===")
    if bundle_block:
        # The bundle parser already isolated the CVSS lines.
        print((cvss_source or "").strip() or "(missing)")
    else:
        # Legacy layouts: collect from the "CVSS:" line up to the next
        # unindented "Name:" line.
        cvss_lines: list[str] = []
        in_cvss = False
        for line in cvss_source.splitlines():
            if line.strip().startswith("CVSS:"):
                in_cvss = True
                cvss_lines.append(line)
                continue
            if in_cvss and re.match(r"^[A-Za-z][A-Za-z0-9 _-]*:\s*", line):
                # next top-level field
                break
            if in_cvss:
                cvss_lines.append(line)
        print("\n".join(cvss_lines).strip() or "(missing)")
    print()

    show("Title (Huntr Field)", "Title")
    print("=== Impact ===")
    # An explicit Impact field wins; otherwise use the section found elsewhere.
    print(fields.get("Impact") or impact_fallback or "(missing)")
    print()

    print("=== Description (Huntr Field) ===")
    print(fields.get("Description", "(missing)"))
    print()

    print("=== Description Markdown (If Present) ===")
    if desc_fallback:
        print(desc_fallback.rstrip())
    else:
        print("(missing description body)")
    print()

    # Occurrences and References are multi-line; print raw sections.
    def print_raw_section(section_name: str) -> None:
        """
        Print the raw "Occurrences" or "References" section.

        Uses the value chosen by the layout branch above when there is one;
        otherwise scans the `## huntr_submission` block for a
        "<section_name>:" line and the lines that follow it.
        """
        print(f"=== {section_name} (Raw) ===")
        if section_name == "Occurrences" and occurrences_fallback:
            print(occurrences_fallback.rstrip())
            print()
            return
        if section_name == "References" and references_fallback:
            print(references_fallback.rstrip())
            print()
            return
        if bundle_block:
            # Bundle layout was selected and neither the field nor a heading supplied it.
            print("(not present in bundle)")
            print()
            return
        lines: list[str] = []
        in_section = False
        # Only the `## huntr_submission` layout keeps these sections inside its fenced block.
        if not huntr_block:
            print("(missing)")
            print()
            return
        for line in huntr_block.splitlines():
            if line.strip().startswith(f"{section_name}:"):
                in_section = True
                lines.append(line)
                continue
            if in_section and re.match(r"^[A-Za-z][A-Za-z0-9 _-]*:\s*", line):
                # An unindented "Name:" line starts the next field.
                break
            if in_section:
                lines.append(line)
        print("\n".join(lines).strip() or "(missing)")
        print()

    print_raw_section("Occurrences")
    print_raw_section("References")

    return 0


# Run only when executed as a script; main()'s return value becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())