#!/usr/bin/env python3
import email.utils
import hashlib
import json
import os
import re
import subprocess
import time
import xml.etree.ElementTree as ET
from datetime import datetime, timedelta, timezone
from html import escape
from pathlib import Path
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

# --- Paths -----------------------------------------------------------------
# Directory containing this script; the feed list and state live next to it.
BASE_DIR = Path(__file__).resolve().parent
# JSON list of feed definitions read by load_sources().
SOURCES_PATH = BASE_DIR / 'feed-sources.json'
DATA_DIR = BASE_DIR / 'data'
# Directory every generated file is written to; generated links use the
# '/rss-offline/' URL prefix for files in it.
OUTPUT_DIR = Path('/var/www/rss-offline')

# --- Feed selection --------------------------------------------------------
# Upper bound on how many feed entries a single build processes.
MAX_ENTRIES = int(os.environ.get('RSS_OFFLINE_MAX_ENTRIES', '250'))
# Entries with a known timestamp older than this many days are skipped.
DAYS = int(os.environ.get('RSS_OFFLINE_DAYS', '14'))

# --- HTTP / capture --------------------------------------------------------
# Timeout passed to requests.get(); also the lower bound of the Chromium
# render subprocess timeout in render_page().
TIMEOUT = 20
HEADERS = {'user-agent': 'Mozilla/5.0 (rss-offline builder)'}
# Capture mode used when a source does not specify one; capture_page()
# recognises 'rendered' (headless Chromium) and 'simple' (plain HTTP GET).
DEFAULT_CAPTURE_MODE = os.environ.get('RSS_OFFLINE_CAPTURE_MODE', 'rendered').strip().lower() or 'rendered'
# Value handed to Chromium's --virtual-time-budget flag.
RENDER_BUDGET_MS = int(os.environ.get('RSS_OFFLINE_RENDER_BUDGET_MS', '8000'))
# Number of successful Chromium renders after which entries that prefer
# 'rendered' fall back to 'simple' for the rest of the build.
MAX_RENDERED_ENTRIES = int(os.environ.get('RSS_OFFLINE_MAX_RENDERED_ENTRIES', '8'))

# --- Asset mirroring (mirror_assets) ---------------------------------------
# Per-page limits: number of mirrored assets and their cumulative size.
MAX_ASSETS_PER_PAGE = int(os.environ.get('RSS_OFFLINE_MAX_ASSETS_PER_PAGE', '24'))
MAX_ASSET_BYTES = int(os.environ.get('RSS_OFFLINE_MAX_ASSET_BYTES', '1500000'))
# Path suffixes an <img> source must end with to be mirrored; stylesheet
# <link> elements are accepted by should_keep_asset() regardless of suffix.
ALLOWED_ASSET_EXTENSIONS = ('.css', '.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp', '.ico')
CHROMIUM_BIN = os.environ.get('RSS_OFFLINE_CHROMIUM_BIN', '/usr/bin/chromium')

# --- PDF generation --------------------------------------------------------
# Sub-directory of OUTPUT_DIR holding per-article printable HTML and PDFs.
PDF_DIRNAME = os.environ.get('RSS_OFFLINE_PDF_DIRNAME', 'pdf')
# Maximum number of articles included in one merged PDF.
PDF_MAX_ARTICLES = int(os.environ.get('RSS_OFFLINE_PDF_MAX_ARTICLES', '8'))
# File-name prefix of the merged PDF ('<basename>-<build stamp>.pdf').
PDF_BASENAME = os.environ.get('RSS_OFFLINE_PDF_BASENAME', 'rss-offline')
# Timeout passed to subprocess.run() for each Chromium print job.
PDF_PAGE_TIMEOUT = int(os.environ.get('RSS_OFFLINE_PDF_PAGE_TIMEOUT', '90'))
# Printable HTML larger than this (encoded bytes) is rejected before printing.
PDF_MAX_SOURCE_BYTES = int(os.environ.get('RSS_OFFLINE_PDF_MAX_SOURCE_BYTES', '2500000'))

# --- Persistent state ------------------------------------------------------
# JSON file mapping already-seen article links to a timestamp.
STATE_PATH = DATA_DIR / 'build-state.json'
# Seen-link records older than this many days are dropped by save_state().
STATE_KEEP_DAYS = int(os.environ.get('RSS_OFFLINE_STATE_KEEP_DAYS', '45'))


def fetch(url: str) -> requests.Response:
    """GET *url* using the shared headers and timeout.

    Returns the response object; ``raise_for_status`` turns 4xx/5xx replies
    into ``requests.HTTPError``, and transport errors propagate unchanged.
    """
    reply = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
    reply.raise_for_status()
    return reply


def text(node, path_options):
    """Return the stripped text of the first matching child that has any.

    *path_options* is tried in order with ``node.find``; children that are
    missing, empty, or whitespace-only are skipped. Returns ``''`` when no
    option yields text.
    """
    for candidate in path_options:
        match = node.find(candidate)
        if match is None:
            continue
        content = (match.text or '').strip()
        if content:
            return content
    return ''


def entry_links(item):
    """Return the article URL of a feed item, or ``''`` if none is found.

    Looks at plain ``<link>`` children first, then Atom-namespaced ones. The
    URL comes from the ``href`` attribute or, failing that, the element text.
    Only links whose ``rel`` is absent, empty, or ``alternate`` qualify.
    """
    atom_link = '{http://www.w3.org/2005/Atom}link'
    for tag in ('link', atom_link):
        for node in item.findall(tag):
            href = node.get('href') or (node.text or '').strip()
            if href and node.get('rel', 'alternate') in ('alternate', ''):
                return href
    return ''


def parse_timestamp(raw: str) -> int:
    """Parse a feed date string into a Unix timestamp (whole seconds).

    RFC 2822 dates (RSS ``pubDate``) are tried first, then ISO 8601 (Atom),
    with a trailing ``Z`` accepted as UTC.

    Args:
        raw: Date text from the feed; ``None`` and blank strings are allowed.

    Returns:
        Seconds since the epoch, or ``0`` when *raw* is empty or unparseable.
    """
    raw = (raw or '').strip()
    if not raw:
        return 0
    try:
        parsed = email.utils.parsedate_to_datetime(raw)
    except (TypeError, ValueError, IndexError, OverflowError):
        try:
            parsed = datetime.fromisoformat(raw.replace('Z', '+00:00'))
        except ValueError:
            return 0
    # Both parsers can return a naive datetime (RFC 2822 "-0000", ISO text
    # without an offset). ``timestamp()`` would read that as local time and
    # make the result depend on the build host, so pin it to UTC instead.
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=timezone.utc)
    try:
        return int(parsed.timestamp())
    except (OverflowError, OSError, ValueError):
        return 0


def slug(url: str) -> str:
    """Return a 16-hex-character identifier derived from the SHA-256 of *url*."""
    digest = hashlib.sha256(url.encode())
    return digest.hexdigest()[:16]


def asset_rel_path(entry_key: str, asset_url: str, content_type: str = '') -> str:
    """Build the output-relative path under which a mirrored asset is stored.

    The file name is the first 16 hex characters of the SHA-256 of
    *asset_url* plus an extension. The extension is the lower-cased suffix of
    the URL path when it has one; otherwise it is guessed from
    *content_type*, falling back to ``.bin``.
    """
    suffix = Path((urlparse(asset_url).path or '').lower()).suffix
    if not suffix:
        # Order matters: the first marker contained in the content type wins
        # (e.g. 'woff2' must be tested before 'woff').
        hints = (
            ('css', '.css'),
            ('javascript', '.js'),
            ('svg', '.svg'),
            ('png', '.png'),
            ('jpeg', '.jpg'),
            ('webp', '.webp'),
            ('gif', '.gif'),
            ('woff2', '.woff2'),
            ('woff', '.woff'),
            ('html', '.html'),
        )
        suffix = next((ext for marker, ext in hints if marker in content_type), '.bin')
    digest = hashlib.sha256(asset_url.encode()).hexdigest()[:16]
    return f'assets/{entry_key}/{digest}{suffix}'


def fetch_asset(url: str) -> requests.Response:
    """Open a streaming GET for *url* with the shared headers and timeout.

    The body is not downloaded up front (``stream=True``); the caller reads
    it and is responsible for closing the response. 4xx/5xx replies raise
    ``requests.HTTPError`` via ``raise_for_status``.
    """
    reply = requests.get(url, headers=HEADERS, timeout=TIMEOUT, stream=True)
    reply.raise_for_status()
    return reply


def should_keep_asset(node, parsed):
    """Decide whether the asset referenced by *node* should be mirrored.

    Stylesheet ``<link>`` elements are always kept. ``<img>`` elements are
    kept only when the URL path ends with an allowed extension. Every other
    element is rejected. *parsed* is the parsed absolute asset URL.
    """
    is_stylesheet = node.name == 'link' and 'stylesheet' in ' '.join(node.get('rel', [])).lower()
    if is_stylesheet:
        return True
    return node.name == 'img' and parsed.path.lower().endswith(ALLOWED_ASSET_EXTENSIONS)


def mirror_assets(entry, soup, entry_key: str):
    """Download same-host stylesheets/images and point *soup* at the copies.

    Walks ``<link href>`` and ``<img src>`` references in *soup*. References
    that resolve to http(s) URLs on the article's own host and pass
    ``should_keep_asset`` are saved under ``OUTPUT_DIR`` and the attribute is
    rewritten to the ``/rss-offline/...`` path of the copy. Mirroring stops
    once ``MAX_ASSETS_PER_PAGE`` files or ``MAX_ASSET_BYTES`` total bytes are
    reached. Best effort: a failing asset is reported and left untouched.

    Args:
        entry: Entry dict; ``entry['link']`` is the article URL.
        soup: Parsed page, modified in place.
        entry_key: Directory name for this article's assets.
    """
    page_origin = urlparse(entry['link']).netloc
    mirrored = {}
    total_bytes = 0
    count = 0

    def candidates():
        for tag, attr in [('link', 'href'), ('img', 'src')]:
            for node in soup.find_all(tag):
                ref = node.get(attr)
                if ref:
                    yield node, attr, ref

    for node, attr, ref in candidates():
        if count >= MAX_ASSETS_PER_PAGE or total_bytes >= MAX_ASSET_BYTES:
            break
        try:
            asset_url = urljoin(entry['link'], ref)
            parsed = urlparse(asset_url)
        except ValueError:
            # A malformed reference should not abort mirroring for the page.
            continue
        if parsed.scheme not in ('http', 'https'):
            continue
        if parsed.netloc != page_origin:
            continue
        if not should_keep_asset(node, parsed):
            continue
        try:
            rel = mirrored.get(asset_url)
            if rel is None:
                res = fetch_asset(asset_url)
                try:
                    content_type = res.headers.get('content-type', '').lower()
                    rel = asset_rel_path(entry_key, asset_url, content_type)
                    chunks = []
                    size = 0
                    limit_hit = False
                    for chunk in res.iter_content(65536):
                        if not chunk:
                            continue
                        chunks.append(chunk)
                        size += len(chunk)
                        # Stop reading as soon as this asset would push the
                        # page over its byte budget.
                        if size + total_bytes > MAX_ASSET_BYTES:
                            limit_hit = True
                            break
                finally:
                    # Release the streamed connection even if reading fails.
                    res.close()
                if limit_hit or not size:
                    continue
                out = OUTPUT_DIR / rel
                out.parent.mkdir(parents=True, exist_ok=True)
                out.write_bytes(b''.join(chunks))
                mirrored[asset_url] = rel
                total_bytes += size
                count += 1
            node[attr] = '/rss-offline/' + rel
        except Exception as e:
            # Keep the original reference and move on to the next asset.
            print('ASSET_ERR', asset_url, e)
            continue


def render_page(url: str) -> str:
    """Render *url* in headless Chromium and return the dumped DOM as text.

    Raises ``subprocess.CalledProcessError`` on a non-zero exit,
    ``subprocess.TimeoutExpired`` when the wall-clock limit passes, and
    ``RuntimeError`` when Chromium prints nothing.
    """
    flags = [
        '--headless=new',
        '--disable-gpu',
        '--no-sandbox',
        '--run-all-compositor-stages-before-draw',
        f'--virtual-time-budget={RENDER_BUDGET_MS}',
        '--dump-dom',
    ]
    # Allow the render budget plus ten seconds, but never less than TIMEOUT.
    wall_clock_limit = max(TIMEOUT, (RENDER_BUDGET_MS // 1000) + 10)
    completed = subprocess.run(
        [CHROMIUM_BIN, *flags, url],
        capture_output=True,
        text=True,
        timeout=wall_clock_limit,
        check=True,
    )
    dom = completed.stdout.strip()
    if dom:
        return dom
    raise RuntimeError('empty rendered DOM')


def capture_page(entry, rendered_allowed: bool):
    """Fetch the article HTML, trying the preferred capture mode first.

    The preferred mode comes from ``entry['capture_mode']`` or the default.
    When rendering is not allowed, a ``rendered`` preference is downgraded to
    ``simple`` and ``rendered`` is not used as a fallback either.

    Returns:
        ``(html, mode)`` where *mode* is the mode that succeeded.

    Raises:
        RuntimeError: when every attempted mode failed; the message joins
            the per-mode errors.
    """
    preferred = (entry.get('capture_mode') or DEFAULT_CAPTURE_MODE).lower()
    if preferred == 'rendered' and not rendered_allowed:
        preferred = 'simple'

    attempt_order = [preferred]
    for fallback in ('rendered', 'simple'):
        if fallback == preferred:
            continue
        if fallback == 'rendered' and not rendered_allowed:
            continue
        attempt_order.append(fallback)

    failures = []
    for mode in attempt_order:
        try:
            if mode == 'rendered':
                return render_page(entry['link']), 'rendered'
            if mode == 'simple':
                return fetch(entry['link']).text, 'simple'
        except Exception as exc:
            failures.append(f'{mode}:{exc}')
    raise RuntimeError(' ; '.join(failures))


def make_local_copy(entry, page_html: str, entry_key: str) -> str:
    """Turn captured page HTML into the locally served copy of an article.

    Ensures the document has a <head> with charset and viewport metas,
    replaces any <base> with one pointing at the original article URL,
    injects the banner stylesheet, mirrors assets, and prepends a banner
    linking back to the index and to the original article.

    Args:
        entry: Entry dict; reads 'link', 'title', 'captured_mode', 'feed'
            and 'date_label'.
        page_html: HTML returned by the capture step.
        entry_key: Key passed to mirror_assets() for the asset directory.

    Returns:
        The complete HTML document as a string, prefixed with a doctype.
    """
    soup = BeautifulSoup(page_html, 'html.parser')

    # Make sure there is a <head>; for fragments without <html>, wrap all
    # existing top-level nodes in a new <html><head/><body>…</body></html>.
    if soup.head is None:
        head = soup.new_tag('head')
        if soup.html is None:
            html = soup.new_tag('html')
            html.append(head)
            body = soup.new_tag('body')
            for child in list(soup.contents):
                body.append(child.extract())
            html.append(body)
            soup.append(html)
        else:
            soup.html.insert(0, head)

    if soup.head.find('meta', attrs={'charset': True}) is None:
        soup.head.insert(0, soup.new_tag('meta', charset='utf-8'))
    if soup.head.find('meta', attrs={'name': 'viewport'}) is None:
        soup.head.append(soup.new_tag('meta', attrs={'name': 'viewport', 'content': 'width=device-width, initial-scale=1'}))

    # Drop the page's own <base> elements and install one for the original
    # article URL as the first child of <head>.
    # NOTE(review): with this <base> in place, root-relative URLs such as
    # '/rss-offline/...' (banner link below, assets rewritten by
    # mirror_assets) presumably resolve against the original site's origin
    # rather than this server — confirm against a deployed page.
    for base in soup.head.find_all('base'):
        base.decompose()
    soup.head.insert(0, soup.new_tag('base', href=entry['link']))

    # Styles for the fixed banner; the body margin keeps content below it.
    # make_pdf_printable_copy() later string-replaces parts of this CSS, so
    # the exact text matters.
    banner_css = (
        'body{margin-top:72px!important}'
        '.rss-offline-banner{position:fixed;top:0;left:0;right:0;z-index:2147483647;'
        'background:#07111f;color:#fff;padding:12px 16px;font:14px/1.4 system-ui,sans-serif;'
        'box-shadow:0 1px 0 rgba(255,255,255,.08)}'
        '.rss-offline-banner a{color:#8dc3ff;text-decoration:none}'
        '.rss-offline-banner strong{color:#fff}'
        '.rss-offline-banner .muted{opacity:.78}'
    )
    style = soup.new_tag('style')
    style.string = banner_css
    soup.head.append(style)

    # Runs before the banner is inserted, so only the page's own <link>/<img>
    # references are considered for mirroring.
    mirror_assets(entry, soup, entry_key)

    # All entry-derived text is HTML-escaped before being parsed as markup.
    banner = BeautifulSoup(
        f'<div class="rss-offline-banner"><a href="/rss-offline/">← rss-offline</a> · '
        f'<a href="{escape(entry["link"], quote=True)}">abrir original</a> · '
        f'<strong>{escape(entry["title"])} [{escape(entry["captured_mode"])}]</strong> · '
        f'<span class="muted">{escape(entry["feed"])} · {escape(entry["date_label"])} </span></div>',
        'html.parser'
    )
    # Fall back to the document root when the page has no <body>.
    body = soup.body or soup
    body.insert(0, banner)

    return '<!doctype html>\n' + str(soup)


def load_sources():
    """Load the feed definitions from ``SOURCES_PATH``.

    Each source gets a normalised, lower-cased ``captureMode``; sources that
    do not set one receive ``DEFAULT_CAPTURE_MODE``.

    Returns:
        The decoded list of source dicts.

    Raises:
        OSError: the file cannot be read.
        ValueError: the file is not valid UTF-8 JSON.
    """
    # Decode explicitly as UTF-8 so feed names with non-ASCII characters do
    # not depend on the locale of the process running the build.
    sources = json.loads(SOURCES_PATH.read_text(encoding='utf-8'))
    for source in sources:
        source['captureMode'] = (source.get('captureMode') or DEFAULT_CAPTURE_MODE).lower()
    return sources


def isolate_anchor_section(soup: BeautifulSoup, entry: dict):
    """Reduce *soup* to the section targeted by the entry link's #fragment.

    When the link has a fragment and an element with that ``id`` (or
    ``name``) exists, climb up to six ancestors, stopping at the first whose
    text length is between 200 and 20000 characters (exclusive), and return a
    fresh document containing only that element. Otherwise *soup* is returned
    unchanged.
    """
    fragment = (urlparse(entry['link']).fragment or '').strip()
    if not fragment:
        return soup

    target = soup.find(id=fragment)
    if target is None:
        target = soup.find(attrs={'name': fragment})
    if target is None:
        return soup

    container = target
    for _ in range(6):
        ancestor = container.parent
        if ancestor is None or not getattr(ancestor, 'name', None):
            break
        container = ancestor
        if 200 < len(container.get_text(' ', strip=True)) < 20000:
            break

    wrapper = BeautifulSoup('<!doctype html><html><head></head><body></body></html>', 'html.parser')
    # Re-parse a serialised copy so the original tree is left intact.
    wrapper.body.append(BeautifulSoup(str(container), 'html.parser'))
    return wrapper


def pick_print_root(soup: BeautifulSoup, entry: dict):
    """Pick the main content element for known sites.

    The article host is matched by suffix against a table of per-site CSS
    selectors. Selectors are tried in order; for the first one whose
    longest match has at least 300 characters of text, a fresh document
    containing only that element is returned. Unknown hosts, or pages where
    nothing qualifies, return *soup* unchanged.
    """
    host = urlparse(entry['link']).netloc.lower()
    # Ordered (host suffix, selectors) pairs; the first matching suffix wins.
    site_selectors = (
        ('github.blog', [
            'section.post__content',
            '[class*="post__content"]',
            'main .post__content',
        ]),
        ('digitalocean.com', [
            '[class*="MarkdownStyles__StyledMarkdown"]',
            '[class*="blog-slug__StyledPost"]',
            '[class*="Sidebar__StyledSidebarContent"]',
        ]),
        ('substack.com', [
            '.available-content',
            '.body.markup',
            'article.newsletter-post',
        ]),
        ('tailscale.com', [
            '.changelog-entry',
            '.Markdown--changelog',
        ]),
        ('gomakethings.com', [
            'article',
            'main',
        ]),
        ('leadershipintech.com', [
            '.campaign',
        ]),
        ('martinalderson.com', [
            '.post-content',
            'article.post',
        ]),
        ('openclaw.ai', [
            '.article-content',
            'article.article',
        ]),
        ('world.hey.com', [
            '.trix-content',
            'article',
        ]),
        ('mariozechner.at', [
            'article',
        ]),
    )
    selectors = next(
        (choices for suffix, choices in site_selectors if host.endswith(suffix)),
        [],
    )

    for selector in selectors:
        matches = soup.select(selector)
        if not matches:
            continue
        scored = [(len(node.get_text(' ', strip=True)), node) for node in matches]
        best_len, best = max(scored, key=lambda pair: pair[0])
        if best_len < 300:
            continue
        wrapper = BeautifulSoup('<!doctype html><html><head></head><body></body></html>', 'html.parser')
        wrapper.body.append(BeautifulSoup(str(best), 'html.parser'))
        return wrapper
    return soup


def clean_pdf_soup(soup: BeautifulSoup):
    """Strip interactive and decorative markup from *soup* for printing.

    Removes scripts, media, forms, navigation and overlay-like elements,
    drops fixed/sticky elements that look like overlays, removes SVGs inside
    links/buttons, unwraps in-page anchors, and finally deletes every
    attribute that is not on a small per-tag whitelist. Modifies *soup* in
    place and returns it.
    """
    removable = (
        'script', 'noscript', 'iframe', 'video', 'audio', 'form', 'button',
        'nav', 'footer', 'aside',
        '[aria-modal="true"]', '[role="dialog"]',
        '.modal', '.overlay', '.backdrop', '.drawer', '.popup',
        '.hash-anchor',
        '[class*="overlay"]', '[class*="backdrop"]',
        '[style*="backdrop-filter"]',
    )
    for css in removable:
        for hit in soup.select(css):
            hit.decompose()

    # Attributes that survive, keyed by tag name; every other tag keeps none.
    keep_by_tag = {
        'a': {'href'},
        'img': {'src', 'alt'},
        'source': {'src', 'srcset', 'type'},
        'td': {'colspan', 'rowspan'},
        'th': {'colspan', 'rowspan'},
        'ol': {'start'},
    }

    for tag in soup.find_all(True):
        attrs = tag.attrs or {}
        inline = (attrs.get('style') or '').lower().replace(' ', '')
        class_text = ' '.join(attrs.get('class', [])).lower()

        pinned = 'position:fixed' in inline or 'position:sticky' in inline
        if pinned:
            overlay_like = any(word in class_text for word in ('overlay', 'backdrop', 'modal'))
            covers_viewport = 'inset:0' in inline or all(
                edge in inline for edge in ('top:0', 'left:0', 'right:0', 'bottom:0')
            )
            if overlay_like or covers_viewport:
                tag.decompose()
                continue

        if tag.name == 'svg' and tag.parent and getattr(tag.parent, 'name', None) in {'a', 'button'}:
            tag.decompose()
            continue

        if tag.name == 'a' and (attrs.get('href') or '').startswith('#'):
            # In-page anchors: keep visible text, drop empty ones entirely.
            if tag.get_text(' ', strip=True):
                tag.unwrap()
            else:
                tag.decompose()
            continue

        permitted = keep_by_tag.get(tag.name, set())
        for attr in list(attrs):
            if attr not in permitted:
                del tag.attrs[attr]
    return soup


def make_pdf_printable_copy(entry: dict):
    """Write a print-optimised HTML copy of an archived article.

    Reads the local article written by the build, narrows it to the relevant
    content (anchor section, site-specific root), strips non-printable
    markup, prepends a small header and injects the print stylesheet.

    Args:
        entry: Entry dict; reads 'link', 'feed', 'title' and 'date_label'.

    Returns:
        Path of the ``.print.html`` file inside the PDF directory.

    Raises:
        OSError: the archived article cannot be read or the copy written.
        ValueError: the printable HTML exceeds ``PDF_MAX_SOURCE_BYTES``.
    """
    key = slug(entry['link'])
    src_path = OUTPUT_DIR / f'article-{key}.html'
    pdf_dir = OUTPUT_DIR / PDF_DIRNAME
    pdf_dir.mkdir(parents=True, exist_ok=True)
    printable_path = pdf_dir / f'article-{key}.print.html'
    # Archived pages declare charset utf-8, so decode them as such instead of
    # relying on the process locale.
    soup = BeautifulSoup(src_path.read_text(encoding='utf-8'), 'html.parser')
    soup = isolate_anchor_section(soup, entry)
    soup = pick_print_root(soup, entry)
    soup = clean_pdf_soup(soup)

    body = soup.body or soup
    pdf_header = BeautifulSoup(
        f'<section class="rss-offline-pdf-header">'
        f'<p class="rss-offline-pdf-feed">{escape(entry["feed"])}</p>'
        f'<h1>{escape(entry["title"])}</h1>'
        f'<p class="rss-offline-pdf-meta">{escape(entry["date_label"])} · '
        f'<a href="{escape(entry["link"], quote=True)}">abrir original</a></p>'
        f'</section>',
        'html.parser'
    )
    body.insert(0, pdf_header)

    # Print stylesheet, one rule per literal; adjacent literals concatenate
    # into a single string with no separators.
    print_css = (
        '@page{size:140mm 216mm;margin:12mm 11mm}'
        'html{font-size:24px}'
        'html,body{background:#fff!important;color:#111827!important}'
        'body{margin:0!important;font:400 1rem/1.78 ui-serif,Georgia,Cambria,"Times New Roman",serif;-webkit-print-color-adjust:exact;print-color-adjust:exact}'
        'body::before,body::after,*::before,*::after{backdrop-filter:none!important;filter:none!important}'
        'main,article,section,div{max-width:none!important}'
        'p,li,blockquote,figcaption,td,th{font-size:1rem;line-height:1.78}'
        'p,ul,ol,blockquote,pre,table,figure{margin-top:0;margin-bottom:1.05em}'
        'h1,h2,h3,h4{font-family:ui-sans-serif,system-ui,sans-serif;line-height:1.16;color:#111827;break-after:avoid-page;letter-spacing:-0.02em}'
        'h1{font-size:2.05rem}'
        'h2{font-size:1.6rem;margin-top:2rem;margin-bottom:.6rem}'
        'h3{font-size:1.3rem;margin-top:1.55rem;margin-bottom:.5rem}'
        'a{color:#0b57d0!important;text-decoration:none!important}'
        'img,svg,video,canvas{display:block;max-width:82%!important;height:auto!important;max-height:78mm!important;margin:1rem auto 1.2rem!important;object-fit:contain!important}'
        'figure{margin:1rem auto 1.2rem!important;max-width:82%!important}'
        'pre,code{font-family:ui-monospace,SFMono-Regular,Menlo,monospace}'
        'pre{white-space:pre-wrap;overflow-wrap:anywhere;background:#f8fafc;border:1px solid #e5e7eb;border-radius:8px;padding:12px;font-size:.8rem;line-height:1.5}'
        'table{width:100%!important;border-collapse:collapse;font-size:.82rem}'
        'th,td{border:1px solid #e5e7eb;padding:6px 8px;vertical-align:top}'
        'ul,ol{padding-left:1.25rem}'
        'li+li{margin-top:.35em}'
        'header,nav,footer,aside,[role="dialog"],[style*="position:fixed"],[style*="position: sticky"]{display:none!important}'
        '.rss-offline-pdf-header{margin:0 0 22px;padding:0 0 14px;border-bottom:1px solid rgba(0,0,0,.12)}'
        '.rss-offline-pdf-feed,.rss-offline-pdf-meta{margin:0;font:12px/1.45 ui-sans-serif,system-ui,sans-serif;color:#4b5563}'
        '.rss-offline-pdf-feed{font-weight:700;letter-spacing:.05em;text-transform:uppercase;color:#111827}'
        '.rss-offline-pdf-header h1{margin:4px 0 7px;font:700 26px/1.08 ui-sans-serif,system-ui,sans-serif;color:#111827}'
    )

    html = str(soup)
    # Chromium prints this file from disk, so site-rooted URLs must become
    # file:// URLs under the output directory.
    html = html.replace('/rss-offline/', OUTPUT_DIR.as_uri() + '/')
    # Neutralise the on-screen banner styling injected into the local copy.
    html = html.replace('position:fixed;top:0;left:0;right:0;', 'position:static;')
    html = html.replace('body{margin-top:72px!important}', 'body{margin-top:0!important}')
    html = html.replace('</head>', '<style>' + print_css + '</style></head>')
    if len(html.encode()) > PDF_MAX_SOURCE_BYTES:
        raise ValueError('print source too large')
    printable_path.write_text(html, encoding='utf-8')
    return printable_path


def print_pdf(input_html: Path, output_pdf: Path):
    """Print *input_html* to *output_pdf* with headless Chromium.

    Raises ``subprocess.CalledProcessError`` on a non-zero exit and
    ``subprocess.TimeoutExpired`` after ``PDF_PAGE_TIMEOUT``.
    """
    options = [
        '--headless=new',
        '--disable-gpu',
        '--no-sandbox',
        '--print-to-pdf=' + str(output_pdf),
        '--no-pdf-header-footer',
        '--allow-file-access-from-files',
        '--disable-features=HttpsUpgrades,BlockInsecurePrivateNetworkRequests',
    ]
    subprocess.run(
        [CHROMIUM_BIN, *options, input_html.as_uri()],
        capture_output=True,
        text=True,
        timeout=PDF_PAGE_TIMEOUT,
        check=True,
    )


def load_state():
    """Load the persistent build state.

    When no state file exists yet, the seen-links map is seeded from a
    previously written ``entries.json`` (if any) so already archived articles
    are not treated as new. Unreadable or malformed files never raise; they
    yield an empty map instead.

    Returns:
        ``{'seen_links': {link: timestamp}}``.
    """
    if not STATE_PATH.exists():
        seeded = {}
        json_index = OUTPUT_DIR / 'entries.json'
        if json_index.exists():
            try:
                for entry in json.loads(json_index.read_text()):
                    link = entry.get('link')
                    ts = entry.get('timestamp') or int(time.time())
                    if isinstance(link, str) and isinstance(ts, int):
                        seeded[link] = ts
            except (OSError, ValueError, TypeError, AttributeError):
                # Unreadable, non-JSON, or not a list of objects: seed nothing.
                seeded = {}
        return {'seen_links': seeded}
    try:
        raw = json.loads(STATE_PATH.read_text())
    except (OSError, ValueError):
        return {'seen_links': {}}
    # Valid JSON that is not an object (e.g. [] or null) has no .get().
    if not isinstance(raw, dict):
        return {'seen_links': {}}
    seen_links = raw.get('seen_links') or {}
    if not isinstance(seen_links, dict):
        seen_links = {}
    return {'seen_links': seen_links}



def save_state(state: dict):
    """Persist the seen-links map, dropping stale or malformed records.

    Only string links with integer timestamps no older than
    ``STATE_KEEP_DAYS`` are written to ``STATE_PATH``.
    """
    oldest_kept = int((datetime.now(timezone.utc) - timedelta(days=STATE_KEEP_DAYS)).timestamp())
    retained = {}
    for link, ts in (state.get('seen_links') or {}).items():
        if not isinstance(link, str) or not isinstance(ts, int):
            continue
        if ts < oldest_kept:
            continue
        retained[link] = ts
    payload = json.dumps({'seen_links': retained}, ensure_ascii=False, indent=2)
    STATE_PATH.write_text(payload)



def build_pdf_pack(entries, build_stamp: str):
    """Print the first ``PDF_MAX_ARTICLES`` entries and merge them into one PDF.

    Each article is printed separately; failures are reported with
    ``PDF_ERR`` and skipped. The successful PDFs are merged with
    ``pdfunite`` into ``<PDF_BASENAME>-<build_stamp>.pdf``.

    Returns:
        A dict with the merged PDF's URL path, the article count and the
        per-article list, or ``None`` when there was nothing to print.
    """
    batch = entries[:PDF_MAX_ARTICLES]
    if not batch:
        return None

    out_dir = OUTPUT_DIR / PDF_DIRNAME
    out_dir.mkdir(parents=True, exist_ok=True)

    pdf_files = []
    manifest = []
    for position, entry in enumerate(batch, start=1):
        try:
            printable = make_pdf_printable_copy(entry)
            pdf_name = f'article-{position:03d}-{slug(entry["link"])}.pdf'
            target = out_dir / pdf_name
            print_pdf(printable, target)
            pdf_files.append(str(target))
            manifest.append({
                'title': entry['title'],
                'path': f'/rss-offline/{PDF_DIRNAME}/{pdf_name}',
            })
            print('PDF_OK', entry['link'])
        except Exception as e:
            print('PDF_ERR', entry['link'], e)

    if pdf_files:
        merged_filename = f'{PDF_BASENAME}-{build_stamp}.pdf'
        merged_path = OUTPUT_DIR / merged_filename
        subprocess.run(['pdfunite', *pdf_files, str(merged_path)], check=True)
        return {
            'path': f'/rss-offline/{merged_filename}',
            'count': len(manifest),
            'articles': manifest,
        }
    return None


def list_pdf_archive():
    """List the merged PDFs in ``OUTPUT_DIR``, sorted by name descending.

    Only files named ``<PDF_BASENAME>-YYYY-MM-DD[_HH-MM].pdf`` are included.
    Each item carries the file name, its URL path, its size in MB (one
    decimal) and its modification time formatted in local time.
    """
    name_re = re.compile(rf'^{re.escape(PDF_BASENAME)}-\d{{4}}-\d{{2}}-\d{{2}}(?:_\d{{2}}-\d{{2}})?\.pdf$')
    candidates = sorted(OUTPUT_DIR.glob(f'{PDF_BASENAME}-*.pdf'), reverse=True)

    archive = []
    for pdf in candidates:
        if name_re.match(pdf.name) is None:
            continue
        info = pdf.stat()
        modified = datetime.fromtimestamp(info.st_mtime, tz=timezone.utc).astimezone()
        archive.append({
            'name': pdf.name,
            'path': f'/rss-offline/{pdf.name}',
            'size_mb': round(info.st_size / (1024 * 1024), 1),
            'date_label': modified.strftime('%Y-%m-%d %H:%M'),
        })
    return archive


def build():
    """Run one full build of the offline reader.

    Steps: read the feeds, keep recent entries, archive each article as a
    local HTML page, build a merged PDF from the articles not seen in earlier
    builds, then write ``entries.json``, the persistent state, the index
    page, the offline fallback page and the service worker.

    Feed and article failures are reported on stdout (``FEED_ERR``,
    ``ARTICLE_ERR``) and skipped. Errors from merging the PDF or writing the
    output files propagate.
    """
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    build_stamp = datetime.now().astimezone().strftime('%Y-%m-%d')
    state = load_state()
    # Links recorded by earlier builds; used only to decide what is "new".
    known_links = set(state.get('seen_links') or {})
    sources = load_sources()
    cutoff = int((datetime.now(timezone.utc) - timedelta(days=DAYS)).timestamp())
    entries = []

    for source in sources:
        try:
            response = fetch(source['feedUrl'])
            xml_text = response.text
            root = ET.fromstring(xml_text)
        except Exception as e:
            print('FEED_ERR', source['name'], e)
            continue

        items = []
        if root.tag.endswith('rss'):
            channel = root.find('channel')
            items = channel.findall('item') if channel is not None else []
        else:
            items = root.findall('{http://www.w3.org/2005/Atom}entry')

        for item in items:
            title = text(item, ['title', '{http://www.w3.org/2005/Atom}title']) or source['name']
            link = entry_links(item)
            if not link:
                continue
            link = urljoin(source['siteUrl'], link)
            ts = parse_timestamp(text(item, ['pubDate', '{http://www.w3.org/2005/Atom}updated', '{http://www.w3.org/2005/Atom}published']))
            # Entries without a parseable date (ts == 0) are kept.
            if ts and ts < cutoff:
                continue
            entries.append({
                'feed': source['name'],
                'feed_url': source['feedUrl'],
                'title': title,
                'link': link,
                'timestamp': ts,
                'capture_mode': source['captureMode'],
            })

    entries.sort(key=lambda item: item['timestamp'], reverse=True)

    # Drop duplicate links (keeping the newest) before applying the cap, so
    # duplicates do not use up MAX_ENTRIES slots. This set is deliberately
    # separate from known_links: sharing one set would make every entry look
    # already seen and leave the PDF pack permanently empty.
    deduped = []
    unique_links = set()
    for entry in entries:
        key = entry['link']
        if key in unique_links:
            continue
        unique_links.add(key)
        deduped.append(entry)
    entries = deduped[:MAX_ENTRIES]

    archived = []
    new_entries = []
    rendered_used = 0
    for entry in entries:
        try:
            wants_rendered = (entry.get('capture_mode') or DEFAULT_CAPTURE_MODE) == 'rendered'
            page_html, used_mode = capture_page(entry, rendered_allowed=(rendered_used < MAX_RENDERED_ENTRIES or not wants_rendered))
            key = slug(entry['link'])
            article_file = OUTPUT_DIR / f'article-{key}.html'
            entry['local_path'] = f'/rss-offline/article-{key}.html'
            entry['date_label'] = datetime.fromtimestamp(entry['timestamp'] or time.time(), tz=timezone.utc).astimezone().strftime('%Y-%m-%d %H:%M')
            entry['captured_mode'] = used_mode
            # The page declares charset utf-8, so write it as UTF-8.
            article_file.write_text(make_local_copy(entry, page_html, key), encoding='utf-8')
            if used_mode == 'rendered':
                rendered_used += 1
            archived.append(entry)
            if entry['link'] not in known_links:
                new_entries.append(entry)
            print('OK', used_mode, entry['link'])
        except Exception as e:
            print('ARTICLE_ERR', entry['link'], e)

    pdf_info = build_pdf_pack(new_entries, build_stamp)
    if pdf_info is None:
        # Nothing new today: remove a merged PDF left by an earlier run with
        # the same stamp.
        stale_merged = OUTPUT_DIR / f'{PDF_BASENAME}-{build_stamp}.pdf'
        if stale_merged.exists():
            stale_merged.unlink()

    index = OUTPUT_DIR / 'index.html'
    sw = OUTPUT_DIR / 'sw.js'
    offline = OUTPUT_DIR / 'offline.html'
    json_index = OUTPUT_DIR / 'entries.json'

    json_index.write_text(json.dumps(archived, ensure_ascii=False, indent=2), encoding='utf-8')

    for entry in archived:
        state['seen_links'][entry['link']] = entry['timestamp'] or int(time.time())
    save_state(state)

    pdf_archive = list_pdf_archive()
    pdf_cards = []
    for item in pdf_archive:
        pdf_cards.append(f'''<article class="item"><h2><a class="pdf-link" href="{escape(item['path'])}">{escape(item['name'])}</a> <a class="pdf-download" href="{escape(item['path'])}" download>descargar</a></h2><p class="meta">{escape(item['date_label'])} · {item['size_mb']} MB</p></article>''')

    local_paths = [entry['local_path'] for entry in archived]
    index.write_text(f'''<!doctype html><html lang="es"><head><meta charset="utf-8"><meta name="viewport" content="width=device-width, initial-scale=1"><meta name="theme-color" content="#07111f"><title>rss-offline</title><style>*{{box-sizing:border-box}}body{{margin:0;font-family:system-ui,sans-serif;background:#07111f;color:#fff}}main{{max-width:960px;margin:0 auto;padding:24px}}h1,h2{{margin:0 0 8px}}.muted{{color:rgba(255,255,255,.68)}}.item{{padding:16px 0;border-bottom:1px solid rgba(255,255,255,.08)}}.item h2{{margin:0 0 6px;font-size:1.1rem;display:flex;gap:10px;align-items:center;flex-wrap:wrap}}a{{color:#8dc3ff;text-decoration:none}}a:visited{{color:#c69cff}}a:hover{{text-decoration:underline}}.pdf-link:visited{{color:#d9b8ff}}.pdf-download{{font-size:.78rem;padding:4px 8px;border:1px solid rgba(141,195,255,.35);border-radius:999px}}.meta{{font-size:.92rem;color:rgba(255,255,255,.72)}} .top{{display:flex;justify-content:space-between;gap:12px;align-items:end;flex-wrap:wrap;margin-bottom:18px}}section{{margin-top:28px}}</style></head><body><main><div class="top"><div><h1>rss-offline</h1><p class="muted">PDF diario con artículos nuevos.</p></div><p class="muted">{len(new_entries)} nuevos en PDF · {len(archived)} artículos cacheados · últimos {DAYS} días</p></div><section><h2>Archivo de PDFs</h2>{''.join(pdf_cards) or '<p class="muted">Todavía no hay PDFs.</p>'}</section><script>const RSS_OFFLINE_LOCAL_PATHS={json.dumps(local_paths, ensure_ascii=False)};const RSS_OFFLINE_SCROLL_KEY='rss-offline:index-scroll';const restoreScroll=()=>{{try{{const y=Number(sessionStorage.getItem(RSS_OFFLINE_SCROLL_KEY)||'0');if(y>0)requestAnimationFrame(()=>window.scrollTo(0,y));}}catch(_e){{}}}};const saveScroll=()=>{{try{{sessionStorage.setItem(RSS_OFFLINE_SCROLL_KEY,String(window.scrollY||window.pageYOffset||0));}}catch(_e){{}}}};if('scrollRestoration' in history)history.scrollRestoration='manual';if('serviceWorker' in 
navigator){{window.addEventListener('load',()=>navigator.serviceWorker.register('/rss-offline/sw.js').catch(()=>{{}}));}}window.addEventListener('pageshow',restoreScroll);window.addEventListener('load',()=>{{restoreScroll();document.querySelectorAll('.pdf-link,.pdf-download').forEach((a)=>a.addEventListener('click',saveScroll));const warm=async()=>{{for(const path of RSS_OFFLINE_LOCAL_PATHS){{try{{await fetch(path,{{cache:'force-cache'}});}}catch(_e){{}}}}}};setTimeout(warm,1200);}});window.addEventListener('beforeunload',saveScroll);</script></main></body></html>''', encoding='utf-8')

    offline.write_text('<!doctype html><html lang="es"><head><meta charset="utf-8"><meta name="viewport" content="width=device-width, initial-scale=1"><title>rss-offline sin conexión</title><style>body{font-family:system-ui,sans-serif;background:#07111f;color:#fff;display:grid;place-items:center;min-height:100vh;padding:24px;margin:0}main{max-width:520px;padding:24px;border:1px solid rgba(255,255,255,.12);border-radius:12px;background:rgba(255,255,255,.05)}a{color:#8dc3ff}</style></head><body><main><h1>Sin conexión</h1><p>La lista y copias locales visitadas deberían seguir disponibles.</p><p><a href="/rss-offline/">Volver a rss-offline</a></p></main></body></html>', encoding='utf-8')

    precache_paths = ['/rss-offline/', '/rss-offline/offline.html', '/rss-offline/entries.json'] + ([pdf_info['path']] if pdf_info else []) + [entry['local_path'] for entry in archived]
    # A fresh version string per build makes the service worker replace its
    # caches on activation.
    sw_version = f"rss-offline-v{int(time.time())}"
    sw.write_text(f"""const V='{sw_version}';const CORE=V+'-core';const PAGES=V+'-pages';const PRECACHE={json.dumps(precache_paths, ensure_ascii=False)};self.addEventListener('install',e=>{{e.waitUntil(caches.open(CORE).then(c=>c.addAll(PRECACHE)));self.skipWaiting();}});self.addEventListener('activate',e=>{{e.waitUntil(caches.keys().then(keys=>Promise.all(keys.filter(k=>k!==CORE&&k!==PAGES).map(k=>caches.delete(k)))).then(()=>self.clients.claim()));}});self.addEventListener('fetch',e=>{{const r=e.request;const u=new URL(r.url);if(r.method!=='GET'||u.origin!==self.location.origin||!u.pathname.startsWith('/rss-offline/'))return;e.respondWith((async()=>{{const core=await caches.open(CORE);const pages=await caches.open(PAGES);const cached=await core.match(r)||await pages.match(r);try{{const res=await fetch(r);if(res.ok) await pages.put(r,res.clone());return res;}}catch{{return cached||await core.match('/rss-offline/offline.html')||Response.error();}}}})());}});""", encoding='utf-8')


# Run a full build when executed as a script; importing the module has no
# side effects beyond defining constants and functions.
if __name__ == '__main__':
    build()
