#!/usr/bin/env python3
import hashlib
import html
import json
import os
import sqlite3
from datetime import datetime, timezone, timedelta
from pathlib import Path

import requests
from bs4 import BeautifulSoup

# SQLite database file that main() opens to read the `entry` and `feed` tables.
DB = Path('/home/sebas/work/projects/bb-004/freshrss-trial/data/users/sebas/db.sqlite')
# Output directory; main() creates it if missing and writes the per-entry
# .html/.txt files and index.json into it.
OUT = Path('/home/sebas/runtime/freshrss-recent-archive')
# Look-back window in days (env FRESHRSS_ARCHIVE_DAYS, default 7). A
# non-integer value raises ValueError at import time.
DAYS = int(os.environ.get('FRESHRSS_ARCHIVE_DAYS', '7'))
# Maximum number of entries selected per run (env FRESHRSS_ARCHIVE_LIMIT,
# default 150). A non-integer value raises ValueError at import time.
LIMIT = int(os.environ.get('FRESHRSS_ARCHIVE_LIMIT', '150'))
# Timeout passed to every HTTP GET (requests interprets it in seconds).
TIMEOUT = 20
# Request headers sent with every HTTP GET.
HEADERS = {'user-agent': 'Mozilla/5.0 (FreshRSS recent archive)'}


def slug(url: str) -> str:
    """Return a short, filesystem-safe identifier for *url*.

    The identifier is the first 16 hexadecimal characters of the SHA-256
    digest of the UTF-8 encoded URL, so the same URL always yields the
    same identifier.
    """
    hasher = hashlib.sha256()
    hasher.update(url.encode('utf-8'))
    full_hex = hasher.hexdigest()
    return full_hex[:16]


def extract_text(page_html: str) -> str:
    """Return the readable text of an HTML document.

    <script>, <style> and <noscript> elements are removed first. The text
    of <body> (or of the whole document when there is no <body>) is then
    collected one node per line, with blank lines dropped. When the
    document has a non-empty <title> that does not already occur within
    the first 200 characters of that text, the title is placed in front,
    separated by a blank line.
    """
    soup = BeautifulSoup(page_html, 'html.parser')

    # Remove elements whose content is never meant to be read as prose.
    for unwanted in soup.find_all(['script', 'style', 'noscript']):
        unwanted.decompose()

    title_tag = soup.title
    title = title_tag.get_text(' ', strip=True) if title_tag else ''

    root = soup.body or soup
    raw_text = root.get_text('\n', strip=True)
    text = '\n'.join(
        stripped for stripped in map(str.strip, raw_text.splitlines()) if stripped
    )

    # Nothing to prepend: no title, or the page text already opens with it.
    if not title or title in text[:200]:
        return text
    return (title + '\n\n' + text).strip()


def _render_page(title, link, feed_name, text) -> str:
    """Return the standalone HTML page wrapping one archived entry.

    *title* and *feed_name* may be None or empty; the link is used as the
    heading when there is no title. Every interpolated value is
    HTML-escaped.
    """
    heading = html.escape(title or link)
    return f'''<!doctype html><html lang="es"><head><meta charset="utf-8"><meta name="viewport" content="width=device-width, initial-scale=1"><title>{heading}</title><style>body{{font-family:system-ui,sans-serif;max-width:900px;margin:0 auto;padding:24px;line-height:1.6}}a{{color:#0b63ce}}pre{{white-space:pre-wrap}}</style></head><body><p><a href="{html.escape(link)}">Abrir original</a></p><h1>{heading}</h1><p>{html.escape(feed_name or '')}</p><pre>{html.escape(text)}</pre></body></html>'''


def main() -> int:
    """Archive the pages linked from recent entries in the database.

    Selects up to LIMIT entries whose `date` is within the last DAYS days
    and whose link starts with ``http``, newest first. Each link is
    downloaded and its extracted text is stored under OUT as
    ``<slug>.html`` (a minimal readable page) and ``<slug>.txt``. A
    manifest of every successfully archived entry is written to
    ``OUT/index.json``.

    A failure on one entry is reported on stdout as ``ERR <link> <error>``
    and does not stop the run. Entries that share a link share a slug, so
    a later one overwrites the earlier one's files.

    Returns:
        Always 0, so it can be used as the process exit status.
    """
    OUT.mkdir(parents=True, exist_ok=True)
    # The query compares e.date against a Unix timestamp in seconds.
    # NOTE(review): presumably the column stores epoch seconds; confirm.
    since = int((datetime.now(timezone.utc) - timedelta(days=DAYS)).timestamp())

    # Read everything up front and close the connection before the slow
    # network phase, so the database file is not held open meanwhile.
    con = sqlite3.connect(DB)
    try:
        rows = con.execute(
            '''
            SELECT e.id, e.title, e.link, e.date, f.name
            FROM entry e
            LEFT JOIN feed f ON f.id = e.id_feed
            WHERE e.date >= ? AND e.link LIKE 'http%'
            ORDER BY e.date DESC
            LIMIT ?
            ''',
            (since, LIMIT),
        ).fetchall()
    finally:
        con.close()

    index = []
    with requests.Session() as session:
        for entry_id, title, link, ts, feed_name in rows:
            # Deliberately broad best-effort boundary: any failure for one
            # entry (network, HTTP status, parsing, disk) is reported and
            # the loop moves on to the next entry.
            try:
                r = session.get(link, timeout=TIMEOUT, headers=HEADERS)
                r.raise_for_status()
                # Without a declared charset, requests decodes text/*
                # bodies as ISO-8859-1, which garbles UTF-8 pages; fall
                # back to the encoding detected from the body instead.
                if 'charset' not in r.headers.get('content-type', '').lower():
                    r.encoding = r.apparent_encoding
                text = extract_text(r.text)
                key = slug(link)
                html_path = OUT / f'{key}.html'
                txt_path = OUT / f'{key}.txt'
                # Write UTF-8 explicitly: the page declares charset=utf-8
                # and the locale's default encoding may differ.
                html_path.write_text(
                    _render_page(title, link, feed_name, text),
                    encoding='utf-8',
                )
                txt_path.write_text(text, encoding='utf-8')
                index.append({
                    'entry_id': entry_id,
                    'title': title,
                    'link': link,
                    'feed': feed_name,
                    'date': ts,
                    'local_html': str(html_path),
                    'local_txt': str(txt_path),
                    'local_url': f'https://ballbox-first.emperor-ratio.ts.net/files/home/runtime/freshrss-recent-archive/{html_path.name}',
                })
                print('OK', link)
            except Exception as e:
                print('ERR', link, e)

    # ensure_ascii=False emits raw non-ASCII characters, so the file must
    # be written as UTF-8 regardless of the locale.
    (OUT / 'index.json').write_text(
        json.dumps(index, ensure_ascii=False, indent=2),
        encoding='utf-8',
    )
    return 0


if __name__ == '__main__':
    raise SystemExit(main())
