from k import get_raw, get_urls, test
from video_store_jsonl import save_all_videos,load_videos, add_or_update_many, get_all_videos

import time
from datetime import datetime

import subprocess
import textwrap

import html as html_lib

import os
import glob
import tempfile
import yt_dlp
import re


def get_youtube_transcript_ytdlp(video_id, languages=("fr", "en", "fr-orig" ), retries=3):
    """
    Download the subtitles of a YouTube video with yt-dlp and return them as plain text.

    Args:
        video_id: YouTube video identifier, inserted into a ``watch?v=`` URL.
        languages: Subtitle language codes, in order of preference. They are
            passed to yt-dlp and also used to choose which downloaded file is read.
        retries: Maximum number of attempts when an exception is raised.

    Returns:
        The subtitle text cleaned by ``clean_vtt``, or ``""`` when no ``.vtt``
        file was produced or every attempt raised.
    """
    url = f"https://www.youtube.com/watch?v={video_id}"

    for attempt in range(retries):
        try:
            # The subtitles are written to a throw-away directory that is
            # removed as soon as the text has been read.
            with tempfile.TemporaryDirectory() as tmpdir:
                outtmpl = os.path.join(tmpdir, "%(id)s.%(ext)s")

                # NOTE(review): with "ignoreerrors" enabled yt-dlp presumably
                # reports most download errors without raising, so the retry
                # branch below may rarely trigger — confirm.
                ydl_opts = {
                    "quiet": True,
                    "skip_download": True,
                    "writesubtitles": True,
                    "writeautomaticsub": True,
                    "subtitleslangs": list(languages),
                    "subtitlesformat": "vtt",
                    "outtmpl": outtmpl,
                    "cookiesfrombrowser": ("firefox",),
                    "ignoreerrors": True,
                }

                with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                    ydl.download([url])

                files = glob.glob(os.path.join(tmpdir, "*.vtt"))

                if not files:
                    return ""

                # glob() returns files in arbitrary order, so honour the
                # caller's language preference explicitly. Assumes yt-dlp
                # names subtitle files "<id>.<lang>.vtt" — TODO confirm.
                chosen = None
                for lang in languages:
                    suffix = f".{lang}.vtt"
                    matching = sorted(p for p in files if p.endswith(suffix))
                    if matching:
                        chosen = matching[0]
                        break
                if chosen is None:
                    # No preferred language found: fall back deterministically.
                    chosen = sorted(files)[0]

                with open(chosen, "r", encoding="utf-8") as f:
                    return clean_vtt(f.read())

        except Exception:
            # Best-effort boundary: any failure leads to a retry.
            if attempt + 1 >= retries:
                # Last attempt: waiting any longer would be pointless.
                break
            # Exponential back-off with jitter before the next attempt.
            wait = 2 ** attempt + random.uniform(1, 3)
            print(f"⚠ Retry {attempt+1} pour {video_id} dans {wait:.1f}s")
            time.sleep(wait)

    print(f"❌ Échec transcript pour {video_id}")
    return ""


def clean_vtt(vtt_text):
    """
    Convert raw WebVTT subtitle text into a single line of plain text.

    The ``WEBVTT`` header, ``Kind:`` / ``Language:`` metadata lines, timing
    lines (containing ``-->``) and blank lines are dropped. Markup tags are
    removed, HTML entities are decoded, and a line whose cleaned text has
    already been seen is skipped.

    Args:
        vtt_text: Content of a ``.vtt`` file.

    Returns:
        The remaining lines, in order of first appearance, joined by spaces.
    """
    tag_re = re.compile(r"<[^>]+>")
    seen = set()  # O(1) membership test instead of scanning the list
    kept = []

    for raw_line in vtt_text.splitlines():
        line = raw_line.strip()

        if not line or "-->" in line:
            continue
        if line.startswith(("WEBVTT", "Kind:", "Language:")):
            continue

        # Strip tags before decoding entities, so that an escaped "&lt;b&gt;"
        # in the spoken text is not mistaken for markup.
        line = tag_re.sub("", line)
        # Map &nbsp; to a regular space; unescape() alone would yield U+00A0.
        line = line.replace("&nbsp;", " ")
        line = html_lib.unescape(line).strip()

        # A line made only of tags is empty now and must not add a blank.
        if not line or line in seen:
            continue

        seen.add(line)
        kept.append(line)

    return " ".join(kept)

def remove_ansi_sequences(text):
    """
    Return ``text`` with ANSI terminal escape sequences removed.
    """
    # ESC followed either by a single character in the ranges @-Z or \-_,
    # or by "[" with optional parameter/intermediate bytes and a final byte.
    pattern = r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])'
    return re.sub(pattern, '', text)

def summarize_video_ai(video, model="gpt-oss:20b-cloud"):
    """
    Ask a local ``ollama`` model for a short French summary of a video.

    Args:
        video: Dict read through the keys ``title``, ``channel``,
            ``transcript`` and ``description``; missing or ``None`` values
            are treated as empty strings.
        model: Model name passed to ``ollama run``.

    Returns:
        The model output with ANSI sequences and "Thinking..." blocks removed,
        or the fixed message "Résumé IA indisponible." when the command fails,
        times out, exits with a non-zero status or produces no text.
    """
    fallback = "Résumé IA indisponible."

    # `or ""` also covers keys that are present but set to None, which would
    # otherwise be rendered as the literal text "None" inside the prompt.
    title = video.get("title") or ""
    channel = video.get("channel") or ""
    transcript = video.get("transcript") or ""
    description = video.get("description") or ""

    # Only the first 12000 characters of the transcript are sent.
    source = transcript[:12000]

    prompt = f"""
Tu es un assistant de revue de presse.

Résume cette vidéo YouTube en français en 5 lignes maximum.
Ne rajoute aucune information absente de la source.

Chaîne : {channel}
Titre : {title}
Description :
{description}

Transcription :
{source}
"""

    try:
        result = subprocess.run(
            ["ollama", "run", "--nowordwrap", model],
            input=prompt,
            text=True,
            # Explicit UTF-8 so accented text does not depend on the locale;
            # undecodable bytes are replaced instead of raising.
            encoding="utf-8",
            errors="replace",
            capture_output=True,
            timeout=120,
            env={**os.environ, "TERM": "dumb"}
        )

        if result.returncode != 0:
            return fallback

        raw_output = remove_ansi_sequences(result.stdout.strip())
        cleaned_output = remove_thinking(raw_output)

        # An empty answer is as useless as a failed call.
        return cleaned_output or fallback

    except Exception as e:
        # Best-effort boundary: a failed summary must not stop the caller.
        print(f"⚠️ Erreur résumé IA : {e}")
        return fallback

def remove_thinking(text):
    """
    Remove blocks of the form:
    Thinking...
    ...
    ...done thinking.

    Falsy input (``None`` or ``""``) is returned unchanged; otherwise the
    result is also stripped of leading and trailing whitespace.
    """
    if text:
        # DOTALL lets the block span several lines; matching ignores case.
        thinking_block = re.compile(
            r"Thinking\.\.\..*?\.\.\.done thinking\.\s*",
            re.DOTALL | re.IGNORECASE,
        )
        return thinking_block.sub("", text).strip()

    return text


def parse_date(date_str):
    """
    Parse a ``YYYYMMDD`` string into a ``datetime``.

    Returns ``None`` when ``date_str`` is falsy or does not match the format.
    """
    parsed = None

    if date_str:
        try:
            parsed = datetime.strptime(date_str, "%Y%m%d")
        except ValueError:
            # Not a valid YYYYMMDD date: keep the None default.
            pass

    return parsed

def extract_video_id(url):
    """
    Extract the video identifier from several forms of YouTube URL:
    - https://www.youtube.com/watch?v=VIDEO_ID  (``v`` anywhere in the query)
    - https://youtu.be/VIDEO_ID
    - https://www.youtube.com/embed/VIDEO_ID
    - https://www.youtube.com/shorts/VIDEO_ID
    - https://www.youtube.com/live/VIDEO_ID

    Returns the identifier, or ``None`` when ``url`` is not a string or
    matches none of the forms above.

    NOTE: this file defines ``extract_video_id`` a second time further down;
    that later definition is the one bound to the name at runtime.
    """
    if not isinstance(url, str):
        return None

    # Captured ids stop at "&", "#" and whitespace so that extra query
    # parameters, fragments and trailing newlines never end up in the id.
    patterns = (
        # Lazily skip any "key=value&" pairs that precede the first "v=".
        r"youtube\.com/watch\?(?:[^&#\s]*&)*?v=([^&#\s]+)",
        r"youtu\.be/([^?&/#\s]+)",
        r"youtube\.com/embed/([^?&/#\s]+)",
        r"youtube\.com/shorts/([^?&/#\s]+)",
        r"youtube\.com/live/([^?&/#\s]+)",
    )

    for pattern in patterns:
        m = re.search(pattern, url)
        if m:
            return m.group(1)

    return None

def get_video_info_from_url(url, cookies_browser="firefox"):
    """
    Fetch the metadata of a single YouTube video designated by its URL.

    Args:
        url: URL handed to yt-dlp.
        cookies_browser: Browser name passed to yt-dlp's
            ``cookiesfrombrowser`` option.

    Returns:
        A dict with the keys ``title``, ``video_id``, ``url``, ``channel``,
        ``duration``, ``view_count``, ``upload_date``, ``upload_datetime``
        and ``description``; or ``None`` when yt-dlp returns nothing, no
        video id can be determined, or an exception is raised.
    """

    ydl_opts = {
        "quiet": True,
        "ignoreerrors": True,
        "skip_download": True,
        "noplaylist": True,
        "cookiesfrombrowser": (cookies_browser,),
        # NOTE(review): presumably selects Deno as yt-dlp's JavaScript
        # runtime — confirm against the yt-dlp documentation.
        "js_runtimes": {
            "deno": {}
        },
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=False)

        if not info:
            return None

        # Prefer the id reported by yt-dlp; fall back to parsing the URL.
        video_id = info.get("id") or extract_video_id(url)

        if not video_id:
            return None

        upload_date = info.get("upload_date")

        return {
            # `or` (rather than a .get default) also replaces a value that is
            # present but None, matching how the channel is handled below.
            "title": info.get("title") or "Sans titre",
            "video_id": video_id,
            "url": f"https://www.youtube.com/watch?v={video_id}",
            "channel": info.get("channel") or info.get("uploader") or "Chaîne inconnue",
            "duration": info.get("duration"),
            "view_count": info.get("view_count"),
            "upload_date": upload_date,
            "upload_datetime": parse_date(upload_date),
            "description": info.get("description") or "",
        }

    except Exception as e:
        # Best-effort boundary: one failing URL must not abort the caller.
        print(f"❌ Erreur avec {url} : {e}")
        return None


def collect_videos_from_urls(urls):
    """
    Build the list of video dicts for an iterable of YouTube URLs.

    Each URL is normalised first; URLs that cannot be normalised are reported
    and skipped. The function pauses one second after every lookup.
    """
    collected = []

    for candidate in urls:
        canonical = normalize_youtube_url(candidate)

        if canonical:
            print(f"🔎 Lecture de {canonical}")

            info = get_video_info_from_url(canonical)
            if info:
                collected.append(info)

            # Pause between lookups.
            time.sleep(1)
        else:
            print(f"⚠ URL invalide ignorée : {candidate}")

    return collected


def normalize_youtube_url(url):
    """
    Convert a YouTube URL to the canonical form
    https://www.youtube.com/watch?v=VIDEO_ID

    Returns ``None`` when no video id can be extracted from ``url``.
    """
    video_id = extract_video_id(url)
    return f"https://www.youtube.com/watch?v={video_id}" if video_id else None

def extract_video_id(url):
    """
    Extract the video identifier from several forms of YouTube URL:
    - https://www.youtube.com/watch?v=VIDEO_ID  (``v`` anywhere in the query)
    - https://youtu.be/VIDEO_ID
    - https://www.youtube.com/embed/VIDEO_ID
    - https://www.youtube.com/shorts/VIDEO_ID
    - https://www.youtube.com/live/VIDEO_ID

    Returns the identifier, or ``None`` when ``url`` is not a string or
    matches none of the forms above.

    NOTE: this rebinds the name ``extract_video_id`` already defined earlier
    in this file, so this is the definition in effect at runtime.
    """
    if not isinstance(url, str):
        return None

    # Captured ids stop at "&", "#" and whitespace so that extra query
    # parameters, fragments and trailing newlines never end up in the id.
    patterns = (
        # Lazily skip any "key=value&" pairs that precede the first "v=".
        r"youtube\.com/watch\?(?:[^&#\s]*&)*?v=([^&#\s]+)",
        r"youtu\.be/([^?&/#\s]+)",
        r"youtube\.com/embed/([^?&/#\s]+)",
        r"youtube\.com/shorts/([^?&/#\s]+)",
        r"youtube\.com/live/([^?&/#\s]+)",
    )

    for pattern in patterns:
        m = re.search(pattern, url)
        if m:
            return m.group(1)

    return None

def enrich_video_date(video, cookies_browser="firefox"):
    """
    Fill in ``upload_date`` / ``upload_datetime`` for a video by querying yt-dlp.

    Slower than relying on already-collected metadata, since it performs one
    extraction per video.

    Args:
        video: Video dict; it is modified in place.
        cookies_browser: Browser name passed to yt-dlp's
            ``cookiesfrombrowser`` option.

    Returns:
        The same ``video`` dict. It is returned untouched when it already has
        a truthy ``upload_datetime``, has no ``url``, or the lookup fails.
    """
    if video.get("upload_datetime"):
        return video

    url = video.get("url")
    if not url:
        return video

    ydl_opts = {
        "quiet": True,
        "ignoreerrors": True,
        "skip_download": True,
        "noplaylist": True,
        "cookiesfrombrowser": (cookies_browser,),
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=False)

        # Handle an empty result explicitly instead of relying on the
        # AttributeError that `None.get` would raise.
        if not info:
            print(f"⚠️ Date non récupérée pour {url} : aucune information renvoyée")
            return video

        upload_date = info.get("upload_date")

        video["upload_date"] = upload_date
        video["upload_datetime"] = parse_date(upload_date)

        if not video.get("title"):
            # `or` also covers a title that is present but None.
            video["title"] = info.get("title") or "Sans titre"

    except Exception as e:
        # Best-effort: the video is returned without a date.
        print(f"⚠️ Date non récupérée pour {url} : {e}")

    return video

def filter_and_sort_videos(videos, min_date):
    """
    Keep the videos uploaded on or after ``min_date``, newest first.

    A video without ``upload_datetime`` is first passed to
    ``enrich_video_date`` (followed by a one-second pause); videos that still
    have no date afterwards are dropped.
    """

    def upload_key(video):
        return video.get("upload_datetime") or datetime.min

    recent = []

    for video in videos:
        if not video.get("upload_datetime"):
            # Date missing: query the video page.
            video = enrich_video_date(video)
            time.sleep(1)

        uploaded = video.get("upload_datetime")

        if not uploaded or not (uploaded >= min_date):
            continue

        recent.append(video)

    return sorted(recent, key=upload_key, reverse=True)

def sort_videos(videos):
    """
    Return all videos ordered from newest to oldest upload.

    A video without ``upload_datetime`` is first passed to
    ``enrich_video_date`` (followed by a one-second pause). Videos that still
    have no date are kept and sort as ``datetime.min``, i.e. last.
    """

    def upload_key(video):
        return video.get("upload_datetime") or datetime.min

    completed = []

    for video in videos:
        if not video.get("upload_datetime"):
            # Date missing: query the video page.
            video = enrich_video_date(video)
            time.sleep(1)
        completed.append(video)

    return sorted(completed, key=upload_key, reverse=True)

def generate_grid_html(videos, filename="index.html"):
    """
    Write an HTML page showing one card per video.

    Args:
        videos: List of video dicts. Each card reads ``title``, ``channel``,
            ``video_id``, ``upload_datetime`` and ``summary``; only
            ``video_id`` is required. The page title uses the date of the
            first video.
        filename: Path of the HTML file to write (UTF-8).

    An empty ``videos`` list produces a page without cards whose title date
    is "?". The "### Barre de navigation ###" lines are written verbatim.
    """

    def format_date(video):
        # A missing or None upload date is displayed as "?".
        uploaded = video.get("upload_datetime")
        return uploaded.strftime("%d/%m/%Y") if uploaded else "?"

    # Guard against an empty list instead of failing on videos[0].
    date_str = format_date(videos[0]) if videos else "?"

    # Collect the fragments and join once rather than growing a string.
    parts = [f"""
<!DOCTYPE html>
<html lang="fr">
<head>
<meta charset="UTF-8">
<style>
  @import url(../style1.css);
</style>
<title>Revue de presse {date_str}</title>
</head>
<body>
### Barre de navigation ###
<h1>Revue de presse</h1>
<div class="grid">
"""]

    for v in videos:
        date_str = format_date(v)
        # `or` fallbacks cover both missing keys and explicit None values,
        # which html_lib.escape() would otherwise reject.
        summary = html_lib.escape(v.get("summary") or "")
        title = html_lib.escape(v.get("title") or "Sans titre")
        channel = html_lib.escape(v.get("channel") or "Chaîne inconnue")
        # Escaped because it is interpolated into an HTML attribute.
        video_id = html_lib.escape(str(v["video_id"]))
        parts.append(f"""
    <div class="video-card">
        <div class="video-title">{title}</div>
        <div class="video-meta">{channel} — {date_str}</div>
        <iframe src="https://www.youtube.com/embed/{video_id}" class="video-iframe" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" allowfullscreen></iframe>
        <div class="summary">{summary}</div>
    </div>
""")

    parts.append("""
</div>
<p><strong>LLM :</strong> gpt-oss:20b-cloud</p>
### Barre de navigation ###
</body>
</html>
""")

    with open(filename, "w", encoding="utf-8") as f:
        f.write("".join(parts))
    print(f"✅ HTML généré : {filename}")



import random

def work(urls):
    """
    Run the whole pipeline for a list of YouTube URLs.

    The videos are collected, sorted by upload date, then each one receives a
    ``transcript`` and a ``summary`` entry. No date filtering is applied.

    Returns:
        The list of video dicts, in sorted order.
    """
    videos = sort_videos(collect_videos_from_urls(urls))

    print(f"✅ {len(videos)} vidéos récupérées")

    for video in videos:
        print(f"📝 Sous-titres : {video.get('title')}")

        video["transcript"] = get_youtube_transcript_ytdlp(video["video_id"])
        video["summary"] = summarize_video_ai(video, "gpt-oss:20b-cloud")

        # Random pause of 3 to 11 seconds between videos.
        pause = 1 + random.uniform(2, 10)
        time.sleep(pause)

    print(f"✅ {len(videos)} vidéos après traitement")

    return videos


# generate_grid_html(videos,filename)


# Exemple :

# L1 = get_urls("page_kronopaf.txt")

# L2 = get_raw("https://kronopaf.toile-libre.org/2026/2026-04-17.html")

# www(L2, 30, "ia2026-04-17.html")

