import requests
#from bs4 import BeautifulSoup
from urllib.parse import urljoin

def get_video_urls(url):
    """Download the page at *url* and return the video links found in it.

    The page is fetched with a browser-like ``User-Agent`` and parsed with
    BeautifulSoup's ``html.parser``. Both ``<a href>`` links and
    ``<iframe src>`` embeds are inspected; each value is resolved against
    *url* and kept when ``is_video_url`` accepts it.

    Args:
        url: Address of the HTML page to scan.

    Returns:
        A list of absolute URLs, without duplicates, in order of first
        appearance (all ``<a>`` matches first, then ``<iframe>`` matches).

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            server answers with an error status (``raise_for_status``).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; Firefox/120.0)",
        "Accept": "text/html,application/xhtml+xml",
        "Connection": "close",
    }

    # Use the session as a context manager so its connection pool is
    # released even when the request or the status check raises.
    with requests.Session() as session:
        response = session.get(
            url,
            headers=headers,
            timeout=30,
            allow_redirects=True,
        )
        response.raise_for_status()
        html = response.text

    soup = BeautifulSoup(html, "html.parser")

    video_urls = []

    # Plain links (<a href="">) first, then embedded players (<iframe src="">).
    for tag_name, attribute in (("a", "href"), ("iframe", "src")):
        for tag in soup.find_all(tag_name, attrs={attribute: True}):
            full_url = urljoin(url, tag[attribute])
            if is_video_url(full_url):
                video_urls.append(full_url)

    # dict.fromkeys drops duplicates while keeping first-seen order.
    return list(dict.fromkeys(video_urls))


def is_video_url(url):
    """Return True when *url* contains a known video-site fragment.

    The check is a plain substring test against a fixed list of fragments.
    A function with the same name is defined again further down this file;
    the list here includes the ``youtube.com/shorts/`` fragment so both
    definitions accept the same URLs.

    Args:
        url: The URL string to test.

    Returns:
        True if any known fragment occurs in *url*, False otherwise.
    """
    video_patterns = (
        "youtube.com/watch",
        "youtube.com/embed/",
        "youtube.com/shorts/",
        "youtu.be/",
        "dailymotion.com/video",
        "vimeo.com",
    )

    return any(pattern in url for pattern in video_patterns)

def get_raw(url="http://kronopaf.toile-libre.org/2026/2026-04-28+.html"):
    """Scan the page at *url* for video links, print them and return them.

    Network errors are reported on standard output instead of being
    propagated.

    Args:
        url: Address of the page to scan.

    Returns:
        The list of video URLs returned by ``get_video_urls``, or an empty
        list when the request failed.
    """
    try:
        videos = get_video_urls(url)
    except requests.exceptions.RequestException as e:
        print("Erreur réseau :", e)
        # Return an empty list here: falling through to a shared
        # `return videos` would raise UnboundLocalError, because the
        # assignment above never completed.
        return []

    print(f"{len(videos)} vidéo(s) trouvée(s) :")
    for video in videos:
        print(video)

    return videos

from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, parse_qs

def normalize_video_url(url):
    """Rewrite the known YouTube URL shapes to the canonical watch form.

    Handled shapes are ``youtube.com/embed/<id>``, ``youtube.com/shorts/<id>``,
    ``youtu.be/<id>`` and ``youtube.com/watch?v=<id>``. Each becomes
    ``https://www.youtube.com/watch?v=<id>``; any extra path segments or
    query parameters are dropped.

    Args:
        url: The URL string to normalise.

    Returns:
        The canonical watch URL when a non-empty video id can be extracted,
        otherwise *url* unchanged.
    """
    parsed = urlparse(url)

    # Compare the host name exactly (or as a sub-domain) rather than by
    # substring, so look-alike hosts such as "notyoutube.com" are left
    # untouched. `hostname` is already lower-cased and has no port.
    host = parsed.hostname or ""
    is_youtube = host == "youtube.com" or host.endswith(".youtube.com")
    is_short_link = host == "youtu.be" or host.endswith(".youtu.be")

    video_id = None

    if is_short_link:
        # youtu.be/<id>
        video_id = parsed.path.strip("/").split("/")[0]
    elif is_youtube:
        for prefix in ("/embed/", "/shorts/"):
            if parsed.path.startswith(prefix):
                video_id = parsed.path[len(prefix):].split("/")[0]
                break
        else:
            # Neither the embed nor the shorts prefix matched.
            if parsed.path == "/watch":
                video_id = parse_qs(parsed.query).get("v", [None])[0]

    # An empty id (e.g. "https://youtu.be/") would yield a meaningless
    # "watch?v=" URL, so such inputs are returned as-is.
    if video_id:
        return f"https://www.youtube.com/watch?v={video_id}"

    return url


def is_video_url(url):
    """Tell whether *url* contains one of the known video-site fragments.

    The test is a plain substring search; the first fragment found ends
    the scan.

    Args:
        url: The URL string to examine.

    Returns:
        True when a fragment is found in *url*, False otherwise.
    """
    known_fragments = (
        "youtube.com/watch",
        "youtube.com/embed/",
        "youtube.com/shorts/",
        "youtu.be/",
        "dailymotion.com/video",
        "vimeo.com",
    )
    for fragment in known_fragments:
        if fragment in url:
            return True
    return False


def get_urls(filepath):
    """Collect the video URLs listed, one per line, in a UTF-8 text file.

    Each line is stripped of surrounding whitespace. Blank lines and lines
    rejected by ``is_video_url`` are ignored; the rest go through
    ``normalize_video_url``. The de-duplicated result is printed and
    returned.

    Args:
        filepath: Path of the text file to read.

    Returns:
        The list of normalised video URLs, in order of first appearance.
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        candidates = (raw_line.strip() for raw_line in handle)
        collected = [
            normalize_video_url(candidate)
            for candidate in candidates
            if candidate and is_video_url(candidate)
        ]

    # Remove duplicates while preserving first-seen order.
    unique_urls = list(dict.fromkeys(collected))

    print(f"{len(unique_urls)} vidéo(s) trouvée(s) :")
    for entry in unique_urls:
        print(entry)

    return unique_urls


# Usage example
def test():
    """Run both extraction paths and print their results one after the other.

    Reads URLs from ``page_kronopaf.txt`` with ``get_urls``, then scans a
    fixed page with ``get_raw``, and finally prints each result list under
    a count header, separated by a dashed line.
    """
    from_file = get_urls("page_kronopaf.txt")
    from_page = get_raw("https://kronopaf.toile-libre.org/2026/2026-04-17.html")

    def report(found):
        # Count header followed by one URL per line.
        print(f"{len(found)} vidéo(s) trouvée(s) :")
        for item in found:
            print(item)

    # Two spacer lines between the helpers' own output and the summary.
    for _ in range(2):
        print(" ")

    report(from_file)
    print("-------------------------------------")
    report(from_page)
