# DOM-METHODE-AUSNAHME: LinkedIn-Scraper — kein Magnific-Kontext, DOM-Parsing für HTML-Auswertung, kein Magnific-Klick
#!/usr/bin/env python3
"""
LinkedIn Profilfoto-Fetcher für Dispo Resilienz Experten 2026
Verbindet sich via CDP mit 'Mein Chrome' (Port 9222), sucht jede Person auf LinkedIn
und lädt das Profilfoto herunter.

Voraussetzung: 'Mein Chrome' läuft, Victor ist auf LinkedIn eingeloggt.
"""

import json
import time
import urllib.request
import urllib.error
import os
import sys
import re
import struct

# TCP port on which the already-running Chrome exposes its DevTools (CDP) endpoint.
CDP_PORT = 9222
# Downloaded photos and the JSON report go into a "photos" folder beside this script.
OUTPUT_DIR = os.path.join(os.path.dirname(__file__), "photos")

# People to look up on LinkedIn, in processing order. Each name is also
# transliterated by main() into the base name of that person's photo file.
EXPERTEN = [
    "Corinna Sparr",
    "Emre Liman",
    "Gina Schneider",
    "Jens Behrens",
    "Jürgen Höfert",
    "Luca Keller",
    "Lukas Gabriel",
    "Nico Rietbrock",
    "Nicolai Geiger",
    "Oliver Meier",
    "Stefan Schell",
    "Steffen Bölger",
    "Thorsten Niggemann",
    "Tim Gysbers",
]


def get_tab_ws_url():
    """Return the WebSocket debugger URL of the tab to drive (LinkedIn preferred).

    Queries Chrome's ``/json/list`` endpoint on ``CDP_PORT``. That list also
    contains non-tab targets (service workers, iframes, extension pages) on
    which ``Page.navigate`` cannot work, so real pages are preferred.

    Selection order:
      1. the first page whose URL contains "linkedin",
      2. the first page of any kind,
      3. the first target of any type that can be attached to.

    Returns:
        The ``webSocketDebuggerUrl`` string, or None if no attachable target
        is listed.

    Raises:
        urllib.error.URLError / OSError: if the endpoint is unreachable.
    """
    with urllib.request.urlopen(f"http://localhost:{CDP_PORT}/json/list", timeout=5) as r:
        targets = json.loads(r.read())

    # Skip targets that expose no debugger URL instead of failing with KeyError.
    attachable = [t for t in targets if t.get("webSocketDebuggerUrl")]
    pages = [t for t in attachable if t.get("type") == "page"]

    for tab in pages:
        if "linkedin" in tab.get("url", "").lower():
            return tab["webSocketDebuggerUrl"]

    fallback = pages or attachable
    return fallback[0]["webSocketDebuggerUrl"] if fallback else None


def _ws_encode_text_frame(payload):
    """Build one final, masked WebSocket text frame carrying *payload* (bytes)."""
    mask = os.urandom(4)
    masked = bytes(b ^ mask[i % 4] for i, b in enumerate(payload))
    size = len(payload)
    # 0x81 = FIN bit + text opcode; 0x80 in the length byte marks the
    # (mandatory for clients) masking.
    if size <= 125:
        header = bytes([0x81, 0x80 | size])
    elif size <= 0xFFFF:
        header = bytes([0x81, 0x80 | 126]) + struct.pack(">H", size)
    else:
        header = bytes([0x81, 0x80 | 127]) + struct.pack(">Q", size)
    return header + mask + masked


def _ws_decode_frame(buf):
    """Parse one WebSocket frame from the start of *buf*.

    Returns ``(fin, opcode, payload, consumed)`` or None when *buf* does not
    yet hold a complete frame.
    """
    if len(buf) < 2:
        return None
    fin = bool(buf[0] & 0x80)
    opcode = buf[0] & 0x0F
    masked = bool(buf[1] & 0x80)
    length = buf[1] & 0x7F
    offset = 2
    if length == 126:  # 16-bit extended length
        if len(buf) < 4:
            return None
        (length,) = struct.unpack_from(">H", buf, 2)
        offset = 4
    elif length == 127:  # 64-bit extended length (anything above 65535 bytes)
        if len(buf) < 10:
            return None
        (length,) = struct.unpack_from(">Q", buf, 2)
        offset = 10
    mask = b""
    if masked:
        if len(buf) < offset + 4:
            return None
        mask = bytes(buf[offset:offset + 4])
        offset += 4
    end = offset + length
    if len(buf) < end:
        return None
    payload = bytes(buf[offset:end])
    if masked:
        payload = bytes(b ^ mask[i % 4] for i, b in enumerate(payload))
    return fin, opcode, payload, end


def ws_send(ws_url, method, params=None):
    """Send one CDP command over a fresh WebSocket connection and return the reply.

    Opens a new connection to *ws_url*, performs the HTTP upgrade handshake,
    sends ``{"id": 1, "method": method, "params": params}`` as a text frame and
    reads frames until the message with the matching ``id`` arrives.

    Args:
        ws_url: ``ws://host[:port]/path`` debugger URL; the port defaults to
            ``CDP_PORT`` when omitted.
        method: CDP method name, e.g. ``"Page.navigate"``.
        params: Optional dict of method parameters.

    Returns:
        The decoded JSON reply (dict), or None if the upgrade was refused, the
        peer closed the connection, no data arrived for 5 s, or 30 s passed
        without a matching reply.

    Raises:
        ValueError: if *ws_url* is not a ``ws://`` URL, or a received message
            is not valid JSON.
        OSError: on connection errors (including a handshake timeout).
    """
    import base64
    import socket

    match = re.match(r"ws://([^:/]+):?(\d+)?(/.+)", ws_url or "")
    if match is None:
        raise ValueError(f"Not a ws:// debugger URL: {ws_url!r}")
    host, port_text, path = match.groups()
    port = int(port_text) if port_text else CDP_PORT
    command_id = 1

    # The context manager closes the socket on every exit path, errors included.
    with socket.create_connection((host, port), timeout=15) as sock:
        key = base64.b64encode(os.urandom(16)).decode()
        handshake = (
            f"GET {path} HTTP/1.1\r\nHost: {host}:{port}\r\nUpgrade: websocket\r\n"
            f"Connection: Upgrade\r\nSec-WebSocket-Key: {key}\r\nSec-WebSocket-Version: 13\r\n\r\n"
        )
        sock.sendall(handshake.encode())

        buf = bytearray()
        while b"\r\n\r\n" not in buf:
            chunk = sock.recv(4096)
            if not chunk:  # peer closed before finishing the handshake
                return None
            buf += chunk
        head_end = buf.index(b"\r\n\r\n") + 4
        status_line = bytes(buf[:head_end]).split(b"\r\n", 1)[0]
        if b" 101" not in status_line:  # upgrade refused
            return None
        # Keep whatever followed the header; it already belongs to the frame stream.
        del buf[:head_end]

        cmd = json.dumps({"id": command_id, "method": method, "params": params or {}}).encode()
        sock.sendall(_ws_encode_text_frame(cmd))

        message = bytearray()  # collects the fragments of one logical message
        deadline = time.time() + 30
        sock.settimeout(5)
        while time.time() < deadline:
            frame = _ws_decode_frame(buf)
            if frame is None:
                try:
                    chunk = sock.recv(65536)
                except socket.timeout:
                    return None
                if not chunk:
                    return None
                buf += chunk
                continue

            fin, opcode, payload, consumed = frame
            del buf[:consumed]
            if opcode == 0x8:  # close
                return None
            if opcode in (0x9, 0xA):  # ping / pong carry no CDP data
                continue
            message += payload
            if not fin:
                continue

            reply = json.loads(message.decode())
            message.clear()
            # Skip unsolicited events; only the reply to our command counts.
            if isinstance(reply, dict) and reply.get("id") == command_id:
                return reply
    return None


def navigate_and_wait(url, wait=4):
    """Point the controlled tab at *url*, then pause *wait* seconds.

    The pause is a fixed sleep; no load event is awaited.
    """
    ws_send(get_tab_ws_url(), "Page.navigate", {"url": url})
    time.sleep(wait)


def get_page_html():
    """Return the current page's full HTML (``outerHTML`` of the root element).

    Evaluates ``document.documentElement.outerHTML`` in the controlled tab via
    ``Runtime.evaluate``.

    Returns:
        The HTML as a string, or "" when no usable reply was received.
    """
    ws = get_tab_ws_url()
    reply = ws_send(ws, "Runtime.evaluate", {
        "expression": "document.documentElement.outerHTML",
        "returnByValue": True
    })
    if not isinstance(reply, dict):
        return ""
    # A CDP reply is {"id": ..., "result": {"result": <RemoteObject>}}: the
    # outer "result" is the command's return object, the inner one is the
    # evaluated value. The string therefore sits two levels deep.
    command_result = reply.get("result")
    if not isinstance(command_result, dict):
        return ""
    remote_object = command_result.get("result")
    if not isinstance(remote_object, dict):
        return ""
    value = remote_object.get("value", "")
    return value if isinstance(value, str) else ""


def find_profile_photo_url(html):
    """Extract the first usable LinkedIn profile-photo URL from page HTML.

    Patterns are tried from most to least specific; within one pattern every
    match is considered, so a placeholder ("ghost") image does not hide a real
    photo that appears later in the page.

    Args:
        html: Page markup as a string.

    Returns:
        The photo URL with HTML entities decoded (so ``&amp;`` in a query
        string becomes ``&``), or None if nothing suitable is found.
    """
    # Imported locally under a distinct name because the parameter `html`
    # shadows the module name inside this function.
    from html import unescape

    # (pattern, from_attribute): attribute values in serialized HTML carry
    # entity-escaped ampersands, so they are matched up to the closing quote
    # and decoded afterwards. The embedded-JSON pattern is left undecoded.
    patterns = [
        (r'src="(https://media\.licdn\.com/dms/image/v2/[^"]+)"[^>]*class="[^"]*EntityPhoto[^"]*"', True),
        (r'class="[^"]*EntityPhoto[^"]*"[^>]*src="(https://media\.licdn\.com/dms/image/[^"]+)"', True),
        (r'"displayImageUrl":"(https://media\.licdn\.com/dms/image/[^"]+?)"', False),
        (r'src="(https://media\.licdn\.com/dms/image/v2/[^"]+/profile-displayphoto[^"]*)"', True),
        (r'src="(https://media\.licdn\.com/dms/image/[^"]+/profile-displayphoto[^"]*)"', True),
    ]
    for pattern, from_attribute in patterns:
        for found in re.finditer(pattern, html):
            url = unescape(found.group(1)) if from_attribute else found.group(1)
            # Skip placeholder avatars and implausibly short URLs.
            if "ghost" not in url.lower() and len(url) > 60:
                return url
    return None


def search_and_fetch(name):
    """Search LinkedIn for *name*, open the first profile hit, return its photo URL.

    Args:
        name: Person's name used as the search keywords.

    Returns:
        The profile-photo URL found on the first matching profile, or None if
        the search page contains no profile link or the profile shows no
        usable photo.
    """
    query = urllib.parse.quote(name)
    navigate_and_wait(f"https://www.linkedin.com/search/results/people/?keywords={query}", wait=4)
    html = get_page_html()

    # Accept relative and absolute profile links, with or without a trailing
    # slash or query string. The slug may contain percent-encoded characters
    # and underscores. NOTE(review): this assumes search results may link
    # absolute URLs -- confirm against the live markup.
    hit = re.search(
        r'href="(?:https?://(?:[\w-]+\.)?linkedin\.com)?/in/([A-Za-z0-9_%\-]+)',
        html,
    )
    if hit is None:
        print("  ⚠ Kein Profil gefunden")
        return None

    navigate_and_wait(f"https://www.linkedin.com/in/{hit.group(1)}/", wait=4)
    return find_profile_photo_url(get_page_html())


def download_image(url, path):
    """Download *url* and store it at *path*; report success as a bool.

    The body is read completely before anything is written, then written to
    a temporary sibling file and moved into place. A failed or empty download
    therefore never leaves a file at *path* that a later run could mistake
    for a finished photo.

    Args:
        url: Image URL to fetch.
        path: Destination file path.

    Returns:
        True if a non-empty response was saved, False otherwise (the error is
        printed, never raised).
    """
    req = urllib.request.Request(url, headers={
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
        "Referer": "https://www.linkedin.com/",
    })
    tmp_path = f"{path}.part"
    try:
        with urllib.request.urlopen(req, timeout=15) as r:
            data = r.read()
        if not data:
            raise ValueError("Leere Antwort vom Server")
        with open(tmp_path, "wb") as f:
            f.write(data)
        os.replace(tmp_path, path)
        return True
    except Exception as e:
        # Deliberately broad: this helper's contract is "never raise".
        print(f"  ✗ {e}")
        try:
            os.remove(tmp_path)
        except OSError:
            pass  # no temporary file was created
        return False


def check_cdp():
    """Probe Chrome's DevTools endpoint on ``CDP_PORT``.

    Returns:
        ``(True, browser)`` with the "Browser" field of ``/json/version``
        ("?" if absent) when reachable, else ``(False, error_text)``.
    """
    endpoint = f"http://localhost:{CDP_PORT}/json/version"
    try:
        with urllib.request.urlopen(endpoint, timeout=3) as resp:
            raw = resp.read()
        browser = json.loads(raw).get("Browser", "?")
    except Exception as exc:
        return False, str(exc)
    return True, browser


def check_login():
    """Open the LinkedIn feed and report whether the page looks logged in.

    Returns False when the page HTML contains "join now" or "sign in"
    (case-insensitive), True otherwise.
    """
    navigate_and_wait("https://www.linkedin.com/feed/", wait=3)
    page = get_page_html().lower()
    logged_out_markers = ("join now", "sign in")
    return not any(marker in page for marker in logged_out_markers)


import urllib.parse


# Transliteration for photo file names: umlauts and sharp s become ASCII
# digraphs (lower-case for both cases, so existing file names stay valid),
# spaces become underscores.
_FILENAME_TABLE = str.maketrans({
    "ä": "ae", "Ä": "ae",
    "ö": "oe", "Ö": "oe",
    "ü": "ue", "Ü": "ue",
    "ß": "ss",
    " ": "_",
})


def _safe_filename(name):
    """Return the file-name-safe form of a person's name."""
    return name.translate(_FILENAME_TABLE)


def _fetch_photo(name, jpg):
    """Find and download the photo for *name* to *jpg*; return *jpg* or None."""
    try:
        url = search_and_fetch(name)
    except Exception as exc:
        # Batch boundary: one person's failure must not abort the whole run
        # and lose the report for everyone else.
        print(f"  ✗ Fehler: {exc}")
        return None
    if not url:
        print("  ✗ Kein Foto gefunden")
        return None

    print(f"  Foto: {url[:70]}...")
    if not download_image(url, jpg):
        return None
    sz = os.path.getsize(jpg)
    print(f"  ✓ {jpg} ({sz//1024} KB)")
    return jpg


def main():
    """Fetch the LinkedIn profile photo of every person in ``EXPERTEN``.

    Verifies that Chrome is reachable and LinkedIn is logged in (exiting with
    status 1 otherwise), downloads each missing photo into ``OUTPUT_DIR``,
    prints a summary and writes ``fetch_report.json`` mapping each name to
    its photo path (or null).
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print("=== LinkedIn Profilfoto-Fetcher — Resilienz Experten 2026 ===")
    print(f"Ausgabe: {OUTPUT_DIR}\n")

    ok, info = check_cdp()
    if not ok:
        print(f"✗ Chrome nicht erreichbar (Port {CDP_PORT}): {info}")
        print("  → 'Mein Chrome' starten und auf LinkedIn einloggen, dann erneut starten.")
        sys.exit(1)
    print(f"✓ Chrome: {info}")

    print("Prüfe LinkedIn-Login...")
    if not check_login():
        print("✗ Nicht eingeloggt — bitte LinkedIn öffnen und einloggen.")
        sys.exit(1)
    print("✓ Eingeloggt\n")

    print(f"{len(EXPERTEN)} Experten:\n" + "\n".join(f"  - {n}" for n in EXPERTEN) + "\n")

    results = {}
    for name in EXPERTEN:
        print(f"[{name}]")
        jpg = os.path.join(OUTPUT_DIR, f"{_safe_filename(name)}.jpg")

        # A zero-byte leftover from an aborted download is not a photo.
        if os.path.exists(jpg) and os.path.getsize(jpg) > 0:
            sz = os.path.getsize(jpg)
            print(f"  ✓ Bereits vorhanden ({sz//1024} KB)")
            results[name] = jpg
            continue

        results[name] = _fetch_photo(name, jpg)
        # Pause between people to avoid hammering LinkedIn.
        time.sleep(2)

    print("\n=== Ergebnis ===")
    ok_n = sum(1 for v in results.values() if v)
    print(f"{ok_n}/{len(EXPERTEN)} Fotos geholt")
    for n, p in results.items():
        print(f"  {'✓' if p else '✗'} {n}")

    # Explicit UTF-8: with ensure_ascii=False the names contain umlauts, and
    # the platform default encoding would make the report locale-dependent.
    with open(os.path.join(OUTPUT_DIR, "fetch_report.json"), "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)


# Run the fetcher only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()