# INSTRUMENT-AUSNAHME: Victor-Go — Holmes Atemtest, 6 Varianten
"""
Generiert denselben Satz 6x mit verschiedenen Ansätzen.
Output: watson_demo_clips/atemtest_*.mp3
"""
import base64
import json
import os
import subprocess
import time
import urllib.error
import urllib.request
from pathlib import Path

from pydub import AudioSegment, effects

# SECURITY: the ElevenLabs API key used to be embedded here as a string
# literal. A credential stored in source is readable by everyone with access
# to the file or its history, so it is now read from the environment instead.
# The previously embedded key should be treated as compromised and rotated.
# If the variable is unset the key is empty and the API will reject the
# requests; the main loop logs such HTTP errors per variant.
API_KEY = os.environ.get("ELEVENLABS_API_KEY", "")
# Directory the generated clips are written to.
CLIPS   = Path("/Users/victorholland/Vibe Coding/dispatcher/cockpit/watson_demo_clips")
# Plain-text run log; log() appends one line per message.
LOG     = Path("/tmp/holmes_atemtest.log")
# Voice ID interpolated into the text-to-speech request URL.
HOLMES  = "kMg8CgFduU4YUWmbRBx1"
# Start every run with an empty log file.
LOG.write_text("")

def log(msg):
    """Print *msg* to stdout and append it as one line to the LOG file.

    Args:
        msg: Text to emit; a trailing newline is added in the file only.

    The file is opened as UTF-8 explicitly. The messages this script logs
    contain non-ASCII characters (box-drawing dashes, arrows, check marks),
    and the platform default encoding is locale-dependent, so relying on it
    can raise UnicodeEncodeError.
    """
    print(msg, flush=True)
    with LOG.open("a", encoding="utf-8") as f:
        f.write(msg + "\n")

# Test sentence: medium-length, somewhat dramatic — meant to be representative.
TEXT = "Watson. Ich habe es von Anfang an gewusst. Alles hat seinen Zeitpunkt."

# SSML pause that the main loop places before and after TEXT.
BREAK = '<break time="600ms"/>'

# One tuple per variant to synthesize. The label becomes part of the output
# file names and is compared literally in the main loop ("F_nossmll" selects
# the variant without SSML breaks), so its spelling must not be changed here
# without changing it there as well.
TESTS = [
    # (label, description, model, stability, similarity, style, speaker_boost)
    ("A_alt",    "Alt-Settings (v1)",          "eleven_multilingual_v2", 0.92, 0.88, 0.04, True),
    ("B_aktuell","Aktuell (v2)",               "eleven_multilingual_v2", 0.80, 0.55, 0.0,  False),
    ("C_lowsim", "Sehr niedrige Similarity",   "eleven_multilingual_v2", 0.80, 0.25, 0.0,  False),
    ("D_maxstab","Max Stability + niedrig sim","eleven_multilingual_v2", 0.96, 0.30, 0.0,  False),
    ("E_turbo",  "Turbo Modell",               "eleven_turbo_v2_5",      0.80, 0.55, 0.0,  False),
    ("F_nossmll","Kein SSML-Break",            "eleven_multilingual_v2", 0.80, 0.55, 0.0,  False),
]

def get_speech_bounds(alignment, audio_len_ms):
    """Return ``(start_ms, end_ms)`` of the spoken text inside a clip.

    Args:
        alignment: Mapping with the parallel lists ``characters``,
            ``character_start_times_seconds`` and
            ``character_end_times_seconds``. May be ``None`` or empty, and
            any of the lists may be missing or ``None``.
        audio_len_ms: Length of the clip in milliseconds; only used for the
            fallback bounds.

    Returns:
        The start time of the first and the end time of the last character
        that is not part of a ``<...>`` tag, converted from seconds to whole
        milliseconds (truncated). If no such character exists, the fallback
        ``(30, audio_len_ms - 30)`` is returned.
    """
    fallback = (30, audio_len_ms - 30)
    if not alignment:
        # No alignment data at all (None or empty mapping).
        return fallback

    chars = alignment.get("characters") or []
    starts = alignment.get("character_start_times_seconds") or []
    ends = alignment.get("character_end_times_seconds") or []

    spoken = []  # (start_s, end_s) of every character outside a tag
    in_tag = False
    for c, s, e in zip(chars, starts, ends):
        if c == "<":
            in_tag = True
        if not in_tag:
            spoken.append((s, e))
        # Reset only after the check above so the closing '>' itself still
        # counts as part of the tag.
        if c == ">":
            in_tag = False

    if not spoken:
        return fallback
    return int(spoken[0][0] * 1000), int(spoken[-1][1] * 1000)

# Make sure the output directory exists before an API call is spent on a
# clip that could not be written afterwards.
CLIPS.mkdir(parents=True, exist_ok=True)

for label, desc, model, stability, similarity, style, boost in TESTS:
    out_path = CLIPS / f"atemtest_{label}.mp3"
    raw_path = CLIPS / f"atemtest_{label}_raw.mp3"
    log(f"\n── {label}: {desc} ──")

    # Every variant except "F_nossmll" wraps the sentence in SSML breaks.
    if label == "F_nossmll":
        ssml = TEXT
    else:
        ssml = f"<speak>{BREAK}{TEXT}{BREAK}</speak>"

    payload = json.dumps({
        "text": ssml,
        "model_id": model,
        "voice_settings": {
            "stability": stability,
            "similarity_boost": similarity,
            "style": style,
            "use_speaker_boost": boost,
        }
    }).encode()

    req = urllib.request.Request(
        f"https://api.elevenlabs.io/v1/text-to-speech/{HOLMES}/with-timestamps",
        data=payload,
        headers={"xi-api-key": API_KEY, "Content-Type": "application/json",
                 "Accept": "application/json"},
        method="POST"
    )
    try:
        with urllib.request.urlopen(req, timeout=60) as r:
            resp = json.loads(r.read())
    except urllib.error.HTTPError as e:
        # Decode the (truncated) error body so the log shows readable text
        # rather than a bytes repr.
        detail = e.read()[:150].decode("utf-8", errors="replace")
        log(f"  ✗ HTTP {e.code}: {detail}")
        continue
    except (OSError, ValueError) as e:
        # URLError and socket timeouts are OSError subclasses; a malformed
        # JSON body raises a ValueError subclass. One failing variant must
        # not abort the remaining ones.
        log(f"  ✗ Anfrage fehlgeschlagen: {e}")
        continue

    audio_b64 = resp.get("audio_base64")
    if not audio_b64:
        log("  ✗ Antwort ohne audio_base64")
        continue

    # Write the raw file untouched (no trimming) so the full version can be
    # listened to as well.
    raw_path.write_bytes(base64.b64decode(audio_b64))

    # Trimmed version: 30 ms lead, 400 ms tail, plus an 80 ms fade-out.
    # A missing or null alignment falls back to an empty mapping, for which
    # get_speech_bounds() returns its default bounds.
    alignment = resp.get("alignment") or {}
    raw_audio = AudioSegment.from_mp3(str(raw_path))
    norm = effects.normalize(raw_audio, headroom=1.0)
    s_ms, e_ms = get_speech_bounds(alignment, len(norm))
    cut_start = max(0, s_ms - 30)
    cut_end   = min(len(norm), e_ms + 400)   # 400 ms instead of 200 ms
    clipped   = norm[cut_start:cut_end]
    # Short fade-out at the end so that a breath still running at the cut
    # point does not end abruptly.
    faded = clipped.fade_out(80)
    faded.export(str(out_path), format="mp3", bitrate="192k")

    dur_raw = len(raw_audio) / 1000
    dur_cut = len(faded) / 1000
    log(f"  raw: {dur_raw:.2f}s  getrimmt: {dur_cut:.2f}s  speech: {s_ms}–{e_ms}ms")
    log(f"  → atemtest_{label}.mp3 + atemtest_{label}_raw.mp3")
    # Brief pause before the next request.
    time.sleep(0.5)

log("\n=== Fertig ===")
print("\nFertig. Dateien in watson_demo_clips/atemtest_*.mp3")
