using System.Diagnostics;
using System.Globalization;
using System.Text.Json;
using System.Text.RegularExpressions;
using System.Web;
using YoutubeSummarizer.Models;

namespace YoutubeSummarizer.Services;

/// <summary>
/// Uses yt-dlp (https://github.com/yt-dlp/yt-dlp) to retrieve video metadata
/// and download caption tracks. No YouTube API key required.
///
/// yt-dlp must be installed and available on PATH (e.g. <c>pip install yt-dlp</c>);
/// every operation here launches it as a child process.
/// </summary>
public sealed class YouTubeService
{
    /// <summary>Exactly 11 characters from the URL-safe base64 alphabet.</summary>
    private static readonly Regex VideoIdRegex =
        new(@"^[A-Za-z0-9_-]{11}\z", RegexOptions.Compiled | RegexOptions.CultureInvariant);

    /// <summary>Any run of whitespace (collapsed to one space in caption text).</summary>
    private static readonly Regex WhitespaceRegex = new(@"\s+", RegexOptions.Compiled);

    /// <summary>Inline markup such as <c>&lt;c&gt;</c>, <c>&lt;i&gt;</c> or inline VTT timestamps.</summary>
    private static readonly Regex MarkupTagRegex = new(@"<[^>]+>", RegexOptions.Compiled);

    /// <summary>
    /// A cue timing line: <c>[HH:]MM:SS[.mmm] --&gt; [HH:]MM:SS[.mmm]</c>.
    /// Accepts both "." (VTT) and "," (SRT) as the fraction separator; trailing cue
    /// settings after the end timestamp are ignored.
    /// </summary>
    private static readonly Regex CueTimingRegex = new(
        @"^((?:\d{1,4}:)?\d{1,2}:\d{2}(?:[.,]\d+)?)\s*-->\s*((?:\d{1,4}:)?\d{1,2}:\d{2}(?:[.,]\d+)?)",
        RegexOptions.Compiled | RegexOptions.CultureInvariant);

    // Not used by any method in this class; kept so the constructor signature is unchanged.
    private readonly HttpClient _httpClient;

    /// <summary>Creates the service.</summary>
    /// <param name="httpClient">HTTP client supplied by the caller; stored but not used by this class.</param>
    public YouTubeService(HttpClient httpClient)
    {
        _httpClient = httpClient;
    }

    // ─────────────────────────────────────────────────────────────────────────
    // Public API
    // ─────────────────────────────────────────────────────────────────────────

    /// <summary>
    /// Parses a YouTube video ID from any common URL format.
    /// Handles: <c>watch?v=</c>, <c>youtu.be/</c>, <c>/embed/</c>, <c>/shorts/</c>, <c>/live/</c>,
    /// <c>/v/</c>, URLs pasted without a scheme, and a bare 11-character ID.
    /// </summary>
    /// <param name="url">The URL or raw ID as typed/pasted by the user.</param>
    /// <returns>
    /// The 11-character video ID, or <c>null</c> when the input is empty or contains
    /// no well-formed ID. Every returned value matches <c>[A-Za-z0-9_-]{11}</c>.
    /// </returns>
    public static string? ExtractVideoId(string url)
    {
        if (string.IsNullOrWhiteSpace(url))
            return null;

        // Normalize — strip whitespace the user may have pasted
        url = url.Trim();

        // Raw ID passed directly
        if (IsValidVideoId(url))
            return url;

        Uri? uri;
        if (!Uri.TryCreate(url, UriKind.Absolute, out uri))
        {
            // Users frequently paste "youtube.com/watch?v=…" without a scheme.
            if (!Uri.TryCreate("https://" + url, UriKind.Absolute, out uri))
                return null;
        }

        var segments = uri.AbsolutePath.Split('/', StringSplitOptions.RemoveEmptyEntries);

        // youtu.be short links: https://youtu.be/VIDEO_ID
        var host = uri.Host;
        var isShortLink = host.Equals("youtu.be", StringComparison.OrdinalIgnoreCase)
                          || host.EndsWith(".youtu.be", StringComparison.OrdinalIgnoreCase);
        if (isShortLink && segments.Length > 0 && IsValidVideoId(segments[0]))
            return segments[0];

        // Standard URLs: ?v=VIDEO_ID
        var vParam = HttpUtility.ParseQueryString(uri.Query)["v"];
        if (vParam is not null && IsValidVideoId(vParam))
            return vParam;

        // Path-style URLs: /embed/VIDEO_ID, /shorts/VIDEO_ID, /live/VIDEO_ID, /v/VIDEO_ID
        for (var i = 0; i < segments.Length - 1; i++)
        {
            if ((segments[i] is "embed" or "shorts" or "live" or "v") && IsValidVideoId(segments[i + 1]))
                return segments[i + 1];
        }

        return null;
    }

    /// <summary>
    /// Fetches metadata for a video using <c>yt-dlp --dump-json</c>.
    /// No API key required.
    /// </summary>
    /// <param name="videoId">The video ID, interpolated into a <c>watch?v=</c> URL.</param>
    /// <param name="ct">Cancels the wait; the yt-dlp process is killed on cancellation.</param>
    /// <returns>
    /// The parsed metadata, or <c>null</c> when yt-dlp exits non-zero, prints nothing,
    /// or prints something that is not a JSON object.
    /// </returns>
    /// <exception cref="OperationCanceledException"><paramref name="ct"/> was cancelled.</exception>
    /// <exception cref="System.ComponentModel.Win32Exception">yt-dlp could not be started (e.g. not on PATH).</exception>
    public async Task<VideoMetadata?> GetVideoMetadataAsync(string videoId, CancellationToken ct = default)
    {
        var (exitCode, json) = await RunYtDlpAsync(
            new[] { "--dump-json", "--no-download", $"https://www.youtube.com/watch?v={videoId}" },
            ct);

        if (exitCode != 0 || string.IsNullOrWhiteSpace(json))
            return null;

        try
        {
            using var doc = JsonDocument.Parse(json);
            var root = doc.RootElement;
            if (root.ValueKind != JsonValueKind.Object)
                return null;

            var title = GetStringOrNull(root, "title") ?? "(no title)";
            var channel = GetStringOrNull(root, "channel")
                          ?? GetStringOrNull(root, "uploader")
                          ?? "(unknown channel)";
            var description = GetStringOrNull(root, "description");

            // Upload date comes as "YYYYMMDD". Parse with the invariant culture so that
            // machines configured with a non-Gregorian calendar read the same date.
            var publishedAt = DateTimeOffset.MinValue;
            if (GetStringOrNull(root, "upload_date") is { } dateStr
                && DateTime.TryParseExact(dateStr, "yyyyMMdd", CultureInfo.InvariantCulture,
                                          DateTimeStyles.None, out var parsed))
            {
                publishedAt = new DateTimeOffset(parsed, TimeSpan.Zero);
            }

            // yt-dlp returns duration in seconds; build an ISO 8601 duration string
            // for FormattedDuration compatibility. The range guard keeps both
            // TimeSpan.FromSeconds and the (int) hour cast from overflowing.
            string? isoDuration = null;
            if (root.TryGetProperty("duration", out var dur)
                && dur.ValueKind == JsonValueKind.Number
                && dur.TryGetDouble(out var seconds)
                && seconds >= 0
                && seconds < int.MaxValue)
            {
                var ts = TimeSpan.FromSeconds(seconds);
                isoDuration = $"PT{(int)ts.TotalHours}H{ts.Minutes}M{ts.Seconds}S";
            }

            return new VideoMetadata
            {
                VideoId = videoId,
                Title = title,
                ChannelTitle = channel,
                PublishedAt = publishedAt,
                Duration = isoDuration,
                Description = description
            };
        }
        catch (JsonException)
        {
            // Output was not valid JSON.
            return null;
        }
    }

    /// <summary>
    /// Retrieves the best available transcript for the video using yt-dlp.
    ///
    /// yt-dlp is invoked to download subtitle files (preferring manual English
    /// captions, falling back to auto-generated). The downloaded file is parsed
    /// into clean plain text plus timestamped segments.
    ///
    /// If yt-dlp exits with an error or no captions exist, returns a metadata-only
    /// transcript built from the video description.
    /// </summary>
    /// <param name="metadata">Metadata of the video whose captions are wanted.</param>
    /// <param name="ct">Cancels the operation; a running yt-dlp process is killed.</param>
    /// <returns>A caption-based transcript, or a metadata-only transcript as fallback.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="metadata"/> is <c>null</c>.</exception>
    /// <exception cref="OperationCanceledException"><paramref name="ct"/> was cancelled.</exception>
    /// <exception cref="System.ComponentModel.Win32Exception">yt-dlp could not be started (e.g. not on PATH).</exception>
    public async Task<VideoTranscript> GetTranscriptAsync(
        VideoMetadata metadata,
        CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(metadata);

        // Try manual (human-written) subtitles first, then auto-generated
        var (text, segments, isAuto) = await DownloadSubtitlesWithTimestampsAsync(metadata.VideoId, ct);

        if (string.IsNullOrWhiteSpace(text))
        {
            // No captions at all — fall back to the description text
            return BuildMetadataOnlyTranscript(metadata);
        }

        return new VideoTranscript
        {
            VideoId = metadata.VideoId,
            Text = text,
            Segments = segments,
            SourceTrack = new CaptionTrack
            {
                TrackId = "yt-dlp",
                Language = "en",
                TrackKind = isAuto ? "asr" : "standard",
                Name = isAuto ? "Auto-generated (en)" : "English"
            },
            Source = isAuto ? TranscriptSource.AutoGenerated : TranscriptSource.OwnerPublished
        };
    }

    // ─────────────────────────────────────────────────────────────────────────
    // Private helpers
    // ─────────────────────────────────────────────────────────────────────────

    /// <summary>Returns true when <paramref name="candidate"/> is a well-formed 11-character video ID.</summary>
    private static bool IsValidVideoId(string candidate) => VideoIdRegex.IsMatch(candidate);

    /// <summary>
    /// Reads property <paramref name="name"/> of a JSON object as a string.
    /// Returns <c>null</c> when the property is absent or is not a JSON string,
    /// so an unexpected value type never throws.
    /// </summary>
    private static string? GetStringOrNull(JsonElement obj, string name) =>
        obj.TryGetProperty(name, out var value) && value.ValueKind == JsonValueKind.String
            ? value.GetString()
            : null;

    /// <summary>
    /// Launches yt-dlp with the given arguments and waits for it to exit.
    /// Both output pipes are drained concurrently so the child cannot block on a
    /// full pipe buffer. If <paramref name="ct"/> is cancelled the process tree is
    /// killed before the cancellation propagates.
    /// </summary>
    /// <returns>The exit code and everything written to standard output.</returns>
    private static async Task<(int ExitCode, string StdOut)> RunYtDlpAsync(
        IEnumerable<string> arguments,
        CancellationToken ct)
    {
        var psi = new ProcessStartInfo
        {
            FileName = "yt-dlp",
            RedirectStandardOutput = true,
            RedirectStandardError = true,
            UseShellExecute = false,
            CreateNoWindow = true,
        };
        // ArgumentList passes each argument verbatim — no shell quoting involved.
        foreach (var argument in arguments)
            psi.ArgumentList.Add(argument);

        using var process = new Process { StartInfo = psi };
        process.Start();

        var stdoutTask = process.StandardOutput.ReadToEndAsync(ct);
        var stderrTask = process.StandardError.ReadToEndAsync(ct);

        try
        {
            await process.WaitForExitAsync(ct);
            var stdout = await stdoutTask;
            await stderrTask; // content unused, but the read must be observed
            return (process.ExitCode, stdout);
        }
        catch (OperationCanceledException)
        {
            TryKill(process);
            throw;
        }
    }

    /// <summary>Best-effort termination of a child process and its descendants.</summary>
    private static void TryKill(Process process)
    {
        try
        {
            if (!process.HasExited)
                process.Kill(entireProcessTree: true);
        }
        catch (InvalidOperationException) { /* already exited */ }
        catch (NotSupportedException) { /* best effort */ }
        catch (AggregateException) { /* part of the tree could not be terminated */ }
        catch (System.ComponentModel.Win32Exception) { /* could not be terminated */ }
    }

    /// <summary>
    /// Downloads subtitles for the given video.
    /// First attempts manual subs, then auto-generated if none found.
    /// Returns the cleaned transcript text and whether it was auto-generated.
    /// </summary>
    private static async Task<(string? Text, bool IsAuto)> DownloadSubtitlesWithYtDlpAsync(
        string videoId, CancellationToken ct)
    {
        var (text, _, isAuto) = await DownloadSubtitlesWithTimestampsAsync(videoId, ct);
        return (text, isAuto);
    }

    /// <summary>
    /// Downloads subtitles and returns both the plain text and timestamped segments.
    /// Works inside a fresh temporary directory that is removed afterwards (best effort).
    /// </summary>
    /// <returns>
    /// The transcript text (<c>null</c> when no captions were obtained), its segments,
    /// and whether the captions were auto-generated.
    /// </returns>
    private static async Task<(string? Text, List<TimestampedSegment> Segments, bool IsAuto)>
        DownloadSubtitlesWithTimestampsAsync(string videoId, CancellationToken ct)
    {
        // The directory name deliberately omits videoId: it is caller-supplied text
        // and must not be able to introduce path separators or invalid characters.
        var tempDir = Path.Combine(Path.GetTempPath(), $"ytsumm_{Guid.NewGuid():N}");
        Directory.CreateDirectory(tempDir);

        try
        {
            // Attempt 1: manual (human-written) subtitles only
            var (manualText, manualSegments) = await RunYtDlpSubtitleWithTimestampsAsync(
                videoId, tempDir, writeSub: true, writeAutoSub: false, ct);
            if (!string.IsNullOrWhiteSpace(manualText))
                return (manualText, manualSegments, false);

            // Attempt 2: auto-generated subtitles
            var (autoText, autoSegments) = await RunYtDlpSubtitleWithTimestampsAsync(
                videoId, tempDir, writeSub: false, writeAutoSub: true, ct);
            if (!string.IsNullOrWhiteSpace(autoText))
                return (autoText, autoSegments, true);

            return (null, new List<TimestampedSegment>(), false);
        }
        finally
        {
            try { Directory.Delete(tempDir, recursive: true); }
            catch (IOException) { /* best effort */ }
            catch (UnauthorizedAccessException) { /* best effort */ }
        }
    }

    /// <summary>True when the file has one of the subtitle extensions this class can parse.</summary>
    private static bool IsSubtitleFile(string path) => SubtitleFormatRank(path) < int.MaxValue;

    /// <summary>
    /// Preference order of subtitle formats: srv1 (XML), then VTT, then SRT.
    /// Returns <see cref="int.MaxValue"/> for any other extension.
    /// </summary>
    private static int SubtitleFormatRank(string path)
    {
        var extension = Path.GetExtension(path);
        if (extension.Equals(".srv1", StringComparison.OrdinalIgnoreCase)) return 0;
        if (extension.Equals(".vtt", StringComparison.OrdinalIgnoreCase)) return 1;
        if (extension.Equals(".srt", StringComparison.OrdinalIgnoreCase)) return 2;
        return int.MaxValue;
    }

    /// <summary>
    /// Runs a single yt-dlp invocation to download subtitles with timestamps.
    /// Returns the parsed plain-text transcript and timestamped segments, or
    /// <c>(null, empty)</c> when yt-dlp exits non-zero or produced no usable file.
    /// </summary>
    private static async Task<(string? Text, List<TimestampedSegment> Segments)>
        RunYtDlpSubtitleWithTimestampsAsync(
            string videoId,
            string tempDir,
            bool writeSub,
            bool writeAutoSub,
            CancellationToken ct)
    {
        // Clean any previous subtitle files from this temp dir so a file left by an
        // earlier attempt is never mistaken for the result of this one.
        foreach (var stale in Directory.EnumerateFiles(tempDir).Where(IsSubtitleFile).ToList())
            File.Delete(stale);

        var args = new List<string>
        {
            "--skip-download",
            "--sub-lang", "en,en-US,en-GB,en.*",
            "--sub-format", "srv1/vtt/best",
            "-o", Path.Combine(tempDir, "%(id)s"),
        };
        if (writeSub) args.Add("--write-sub");
        if (writeAutoSub) args.Add("--write-auto-sub");
        args.Add($"https://www.youtube.com/watch?v={videoId}");

        var (exitCode, _) = await RunYtDlpAsync(args, ct);
        if (exitCode != 0)
            return (null, new List<TimestampedSegment>());

        // Several language variants may have been written. Pick deterministically:
        // best format first, then the shortest file name, then ordinal order.
        // NOTE(review): "shortest name" assumes names of the form <id>.<lang>.<ext>,
        // so plain "en" wins over "en-US" — confirm against actual yt-dlp output.
        var best = Directory.EnumerateFiles(tempDir)
            .Where(IsSubtitleFile)
            .OrderBy(f => SubtitleFormatRank(f))
            .ThenBy(f => Path.GetFileName(f).Length)
            .ThenBy(f => f, StringComparer.Ordinal)
            .FirstOrDefault();

        if (best is null)
            return (null, new List<TimestampedSegment>());

        var content = await File.ReadAllTextAsync(best, ct);
        if (string.IsNullOrWhiteSpace(content))
            return (null, new List<TimestampedSegment>());

        return SubtitleFormatRank(best) == 0
            ? ParseTimedTextXmlWithTimestamps(content)
            : ParseVttOrSrtWithTimestamps(content);
    }

    /// <summary>
    /// Parses a seconds value written with "." as decimal separator.
    /// Returns <see cref="TimeSpan.Zero"/> for missing, malformed or out-of-range input.
    /// </summary>
    private static TimeSpan ParseSecondsOrZero(string? value) =>
        double.TryParse(value, NumberStyles.Float, CultureInfo.InvariantCulture, out var seconds)
        && double.IsFinite(seconds)
        && Math.Abs(seconds) < int.MaxValue
            ? TimeSpan.FromSeconds(seconds)
            : TimeSpan.Zero;

    /// <summary>
    /// Parses srv1 timed-text XML into plain text and timestamped segments.
    /// Each <c>&lt;text start=".." dur=".."&gt;</c> element becomes one segment;
    /// elements whose text is empty after cleaning are skipped.
    /// If the input is not well-formed XML, the markup is stripped and the remaining
    /// text is returned with no segments.
    /// </summary>
    private static (string Text, List<TimestampedSegment> Segments) ParseTimedTextXmlWithTimestamps(string xml)
    {
        try
        {
            var doc = System.Xml.Linq.XDocument.Parse(xml);
            var segments = new List<TimestampedSegment>();
            var textParts = new List<string>();

            foreach (var el in doc.Descendants("text"))
            {
                // The XML parser has already resolved one level of entities; decode
                // again to handle HTML entities that remain in the element text.
                var decoded = HttpUtility.HtmlDecode(el.Value);
                var cleaned = WhitespaceRegex.Replace(decoded, " ").Trim();
                if (cleaned.Length == 0)
                    continue;

                textParts.Add(cleaned);
                segments.Add(new TimestampedSegment
                {
                    Start = ParseSecondsOrZero(el.Attribute("start")?.Value),
                    Duration = ParseSecondsOrZero(el.Attribute("dur")?.Value),
                    Text = cleaned
                });
            }

            return (string.Join(" ", textParts), segments);
        }
        catch (System.Xml.XmlException)
        {
            // Salvage readable text rather than handing raw markup to the caller.
            var withoutTags = MarkupTagRegex.Replace(xml, " ");
            var text = WhitespaceRegex.Replace(HttpUtility.HtmlDecode(withoutTags), " ").Trim();
            return (text, new List<TimestampedSegment>());
        }
    }

    /// <summary>
    /// Parses srv1 timed-text XML into clean plain text.
    ///
    /// The XML structure looks like:
    /// <code>
    /// &lt;transcript&gt;
    ///   &lt;text start="0.5" dur="2.1"&gt;Hello world&lt;/text&gt;
    ///   ...
    /// &lt;/transcript&gt;
    /// </code>
    /// </summary>
    private static string ParseTimedTextXml(string xml)
    {
        var (text, _) = ParseTimedTextXmlWithTimestamps(xml);
        return text;
    }

    /// <summary>
    /// Parses VTT or SRT subtitle formats into plain text and timestamped segments.
    /// Strips header/NOTE/STYLE/REGION blocks, cue identifiers and formatting tags
    /// while preserving the timestamp of each cue. Consecutive segments with
    /// identical text are collapsed into the first one.
    /// </summary>
    private static (string Text, List<TimestampedSegment> Segments) ParseVttOrSrtWithTimestamps(string content)
    {
        var segments = new List<TimestampedSegment>();
        // Trim() also removes the '\r' of CRLF line endings.
        var lines = content.TrimStart('\uFEFF').Split('\n').Select(l => l.Trim()).ToArray();

        var cueStart = TimeSpan.Zero;
        var cueEnd = TimeSpan.Zero;
        var cueText = new List<string>();
        var inCue = false;          // between a timing line and the blank line ending the cue
        var inSkippedBlock = false; // inside a WEBVTT header or NOTE/STYLE/REGION block

        void Flush()
        {
            if (cueText.Count == 0)
                return;

            segments.Add(new TimestampedSegment
            {
                Start = cueStart,
                Duration = cueEnd - cueStart,
                Text = string.Join(" ", cueText)
            });
            cueText.Clear();
        }

        for (var i = 0; i < lines.Length; i++)
        {
            var line = lines[i];

            // A blank line terminates whatever block we are in.
            if (line.Length == 0)
            {
                Flush();
                inCue = false;
                inSkippedBlock = false;
                continue;
            }

            // Timing line: "00:01:23.456 --> 00:01:27.890" (VTT) or "00:01:23,456 --> …" (SRT)
            var timing = CueTimingRegex.Match(line);
            if (timing.Success)
            {
                Flush();
                cueStart = ParseVttTimestamp(timing.Groups[1].Value);
                cueEnd = ParseVttTimestamp(timing.Groups[2].Value);
                inCue = true;
                inSkippedBlock = false;
                continue;
            }

            if (inSkippedBlock)
                continue;

            // A cue identifier is the line directly before a timing line. Outside a cue
            // any such line qualifies; inside a cue (file without blank separators) only
            // a purely numeric one does, so genuine caption text is not discarded.
            var nextIsTiming = i + 1 < lines.Length && CueTimingRegex.IsMatch(lines[i + 1]);
            if (nextIsTiming && (!inCue || line.All(c => c is >= '0' and <= '9')))
                continue;

            if (!inCue)
            {
                if (IsSkippedBlockStart(line))
                {
                    inSkippedBlock = true;
                    continue;
                }

                if (line.StartsWith("Kind:", StringComparison.Ordinal)
                    || line.StartsWith("Language:", StringComparison.Ordinal))
                {
                    continue;
                }
            }

            // Content line — strip markup tags and decode entities
            var stripped = MarkupTagRegex.Replace(line, "");
            var decoded = HttpUtility.HtmlDecode(stripped).Trim();
            if (decoded.Length > 0)
                cueText.Add(decoded);
        }

        // Flush last segment
        Flush();

        // Deduplicate consecutive identical text segments (common in VTT)
        var deduped = new List<TimestampedSegment>();
        string? prevText = null;
        foreach (var seg in segments)
        {
            if (seg.Text != prevText)
                deduped.Add(seg);
            prevText = seg.Text;
        }

        var plainText = string.Join(" ", deduped.Select(s => s.Text));
        return (plainText, deduped);
    }

    /// <summary>
    /// True when <paramref name="line"/> opens a block whose lines are not caption
    /// text: the <c>WEBVTT</c> header, or a <c>NOTE</c>, <c>STYLE</c> or <c>REGION</c> block.
    /// </summary>
    private static bool IsSkippedBlockStart(string line) =>
        line.StartsWith("WEBVTT", StringComparison.Ordinal)
        || line is "NOTE" or "STYLE" or "REGION"
        || line.StartsWith("NOTE ", StringComparison.Ordinal)
        || line.StartsWith("NOTE\t", StringComparison.Ordinal);

    /// <summary>
    /// Parses a VTT/SRT timestamp string (<c>HH:MM:SS.mmm</c> or <c>MM:SS.mmm</c>,
    /// with "." or "," before the fraction) into a <see cref="TimeSpan"/>.
    /// A component that cannot be parsed counts as zero; any other shape yields
    /// <see cref="TimeSpan.Zero"/>. The result has millisecond precision.
    /// </summary>
    private static TimeSpan ParseVttTimestamp(string ts)
    {
        // Normalize: VTT uses "." for ms, SRT uses "," — handle both
        var parts = ts.Replace(',', '.').Split(':');
        if (parts.Length is not (2 or 3))
            return TimeSpan.Zero;

        var hours = 0;
        if (parts.Length == 3)
            int.TryParse(parts[0], NumberStyles.Integer, CultureInfo.InvariantCulture, out hours);

        int.TryParse(parts[^2], NumberStyles.Integer, CultureInfo.InvariantCulture, out var minutes);
        double.TryParse(parts[^1], NumberStyles.Float, CultureInfo.InvariantCulture, out var seconds);
        if (!double.IsFinite(seconds))
            seconds = 0;

        // Round instead of truncating: 23.456 * 1000 can evaluate to 23455.999…,
        // which truncation would turn into 455 ms.
        var milliseconds = (long)Math.Round(seconds * 1000.0);
        return new TimeSpan(hours, minutes, 0) + TimeSpan.FromTicks(milliseconds * TimeSpan.TicksPerMillisecond);
    }

    /// <summary>
    /// Parses VTT or SRT subtitle formats into clean plain text.
    /// Strips timestamps, cue identifiers, and formatting tags.
    /// </summary>
    private static string ParseVttOrSrt(string content)
    {
        var (text, _) = ParseVttOrSrtWithTimestamps(content);
        return text;
    }

    /// <summary>
    /// When no captions exist, builds a minimal "transcript" from the video description
    /// (or a placeholder sentence naming the title when the description is empty).
    /// The result is marked <see cref="TranscriptSource.MetadataOnly"/> and has no source track.
    /// </summary>
    private static VideoTranscript BuildMetadataOnlyTranscript(VideoMetadata metadata)
    {
        var text = string.IsNullOrWhiteSpace(metadata.Description)
            ? $"No transcript or description available for: {metadata.Title}"
            : $"Video title: {metadata.Title}\n\nChannel: {metadata.ChannelTitle}\n\nDescription:\n{metadata.Description}";

        return new VideoTranscript
        {
            VideoId = metadata.VideoId,
            Text = text,
            SourceTrack = null,
            Source = TranscriptSource.MetadataOnly
        };
    }
}