using System.Diagnostics;
using System.Text.Json;
using YoutubeSummarizer.Models;
namespace YoutubeSummarizer.Services;
/// <summary>
/// Uses yt-dlp (https://github.com/yt-dlp/yt-dlp) to retrieve video metadata
/// and download caption tracks. No YouTube API key required.
/// </summary>
/// <remarks>
/// yt-dlp is the de-facto standard tool for reliably extracting video
/// information and subtitles from YouTube. It must be installed and
/// available on PATH (e.g. <c>pip install yt-dlp</c>).
/// </remarks>
public sealed class YouTubeService
{
// HTTP client handed in at construction time.
// NOTE(review): no member visible in this file reads this field — confirm whether it is still needed.
private readonly HttpClient _httpClient;

/// <summary>
/// Creates the service and stores the supplied <see cref="HttpClient"/>.
/// </summary>
/// <param name="httpClient">HTTP client kept for the lifetime of this instance.</param>
public YouTubeService(HttpClient httpClient) => _httpClient = httpClient;
// ─────────────────────────────────────────────────────────────────────────
// Public API
// ─────────────────────────────────────────────────────────────────────────
/// <summary>
/// Parses a YouTube video ID from any common URL format, or from a bare ID.
/// Handles: watch?v=, youtu.be/, /embed/, /shorts/, /v/, /live/
/// </summary>
/// <param name="url">
/// A YouTube URL or a raw 11-character video ID. Surrounding whitespace is ignored.
/// </param>
/// <returns>
/// The 11-character video ID, or <c>null</c> when the input is null/blank or no
/// well-formed ID can be found in it.
/// </returns>
/// <remarks>
/// Every candidate is validated (exactly 11 characters from ASCII letters, digits,
/// '-' and '_') before it is returned, so a malformed URL yields <c>null</c> rather
/// than an arbitrary path or query fragment.
/// </remarks>
public static string? ExtractVideoId(string url)
{
    if (string.IsNullOrWhiteSpace(url))
        return null;

    // Normalize — strip whitespace the user may have pasted
    url = url.Trim();

    if (Uri.TryCreate(url, UriKind.Absolute, out var uri))
    {
        var segments = uri.AbsolutePath.Split('/', StringSplitOptions.RemoveEmptyEntries);

        // youtu.be short links: https://youtu.be/VIDEO_ID
        // Match the host exactly (or as a subdomain) instead of by substring, so that
        // hosts which merely contain "youtu.be" are not treated as short links.
        var isShortLink =
            uri.Host.Equals("youtu.be", StringComparison.OrdinalIgnoreCase) ||
            uri.Host.EndsWith(".youtu.be", StringComparison.OrdinalIgnoreCase);
        if (isShortLink && segments.Length > 0 && IsVideoId(segments[0]))
            return segments[0];

        // Standard watch URLs: ?v=VIDEO_ID
        var query = System.Web.HttpUtility.ParseQueryString(uri.Query);
        if (query["v"] is { } vParam && IsVideoId(vParam))
            return vParam;

        // Path-based URLs: /embed/VIDEO_ID, /shorts/VIDEO_ID, /v/VIDEO_ID, /live/VIDEO_ID
        for (var i = 0; i < segments.Length - 1; i++)
        {
            if ((segments[i] is "embed" or "shorts" or "v" or "live") && IsVideoId(segments[i + 1]))
                return segments[i + 1];
        }
    }

    // Raw ID passed directly
    return IsVideoId(url) ? url : null;

    // A video ID is exactly 11 characters drawn from [A-Za-z0-9_-].
    static bool IsVideoId(string candidate) =>
        candidate.Length == 11 &&
        candidate.All(c => c is (>= 'A' and <= 'Z') or (>= 'a' and <= 'z') or (>= '0' and <= '9') or '-' or '_');
}
/// <summary>
/// Fetches metadata for a video by running <c>yt-dlp --dump-json --no-download</c>
/// and parsing the JSON it prints on standard output. No API key is involved.
/// </summary>
/// <param name="videoId">The video ID; it is URL-escaped and appended to a watch URL.</param>
/// <param name="ct">
/// Cancels the operation. When cancellation is observed the yt-dlp process is killed
/// (best effort) before the exception propagates.
/// </param>
/// <returns>
/// The parsed <see cref="VideoMetadata"/>, or <c>null</c> when yt-dlp exits with a
/// non-zero code, prints nothing, or prints output that cannot be parsed.
/// </returns>
/// <exception cref="OperationCanceledException"><paramref name="ct"/> was cancelled.</exception>
/// <exception cref="System.ComponentModel.Win32Exception">
/// The yt-dlp executable could not be started (for example, it is not on PATH).
/// </exception>
public async Task<VideoMetadata?> GetVideoMetadataAsync(string videoId, CancellationToken ct = default)
{
    var psi = new ProcessStartInfo
    {
        FileName = "yt-dlp",
        RedirectStandardOutput = true,
        RedirectStandardError = true,
        UseShellExecute = false,
        CreateNoWindow = true,
    };
    psi.ArgumentList.Add("--dump-json");
    psi.ArgumentList.Add("--no-download");
    psi.ArgumentList.Add($"https://www.youtube.com/watch?v={Uri.EscapeDataString(videoId)}");

    using var process = new Process { StartInfo = psi };
    process.Start();

    string json;
    try
    {
        // Start draining both pipes before waiting so a full pipe buffer cannot block yt-dlp.
        var stdoutTask = process.StandardOutput.ReadToEndAsync(ct);
        var stderrTask = process.StandardError.ReadToEndAsync(ct);
        await process.WaitForExitAsync(ct);
        json = await stdoutTask;
        await stderrTask; // observed so it never ends up as an unobserved faulted task
    }
    catch (OperationCanceledException)
    {
        // Do not leave yt-dlp running after the caller has given up.
        try
        {
            process.Kill(entireProcessTree: true);
        }
        catch (Exception killError) when (killError is InvalidOperationException
                                              or NotSupportedException
                                              or AggregateException
                                              or System.ComponentModel.Win32Exception)
        {
            // Best effort: the process may already have exited.
        }
        throw;
    }

    if (process.ExitCode != 0 || string.IsNullOrWhiteSpace(json))
        return null;

    try
    {
        using var doc = JsonDocument.Parse(json);
        var root = doc.RootElement;
        if (root.ValueKind != JsonValueKind.Object)
            return null;

        var title = ReadString(root, "title") ?? "(no title)";
        var channel = ReadString(root, "channel") ?? "(unknown channel)";
        var description = ReadString(root, "description");

        // "duration" is read as a number of seconds and re-encoded as an
        // ISO 8601-style duration string for FormattedDuration compatibility.
        string? isoDuration = null;
        if (root.TryGetProperty("duration", out var dur) && dur.ValueKind == JsonValueKind.Number)
        {
            var ts = TimeSpan.FromSeconds(dur.GetDouble());
            isoDuration = $"PT{(int)ts.TotalHours}H{ts.Minutes}M{ts.Seconds}S";
        }

        // "upload_date" is parsed as "yyyyMMdd". The invariant culture pins the Gregorian
        // calendar; a null provider would use the current culture's calendar instead.
        var publishedAt = DateTimeOffset.MinValue;
        if (ReadString(root, "upload_date") is { } dateStr
            && DateTime.TryParseExact(dateStr, "yyyyMMdd",
                   System.Globalization.CultureInfo.InvariantCulture,
                   System.Globalization.DateTimeStyles.None, out var parsed))
        {
            publishedAt = new DateTimeOffset(parsed, TimeSpan.Zero);
        }

        return new VideoMetadata
        {
            VideoId = videoId,
            Title = title,
            ChannelTitle = channel,
            PublishedAt = publishedAt,
            Duration = isoDuration,
            Description = description
        };
    }
    catch (Exception ex) when (ex is JsonException
                                   or InvalidOperationException
                                   or OverflowException
                                   or ArgumentException)
    {
        // Malformed JSON or an out-of-range value: treated the same as "no metadata".
        return null;
    }

    // Returns the property's value only when it is present AND a JSON string, so a
    // field of an unexpected type degrades to "missing" instead of throwing.
    static string? ReadString(JsonElement obj, string name) =>
        obj.TryGetProperty(name, out var value) && value.ValueKind == JsonValueKind.String
            ? value.GetString()
            : null;
}
/// <summary>
/// Retrieves the best available transcript for the video using yt-dlp.
/// </summary>
/// <remarks>
/// Subtitles are requested through <c>DownloadSubtitlesWithTimestampsAsync</c>, which tries
/// manual captions first and auto-generated captions second. When that yields no text,
/// the result of <c>BuildMetadataOnlyTranscript</c> is returned instead.
/// </remarks>
/// <param name="metadata">Metadata of the video; its <c>VideoId</c> selects the video.</param>
/// <param name="ct">Cancellation token forwarded to the subtitle download.</param>
/// <returns>A caption-based transcript when subtitle text was obtained, otherwise a metadata-only one.</returns>
/// <exception cref="ArgumentNullException"><paramref name="metadata"/> is <c>null</c>.</exception>
public async Task<VideoTranscript> GetTranscriptAsync(
    VideoMetadata metadata,
    CancellationToken ct = default)
{
    ArgumentNullException.ThrowIfNull(metadata);

    var (text, segments, isAuto) = await DownloadSubtitlesWithTimestampsAsync(metadata.VideoId, ct);

    // No captions at all — fall back to the description text
    if (string.IsNullOrWhiteSpace(text))
        return BuildMetadataOnlyTranscript(metadata);

    return new VideoTranscript
    {
        VideoId = metadata.VideoId,
        Text = text,
        Segments = segments,
        SourceTrack = new CaptionTrack
        {
            TrackId = "yt-dlp",
            Language = "en",
            TrackKind = isAuto ? "asr" : "standard",
            Name = isAuto ? "Auto-generated (en)" : "English"
        },
        Source = isAuto
            ? TranscriptSource.AutoGenerated
            : TranscriptSource.OwnerPublished
    };
}
// ─────────────────────────────────────────────────────────────────────────
// Private helpers
// ─────────────────────────────────────────────────────────────────────────
/// <summary>
/// Convenience wrapper around <c>DownloadSubtitlesWithTimestampsAsync</c> that discards the
/// timestamped segments and returns only the transcript text and the auto-generated flag.
/// </summary>
/// <param name="videoId">ID of the video whose subtitles are requested.</param>
/// <param name="ct">Cancellation token forwarded to the download.</param>
private static async Task<(string? Text, bool IsAuto)> DownloadSubtitlesWithYtDlpAsync(
    string videoId,
    CancellationToken ct)
{
    var download = await DownloadSubtitlesWithTimestampsAsync(videoId, ct);
    return (download.Text, download.IsAuto);
}
/// <summary>
/// Downloads subtitles into a throw-away temp directory and returns both the plain text
/// and the timestamped segments.
/// </summary>
/// <remarks>
/// Two attempts are made in order: manual subtitles only, then auto-generated subtitles.
/// The temp directory is removed (best effort) before the method returns or throws.
/// </remarks>
/// <param name="videoId">ID of the video whose subtitles are requested.</param>
/// <param name="ct">Cancellation token forwarded to each yt-dlp run.</param>
/// <returns>
/// The transcript text, its segments, and whether it came from the auto-generated attempt.
/// <c>Text</c> is <c>null</c> (with an empty segment list) when neither attempt produced text.
/// </returns>
private static async Task<(string? Text, List<TimestampedSegment> Segments, bool IsAuto)> DownloadSubtitlesWithTimestampsAsync(
    string videoId,
    CancellationToken ct)
{
    // Only filename-safe characters of the ID go into the directory name, so the ID can
    // never contribute path separators or ".." to a path that is later deleted recursively.
    // The GUID alone guarantees uniqueness; the ID is kept purely as a debugging aid.
    var safeId = new string(videoId
        .Where(c => c is (>= 'A' and <= 'Z') or (>= 'a' and <= 'z') or (>= '0' and <= '9') or '-' or '_')
        .ToArray());
    var tempDir = Path.Combine(Path.GetTempPath(), $"ytsumm_{safeId}_{Guid.NewGuid():N}");
    Directory.CreateDirectory(tempDir);

    try
    {
        // Attempt 1: manual (human-written) subtitles only
        var (manualText, manualSegments) = await RunYtDlpSubtitleWithTimestampsAsync(
            videoId, tempDir, writeSub: true, writeAutoSub: false, ct);
        if (!string.IsNullOrWhiteSpace(manualText))
            return (manualText, manualSegments, false);

        // Attempt 2: auto-generated subtitles
        var (autoText, autoSegments) = await RunYtDlpSubtitleWithTimestampsAsync(
            videoId, tempDir, writeSub: false, writeAutoSub: true, ct);
        if (!string.IsNullOrWhiteSpace(autoText))
            return (autoText, autoSegments, true);

        return (null, new List<TimestampedSegment>(), false);
    }
    finally
    {
        try
        {
            Directory.Delete(tempDir, recursive: true);
        }
        catch (Exception ex) when (ex is IOException or UnauthorizedAccessException)
        {
            // Best effort: a leftover temp directory must not mask the real outcome.
        }
    }
}
/// <summary>
/// Runs a single yt-dlp invocation that writes subtitle files into <paramref name="tempDir"/>,
/// then parses one of them into plain text and timestamped segments.
/// </summary>
/// <param name="videoId">The video ID; it is URL-escaped and appended to a watch URL.</param>
/// <param name="tempDir">Existing directory that receives the subtitle files.</param>
/// <param name="writeSub">When true, passes <c>--write-sub</c>.</param>
/// <param name="writeAutoSub">When true, passes <c>--write-auto-sub</c>.</param>
/// <param name="ct">
/// Cancels the operation. When cancellation is observed the yt-dlp process is killed
/// (best effort) before the exception propagates.
/// </param>
/// <returns>
/// The parsed transcript, or a <c>null</c> text with an empty segment list when yt-dlp exits
/// with a non-zero code, writes no .srv1/.vtt/.srt file, or the chosen file is blank.
/// </returns>
private static async Task<(string? Text, List<TimestampedSegment> Segments)> RunYtDlpSubtitleWithTimestampsAsync(
    string videoId,
    string tempDir,
    bool writeSub,
    bool writeAutoSub,
    CancellationToken ct)
{
    // Remove subtitle files left by a previous attempt in this directory. This covers every
    // extension the selection step below accepts, so a stale file can never be picked up.
    foreach (var stale in Directory.GetFiles(tempDir).Where(IsSubtitleFile))
        File.Delete(stale);

    var psi = new ProcessStartInfo
    {
        FileName = "yt-dlp",
        RedirectStandardOutput = true,
        RedirectStandardError = true,
        UseShellExecute = false,
        CreateNoWindow = true,
    };

    var args = new List<string>
    {
        "--skip-download",
        "--sub-lang", "en,en-US,en-GB,en.*",
        "--sub-format", "srv1/vtt/best",
        "-o", Path.Combine(tempDir, "%(id)s"),
    };
    if (writeSub)
        args.Add("--write-sub");
    if (writeAutoSub)
        args.Add("--write-auto-sub");
    args.Add($"https://www.youtube.com/watch?v={Uri.EscapeDataString(videoId)}");

    foreach (var arg in args)
        psi.ArgumentList.Add(arg);

    using var process = new Process { StartInfo = psi };
    process.Start();

    try
    {
        // Drain both pipes while waiting so a full pipe buffer cannot block yt-dlp.
        var stdoutTask = process.StandardOutput.ReadToEndAsync(ct);
        var stderrTask = process.StandardError.ReadToEndAsync(ct);
        await process.WaitForExitAsync(ct);
        await Task.WhenAll(stdoutTask, stderrTask);
    }
    catch (OperationCanceledException)
    {
        // Do not leave yt-dlp running (and writing into tempDir) after the caller gave up.
        try
        {
            process.Kill(entireProcessTree: true);
        }
        catch (Exception killError) when (killError is InvalidOperationException
                                              or NotSupportedException
                                              or AggregateException
                                              or System.ComponentModel.Win32Exception)
        {
            // Best effort: the process may already have exited.
        }
        throw;
    }

    if (process.ExitCode != 0)
        return (null, new List<TimestampedSegment>());

    // Prefer srv1; among equals prefer the shortest file name, then ordinal order,
    // so the choice does not depend on the unspecified order of Directory.GetFiles.
    var chosen = Directory.GetFiles(tempDir)
        .Where(IsSubtitleFile)
        .OrderBy(f => IsSrv1(f) ? 0 : 1)
        .ThenBy(f => f.Length)
        .ThenBy(f => f, StringComparer.Ordinal)
        .FirstOrDefault();
    if (chosen is null)
        return (null, new List<TimestampedSegment>());

    var content = await File.ReadAllTextAsync(chosen, ct);
    if (string.IsNullOrWhiteSpace(content))
        return (null, new List<TimestampedSegment>());

    return IsSrv1(chosen)
        ? ParseTimedTextXmlWithTimestamps(content)
        : ParseVttOrSrtWithTimestamps(content);

    static bool IsSrv1(string path) =>
        path.EndsWith(".srv1", StringComparison.OrdinalIgnoreCase);

    static bool IsSubtitleFile(string path) =>
        IsSrv1(path)
        || path.EndsWith(".vtt", StringComparison.OrdinalIgnoreCase)
        || path.EndsWith(".srt", StringComparison.OrdinalIgnoreCase);
}
/// <summary>
/// Parses srv1 timed-text XML into plain text and timestamped segments.
/// </summary>
/// <remarks>
/// Every <c>text</c> element is HTML-decoded and whitespace-collapsed; elements that end up
/// empty are skipped. The <c>start</c> and <c>dur</c> attributes are read as seconds using the
/// invariant culture. A missing, malformed, non-finite or out-of-range value becomes
/// <see cref="TimeSpan.Zero"/> for that segment only, instead of discarding the whole parse.
/// </remarks>
/// <param name="xml">The subtitle file content.</param>
/// <returns>
/// The segment texts joined by single spaces, plus the segments themselves. When
/// <paramref name="xml"/> is not well-formed XML, the input is returned unchanged as the
/// text together with an empty segment list.
/// </returns>
private static (string Text, List<TimestampedSegment> Segments) ParseTimedTextXmlWithTimestamps(string xml)
{
    System.Xml.Linq.XDocument doc;
    try
    {
        doc = System.Xml.Linq.XDocument.Parse(xml);
    }
    catch (System.Xml.XmlException)
    {
        // NOTE(review): the raw content is handed back as the transcript text here;
        // confirm callers really want unparsed markup rather than an empty result.
        return (xml, new List<TimestampedSegment>());
    }

    var segments = new List<TimestampedSegment>();
    var textParts = new List<string>();

    foreach (var el in doc.Descendants("text"))
    {
        var decoded = System.Web.HttpUtility.HtmlDecode(el.Value);
        var cleaned = System.Text.RegularExpressions.Regex.Replace(decoded, @"\s+", " ").Trim();
        if (string.IsNullOrEmpty(cleaned))
            continue;

        textParts.Add(cleaned);
        segments.Add(new TimestampedSegment
        {
            Start = ParseSeconds(el.Attribute("start")?.Value),
            Duration = ParseSeconds(el.Attribute("dur")?.Value),
            Text = cleaned
        });
    }

    return (string.Join(" ", textParts), segments);

    // Converts a seconds attribute to a TimeSpan; anything unusable maps to zero.
    static TimeSpan ParseSeconds(string? raw)
    {
        if (!double.TryParse(raw, System.Globalization.NumberStyles.Float,
                System.Globalization.CultureInfo.InvariantCulture, out var seconds))
            return TimeSpan.Zero;

        // TimeSpan.FromSeconds throws for NaN and for values outside its range.
        if (!double.IsFinite(seconds) || Math.Abs(seconds) >= TimeSpan.MaxValue.TotalSeconds)
            return TimeSpan.Zero;

        return TimeSpan.FromSeconds(seconds);
    }
}
/// <summary>
/// Parses srv1 timed-text XML into plain text by delegating to
/// <c>ParseTimedTextXmlWithTimestamps</c> and keeping only its text component.
/// </summary>
/// <remarks>
/// The XML structure looks like:
/// <code>
/// &lt;transcript&gt;
///   &lt;text start="0.5" dur="2.1"&gt;Hello world&lt;/text&gt;
///   ...
/// &lt;/transcript&gt;
/// </code>
/// </remarks>
/// <param name="xml">The subtitle file content.</param>
private static string ParseTimedTextXml(string xml) =>
    ParseTimedTextXmlWithTimestamps(xml).Text;
/// <summary>
/// Parses VTT or SRT subtitle content into plain text and timestamped segments.
/// </summary>
/// <remarks>
/// <para>
/// A timing line ("00:01:23.456 --&gt; 00:01:27.890", or with a comma before the
/// milliseconds as SRT writes it) opens a cue; a blank line closes it. The lines in between
/// are the cue text, with markup tags removed and HTML entities decoded.
/// </para>
/// <para>
/// Outside a cue, lines starting with WEBVTT, NOTE, Kind: or Language:, and lines made only
/// of digits (cue numbers), are skipped. Inside a cue such lines are kept as caption text.
/// </para>
/// <para>
/// Consecutive segments with identical text are collapsed to the first occurrence.
/// </para>
/// </remarks>
/// <param name="content">The subtitle file content.</param>
/// <returns>The de-duplicated segment texts joined by single spaces, plus those segments.</returns>
private static (string Text, List<TimestampedSegment> Segments) ParseVttOrSrtWithTimestamps(string content)
{
    var segments = new List<TimestampedSegment>();
    var cueLines = new List<string>();
    var cueStart = TimeSpan.Zero;
    var cueEnd = TimeSpan.Zero;
    var insideCue = false; // true between a timing line and the next blank line

    foreach (var rawLine in content.Split('\n'))
    {
        var line = rawLine.Trim();

        // Blank line: end of the current cue
        if (line.Length == 0)
        {
            FlushCue();
            insideCue = false;
            continue;
        }

        // Timing line. The seconds field accepts both '.' (VTT) and ',' (SRT).
        var timing = System.Text.RegularExpressions.Regex.Match(line,
            @"^(\d{2}:\d{2}[:\.][\d\.,]+)\s*-->\s*(\d{2}:\d{2}[:\.][\d\.,]+)");
        if (timing.Success)
        {
            // Also closes a cue that was not followed by a blank line.
            FlushCue();
            cueStart = ParseVttTimestamp(timing.Groups[1].Value);
            cueEnd = ParseVttTimestamp(timing.Groups[2].Value);
            insideCue = true;
            continue;
        }

        // Headers, NOTE lines and cue numbers only exist outside a cue.
        if (!insideCue && IsHeaderOrCueNumber(line))
            continue;

        // Content line — strip markup tags and decode entities
        var stripped = System.Text.RegularExpressions.Regex.Replace(line, @"<[^>]+>", "");
        var decoded = System.Web.HttpUtility.HtmlDecode(stripped).Trim();
        if (!string.IsNullOrEmpty(decoded))
            cueLines.Add(decoded);
    }

    // Flush last segment
    FlushCue();

    // Deduplicate consecutive identical text segments
    var deduped = new List<TimestampedSegment>();
    string? prevText = null;
    foreach (var seg in segments)
    {
        if (seg.Text != prevText)
            deduped.Add(seg);
        prevText = seg.Text;
    }

    return (string.Join(" ", deduped.Select(s => s.Text)), deduped);

    // Emits the buffered lines as one segment (if any) and clears the buffer.
    void FlushCue()
    {
        if (cueLines.Count == 0)
            return;

        segments.Add(new TimestampedSegment
        {
            Start = cueStart,
            Duration = cueEnd - cueStart,
            Text = string.Join(" ", cueLines)
        });
        cueLines.Clear();
    }

    static bool IsHeaderOrCueNumber(string candidate) =>
        candidate.StartsWith("WEBVTT", StringComparison.Ordinal) ||
        candidate.StartsWith("NOTE", StringComparison.Ordinal) ||
        candidate.StartsWith("Kind:", StringComparison.Ordinal) ||
        candidate.StartsWith("Language:", StringComparison.Ordinal) ||
        System.Text.RegularExpressions.Regex.IsMatch(candidate, @"^\d+$");
}
/// <summary>
/// Parses a VTT/SRT timestamp string into a <see cref="TimeSpan"/>.
/// </summary>
/// <remarks>
/// Accepts <c>HH:MM:SS.mmm</c> and <c>MM:SS.mmm</c>, with either '.' or ',' before the
/// fractional part. The fraction is read digit-by-digit (first three digits, right-padded
/// with zeros) rather than through floating-point arithmetic, so ".001" is exactly 1 ms.
/// Any component that is not made of plain digits counts as zero.
/// </remarks>
/// <param name="ts">The timestamp text.</param>
/// <returns>
/// The parsed time, or <see cref="TimeSpan.Zero"/> when the input is null/blank, does not
/// have two or three colon-separated fields, or is outside the <see cref="TimeSpan"/> range.
/// </returns>
private static TimeSpan ParseVttTimestamp(string ts)
{
    if (string.IsNullOrWhiteSpace(ts))
        return TimeSpan.Zero;

    var invariant = System.Globalization.CultureInfo.InvariantCulture;
    const System.Globalization.NumberStyles digitsOnly = System.Globalization.NumberStyles.None;

    // Normalize: VTT uses "." for ms, SRT uses "," — handle both
    var parts = ts.Trim().Replace(',', '.').Split(':');
    if (parts.Length is < 2 or > 3)
        return TimeSpan.Zero;

    // Hours are only present in the three-field form.
    var hours = 0;
    if (parts.Length == 3)
        int.TryParse(parts[0], digitsOnly, invariant, out hours);
    int.TryParse(parts[^2], digitsOnly, invariant, out var minutes);

    // Split the last field into whole seconds and the fractional digits.
    var secondsField = parts[^1];
    var dot = secondsField.IndexOf('.');
    var wholeText = dot < 0 ? secondsField : secondsField[..dot];
    var fractionText = dot < 0 ? string.Empty : secondsField[(dot + 1)..];
    int.TryParse(wholeText, digitsOnly, invariant, out var seconds);

    // Milliseconds = first three fractional digits ("5" -> 500, "4567" -> 456).
    var millisText = fractionText.Length >= 3 ? fractionText[..3] : fractionText.PadRight(3, '0');
    int.TryParse(millisText, digitsOnly, invariant, out var milliseconds);

    try
    {
        return new TimeSpan(0, hours, minutes, seconds, milliseconds);
    }
    catch (ArgumentOutOfRangeException)
    {
        return TimeSpan.Zero;
    }
}
/// <summary>
/// Parses VTT or SRT subtitle content into plain text by delegating to
/// <c>ParseVttOrSrtWithTimestamps</c> and keeping only its text component.
/// </summary>
/// <param name="content">The subtitle file content.</param>
private static string ParseVttOrSrt(string content) =>
    ParseVttOrSrtWithTimestamps(content).Text;
/// <summary>
/// Builds a stand-in transcript from the video's metadata for use when no captions exist.
/// </summary>
/// <remarks>
/// With a non-blank description the text contains the title, channel and description;
/// otherwise it is a one-line notice naming the title. The result carries no source track
/// and is marked <c>TranscriptSource.MetadataOnly</c>.
/// NOTE(review): <c>Segments</c> is not assigned here, unlike the caption-based transcript
/// built in <c>GetTranscriptAsync</c> — confirm the model's default is acceptable to consumers.
/// </remarks>
/// <param name="metadata">Metadata supplying the ID, title, channel and description.</param>
private static VideoTranscript BuildMetadataOnlyTranscript(VideoMetadata metadata)
{
    string fallbackText;
    if (string.IsNullOrWhiteSpace(metadata.Description))
    {
        fallbackText = $"No transcript or description available for: {metadata.Title}";
    }
    else
    {
        fallbackText = $"Video title: {metadata.Title}\n\nChannel: {metadata.ChannelTitle}\n\nDescription:\n{metadata.Description}";
    }

    var transcript = new VideoTranscript
    {
        VideoId = metadata.VideoId,
        Text = fallbackText,
        SourceTrack = null,
        Source = TranscriptSource.MetadataOnly
    };
    return transcript;
}
}