519 lines
19 KiB
C#
519 lines
19 KiB
C#
using System.Diagnostics;
|
|
using System.Text.Json;
|
|
using YoutubeSummarizer.Models;
|
|
|
|
namespace YoutubeSummarizer.Services;
|
|
|
|
/// <summary>
|
|
/// Uses yt-dlp (https://github.com/yt-dlp/yt-dlp) to retrieve video metadata
|
|
/// and download caption tracks. No YouTube API key required.
|
|
///
|
|
/// yt-dlp is the de-facto standard tool for reliably extracting video
|
|
/// information and subtitles from YouTube. It must be installed and
|
|
/// available on PATH (e.g. pip install yt-dlp).
|
|
/// </summary>
|
|
public sealed class YouTubeService
|
|
{
|
|
private readonly HttpClient _httpClient;
|
|
|
|
/// <summary>Creates the service and stores the supplied client in <c>_httpClient</c>.</summary>
/// <param name="httpClient">The HTTP client instance to keep on this service.</param>
public YouTubeService(HttpClient httpClient) => _httpClient = httpClient;
|
|
|
|
// ─────────────────────────────────────────────────────────────────────────
|
|
// Public API
|
|
// ─────────────────────────────────────────────────────────────────────────
|
|
|
|
/// <summary>
/// Parses a YouTube video ID from any common URL format.
/// Handles: watch?v=, youtu.be/, /embed/, /shorts/, /v/, /live/, and a bare 11-character ID.
/// </summary>
/// <param name="url">A video URL or a raw video ID; surrounding whitespace is ignored.</param>
/// <returns>
/// The 11-character video ID, or <c>null</c> when <paramref name="url"/> is null, blank,
/// or does not contain a well-formed ID.
/// </returns>
public static string? ExtractVideoId(string url)
{
    if (string.IsNullOrWhiteSpace(url))
        return null;

    // Normalize — strip whitespace the user may have pasted
    url = url.Trim();

    // Every candidate goes through the same check: exactly 11 ASCII letters, digits,
    // '-' or '_'. (\w would also accept non-ASCII letters, so it is not used here.)
    static bool IsVideoId(string? candidate) =>
        candidate is not null &&
        System.Text.RegularExpressions.Regex.IsMatch(candidate, @"\A[A-Za-z0-9_-]{11}\z");

    if (Uri.TryCreate(url, UriKind.Absolute, out var uri))
    {
        // AbsolutePath never contains the query string, so the segments need no further trimming.
        var segments = uri.AbsolutePath.Split('/', StringSplitOptions.RemoveEmptyEntries);

        // youtu.be short links: https://youtu.be/VIDEO_ID
        // Match the host exactly (or as a subdomain) so "youtu.be.example.com" is not accepted.
        var isShortHost =
            uri.Host.Equals("youtu.be", StringComparison.OrdinalIgnoreCase) ||
            uri.Host.EndsWith(".youtu.be", StringComparison.OrdinalIgnoreCase);
        if (isShortHost && segments.Length > 0 && IsVideoId(segments[0]))
            return segments[0];

        // Standard watch URLs: ?v=VIDEO_ID
        var query = System.Web.HttpUtility.ParseQueryString(uri.Query);
        if (query["v"] is { } vParam && IsVideoId(vParam))
            return vParam;

        // Path-based URLs: /embed/VIDEO_ID, /shorts/VIDEO_ID, /v/VIDEO_ID, /live/VIDEO_ID
        for (int i = 0; i < segments.Length - 1; i++)
        {
            if ((segments[i] is "embed" or "shorts" or "v" or "live") && IsVideoId(segments[i + 1]))
                return segments[i + 1];
        }
    }

    // Raw ID passed directly
    return IsVideoId(url) ? url : null;
}
|
|
|
|
/// <summary>
/// Fetches metadata for a video using yt-dlp --dump-json.
/// No API key required — yt-dlp scrapes the public video page.
/// </summary>
/// <param name="videoId">The video ID appended to the watch URL passed to yt-dlp.</param>
/// <param name="ct">Cancels the wait for yt-dlp; the child process is killed when it fires.</param>
/// <returns>
/// The parsed metadata, or <c>null</c> when yt-dlp exits with a non-zero code, prints nothing,
/// or prints output that cannot be read as a JSON object.
/// </returns>
/// <exception cref="OperationCanceledException">Thrown when <paramref name="ct"/> is cancelled.</exception>
public async Task<VideoMetadata?> GetVideoMetadataAsync(string videoId, CancellationToken ct = default)
{
    var psi = new ProcessStartInfo
    {
        FileName = "yt-dlp",
        RedirectStandardOutput = true,
        RedirectStandardError = true,
        UseShellExecute = false,
        CreateNoWindow = true,
    };
    psi.ArgumentList.Add("--dump-json");
    psi.ArgumentList.Add("--no-download");
    psi.ArgumentList.Add($"https://www.youtube.com/watch?v={videoId}");

    using var process = new Process { StartInfo = psi };
    process.Start();

    // Start draining both pipes before waiting so a full pipe buffer cannot block yt-dlp.
    var stdoutTask = process.StandardOutput.ReadToEndAsync(ct);
    var stderrTask = process.StandardError.ReadToEndAsync(ct);

    try
    {
        await process.WaitForExitAsync(ct);
        // Observe both reads so neither task is left dangling when the process is disposed.
        await Task.WhenAll(stdoutTask, stderrTask);
    }
    catch (OperationCanceledException)
    {
        // Cancelling the wait does not stop the child; kill it so it is not orphaned.
        try { process.Kill(entireProcessTree: true); } catch { /* best effort */ }
        throw;
    }

    if (process.ExitCode != 0)
        return null;

    var json = await stdoutTask;
    if (string.IsNullOrWhiteSpace(json))
        return null;

    // Reads a property only when it is a JSON string; any other kind is treated as absent
    // instead of letting GetString() throw and discard the whole result.
    static string? ReadString(JsonElement obj, string name) =>
        obj.TryGetProperty(name, out var value) && value.ValueKind == JsonValueKind.String
            ? value.GetString()
            : null;

    try
    {
        using var doc = JsonDocument.Parse(json);
        var root = doc.RootElement;
        if (root.ValueKind != JsonValueKind.Object)
            return null;

        var title = ReadString(root, "title") ?? "(no title)";
        var channel = ReadString(root, "channel") ?? "(unknown channel)";
        var description = ReadString(root, "description");

        // yt-dlp returns duration in seconds
        TimeSpan? duration = null;
        if (root.TryGetProperty("duration", out var dur) && dur.ValueKind == JsonValueKind.Number)
            duration = TimeSpan.FromSeconds(dur.GetDouble());

        // Upload date comes as "YYYYMMDD". Parse with the invariant culture: a null provider
        // means the current culture, whose calendar may not be Gregorian.
        DateTimeOffset publishedAt = DateTimeOffset.MinValue;
        if (ReadString(root, "upload_date") is { } dateStr
            && DateTime.TryParseExact(dateStr, "yyyyMMdd",
                System.Globalization.CultureInfo.InvariantCulture,
                System.Globalization.DateTimeStyles.None, out var parsed))
        {
            publishedAt = new DateTimeOffset(parsed, TimeSpan.Zero);
        }

        // Build ISO 8601 duration string for FormattedDuration compatibility
        string? isoDuration = null;
        if (duration.HasValue)
        {
            var ts = duration.Value;
            isoDuration = $"PT{(int)ts.TotalHours}H{ts.Minutes}M{ts.Seconds}S";
        }

        return new VideoMetadata
        {
            VideoId = videoId,
            Title = title,
            ChannelTitle = channel,
            PublishedAt = publishedAt,
            Duration = isoDuration,
            Description = description
        };
    }
    catch (Exception ex) when (ex is JsonException
                                   or InvalidOperationException
                                   or OverflowException
                                   or ArgumentException)
    {
        // Malformed or out-of-range yt-dlp output: report "no metadata" rather than failing.
        return null;
    }
}
|
|
|
|
/// <summary>
/// Produces a transcript for the given video.
///
/// Subtitles are requested through <c>DownloadSubtitlesWithTimestampsAsync</c>. When that
/// yields non-blank text, the result carries the text, its timestamped segments and a
/// caption-track description labelled as English, marked as either owner-published or
/// auto-generated.
///
/// When no subtitle text comes back, the transcript is built from the video metadata instead.
/// </summary>
/// <param name="metadata">Metadata of the video; its <c>VideoId</c> selects the subtitles.</param>
/// <param name="ct">Token passed through to the subtitle download.</param>
/// <returns>A caption-based transcript, or a metadata-only transcript as fallback.</returns>
public async Task<VideoTranscript> GetTranscriptAsync(
    VideoMetadata metadata,
    CancellationToken ct = default)
{
    var (captionText, timedSegments, autoGenerated) =
        await DownloadSubtitlesWithTimestampsAsync(metadata.VideoId, ct);

    // Nothing usable came back — fall back to the description text
    if (string.IsNullOrWhiteSpace(captionText))
        return BuildMetadataOnlyTranscript(metadata);

    string trackKind;
    string trackName;
    if (autoGenerated)
    {
        trackKind = "asr";
        trackName = "Auto-generated (en)";
    }
    else
    {
        trackKind = "standard";
        trackName = "English";
    }

    var track = new CaptionTrack
    {
        TrackId = "yt-dlp",
        Language = "en",
        TrackKind = trackKind,
        Name = trackName
    };

    return new VideoTranscript
    {
        VideoId = metadata.VideoId,
        Text = captionText,
        Segments = timedSegments,
        SourceTrack = track,
        Source = autoGenerated ? TranscriptSource.AutoGenerated : TranscriptSource.OwnerPublished
    };
}
|
|
|
|
// ─────────────────────────────────────────────────────────────────────────
|
|
// Private helpers
|
|
// ─────────────────────────────────────────────────────────────────────────
|
|
|
|
/// <summary>
/// Text-only view over <c>DownloadSubtitlesWithTimestampsAsync</c>: runs the same download
/// and drops the timestamped segments from the result.
/// </summary>
/// <param name="videoId">ID of the video whose subtitles are requested.</param>
/// <param name="ct">Token passed through to the download.</param>
/// <returns>The transcript text (or <c>null</c>) and whether it was auto-generated.</returns>
private static async Task<(string? Text, bool IsAuto)> DownloadSubtitlesWithYtDlpAsync(
    string videoId,
    CancellationToken ct)
{
    var download = await DownloadSubtitlesWithTimestampsAsync(videoId, ct);
    return (download.Text, download.IsAuto);
}
|
|
|
|
/// <summary>
/// Downloads subtitles into a throwaway temp directory and returns the plain text together
/// with the timestamped segments. Manual subtitles are tried first; auto-generated ones are
/// requested only when the manual attempt yields no text.
/// </summary>
/// <param name="videoId">ID of the video whose subtitles are requested.</param>
/// <param name="ct">Token passed through to each yt-dlp run.</param>
/// <returns>
/// Text, segments and an auto-generated flag; <c>(null, empty list, false)</c> when neither
/// attempt produced text.
/// </returns>
private static async Task<(string? Text, List<TimestampedSegment> Segments, bool IsAuto)> DownloadSubtitlesWithTimestampsAsync(
    string videoId,
    CancellationToken ct)
{
    var workDir = Path.Combine(Path.GetTempPath(), $"ytsumm_{videoId}_{Guid.NewGuid():N}");
    Directory.CreateDirectory(workDir);

    try
    {
        // Pass 1 asks for manual (human-written) subtitles, pass 2 for auto-generated ones.
        foreach (var useAutoCaptions in new[] { false, true })
        {
            var (captionText, captionSegments) = await RunYtDlpSubtitleWithTimestampsAsync(
                videoId, workDir, writeSub: !useAutoCaptions, writeAutoSub: useAutoCaptions, ct);

            if (!string.IsNullOrWhiteSpace(captionText))
                return (captionText, captionSegments, useAutoCaptions);
        }

        return (null, new List<TimestampedSegment>(), false);
    }
    finally
    {
        try
        {
            Directory.Delete(workDir, recursive: true);
        }
        catch
        {
            // best effort — a leftover temp directory is not worth failing the call for
        }
    }
}
|
|
|
|
/// <summary>
/// Runs a single yt-dlp invocation to download subtitles with timestamps.
/// Returns the parsed plain-text transcript and timestamped segments.
/// </summary>
/// <param name="videoId">The video ID appended to the watch URL passed to yt-dlp.</param>
/// <param name="tempDir">Existing directory that yt-dlp writes the subtitle files into.</param>
/// <param name="writeSub">Adds <c>--write-sub</c> (manual subtitles) to the command line.</param>
/// <param name="writeAutoSub">Adds <c>--write-auto-sub</c> (auto-generated subtitles).</param>
/// <param name="ct">Cancels the wait for yt-dlp; the child process is killed when it fires.</param>
/// <returns>
/// The parsed text and segments, or <c>(null, empty list)</c> when yt-dlp exits with a
/// non-zero code, writes no subtitle file, or writes a blank one.
/// </returns>
/// <exception cref="OperationCanceledException">Thrown when <paramref name="ct"/> is cancelled.</exception>
private static async Task<(string? Text, List<TimestampedSegment> Segments)> RunYtDlpSubtitleWithTimestampsAsync(
    string videoId,
    string tempDir,
    bool writeSub,
    bool writeAutoSub,
    CancellationToken ct)
{
    // Ordinal extension checks; culture-sensitive EndsWith is not appropriate for file names.
    static bool HasExtension(string path, string extension) =>
        string.Equals(Path.GetExtension(path), extension, StringComparison.Ordinal);

    static bool IsSubtitleFile(string path) =>
        HasExtension(path, ".srv1") || HasExtension(path, ".vtt") || HasExtension(path, ".srt");

    // Clean any previous subtitle files from this temp dir. This uses the same predicate as
    // the pick-up step below, so a stale .srt from an earlier attempt cannot be selected.
    foreach (var stale in Directory.GetFiles(tempDir).Where(IsSubtitleFile))
        File.Delete(stale);

    var psi = new ProcessStartInfo
    {
        FileName = "yt-dlp",
        RedirectStandardOutput = true,
        RedirectStandardError = true,
        UseShellExecute = false,
        CreateNoWindow = true,
    };

    psi.ArgumentList.Add("--skip-download");
    psi.ArgumentList.Add("--sub-lang");
    psi.ArgumentList.Add("en,en-US,en-GB,en.*");
    psi.ArgumentList.Add("--sub-format");
    psi.ArgumentList.Add("srv1/vtt/best");
    psi.ArgumentList.Add("-o");
    psi.ArgumentList.Add(Path.Combine(tempDir, "%(id)s"));

    if (writeSub)
        psi.ArgumentList.Add("--write-sub");
    if (writeAutoSub)
        psi.ArgumentList.Add("--write-auto-sub");

    psi.ArgumentList.Add($"https://www.youtube.com/watch?v={videoId}");

    using var process = new Process { StartInfo = psi };
    process.Start();

    // Start draining both pipes before waiting so a full pipe buffer cannot block yt-dlp.
    var stdoutTask = process.StandardOutput.ReadToEndAsync(ct);
    var stderrTask = process.StandardError.ReadToEndAsync(ct);

    try
    {
        await process.WaitForExitAsync(ct);
        // Observe both reads so neither task is left dangling when the process is disposed.
        await Task.WhenAll(stdoutTask, stderrTask);
    }
    catch (OperationCanceledException)
    {
        // Cancelling the wait does not stop the child; kill it so it is not orphaned.
        try { process.Kill(entireProcessTree: true); } catch { /* best effort */ }
        throw;
    }

    if (process.ExitCode != 0)
        return (null, new List<TimestampedSegment>());

    // Prefer srv1; break ties by name so the choice does not depend on directory order.
    var subtitlePath = Directory.GetFiles(tempDir)
        .Where(IsSubtitleFile)
        .OrderBy(f => HasExtension(f, ".srv1") ? 0 : 1)
        .ThenBy(f => f, StringComparer.Ordinal)
        .FirstOrDefault();

    if (subtitlePath is null)
        return (null, new List<TimestampedSegment>());

    var content = await File.ReadAllTextAsync(subtitlePath, ct);

    if (string.IsNullOrWhiteSpace(content))
        return (null, new List<TimestampedSegment>());

    return HasExtension(subtitlePath, ".srv1")
        ? ParseTimedTextXmlWithTimestamps(content)
        : ParseVttOrSrtWithTimestamps(content);
}
|
|
|
|
/// <summary>
/// Converts srv1 timed-text XML into a single plain-text string plus one timestamped
/// segment per non-empty <c>text</c> element.
/// </summary>
/// <param name="xml">The subtitle document as XML text.</param>
/// <returns>
/// The space-joined caption text and its segments. If anything goes wrong while reading the
/// document, the input is returned unchanged as the text with an empty segment list.
/// </returns>
private static (string Text, List<TimestampedSegment> Segments) ParseTimedTextXmlWithTimestamps(string xml)
{
    // Reads an attribute as a number of seconds; a missing or non-numeric value counts as zero.
    static TimeSpan ReadSeconds(System.Xml.Linq.XElement node, string attributeName)
    {
        var raw = node.Attribute(attributeName)?.Value;
        var ok = double.TryParse(
            raw,
            System.Globalization.NumberStyles.Float,
            System.Globalization.CultureInfo.InvariantCulture,
            out var seconds);
        return ok ? TimeSpan.FromSeconds(seconds) : TimeSpan.Zero;
    }

    try
    {
        var timed = new List<TimestampedSegment>();
        var pieces = new List<string>();

        foreach (var node in System.Xml.Linq.XDocument.Parse(xml).Descendants("text"))
        {
            // Decode entities left in the element text, then collapse whitespace runs.
            var collapsed = System.Text.RegularExpressions.Regex
                .Replace(System.Web.HttpUtility.HtmlDecode(node.Value), @"\s+", " ")
                .Trim();

            if (collapsed.Length == 0)
                continue;

            pieces.Add(collapsed);

            var segment = new TimestampedSegment
            {
                Start = ReadSeconds(node, "start"),
                Duration = ReadSeconds(node, "dur"),
                Text = collapsed
            };
            timed.Add(segment);
        }

        var joined = string.Join(" ", pieces);
        return (joined, timed);
    }
    catch
    {
        return (xml, new List<TimestampedSegment>());
    }
}
|
|
|
|
/// <summary>
/// Text-only view over <c>ParseTimedTextXmlWithTimestamps</c>: parses the srv1 timed-text
/// XML and returns just the plain text, discarding the segments.
/// </summary>
/// <param name="xml">The subtitle document as XML text.</param>
/// <returns>The plain-text transcript.</returns>
private static string ParseTimedTextXml(string xml) =>
    ParseTimedTextXmlWithTimestamps(xml).Text;
|
|
|
|
/// <summary>
/// Parses VTT or SRT subtitle formats into plain text and timestamped segments.
/// Strips cue identifiers and formatting tags while preserving timestamp associations.
/// </summary>
/// <param name="content">The full text of a WebVTT or SubRip subtitle file.</param>
/// <returns>
/// The space-joined caption text and the segments it was built from, with consecutive
/// segments carrying identical text collapsed into the first one.
/// </returns>
private static (string Text, List<TimestampedSegment> Segments) ParseVttOrSrtWithTimestamps(string content)
{
    var segments = new List<TimestampedSegment>();
    var currentStart = TimeSpan.Zero;
    var currentEnd = TimeSpan.Zero;
    var currentText = new List<string>();

    // Emits the buffered cue text (if any) as one segment and resets the buffer.
    void FlushCue()
    {
        if (currentText.Count == 0)
            return;

        segments.Add(new TimestampedSegment
        {
            Start = currentStart,
            Duration = currentEnd - currentStart,
            Text = string.Join(" ", currentText)
        });
        currentText.Clear();
    }

    // A leading byte-order mark is not whitespace for Trim(), so drop it explicitly;
    // otherwise the ordinal "WEBVTT" header check below would miss the first line.
    foreach (var rawLine in content.TrimStart('\uFEFF').Split('\n'))
    {
        var line = rawLine.Trim();

        // A blank line terminates the current cue.
        if (line.Length == 0)
        {
            FlushCue();
            continue;
        }

        // Skip headers, metadata and numeric cue identifiers. Note that a caption line made
        // up only of digits cannot be told apart from an SRT index here and is skipped too.
        if (line.StartsWith("WEBVTT", StringComparison.Ordinal) ||
            line.StartsWith("NOTE", StringComparison.Ordinal) ||
            line.StartsWith("Kind:", StringComparison.Ordinal) ||
            line.StartsWith("Language:", StringComparison.Ordinal) ||
            System.Text.RegularExpressions.Regex.IsMatch(line, @"^\d+$"))
        {
            continue;
        }

        // Timestamp line: "00:01:23.456 --> 00:01:27.890" (VTT) or
        // "00:01:23,456 --> 00:01:27,890" (SRT). The comma must be accepted here, otherwise
        // SRT timing lines fall through and end up inside the transcript text.
        var tsMatch = System.Text.RegularExpressions.Regex.Match(line,
            @"^(\d{2,}:\d{2}[:\.,][\d\.,]+)\s*-->\s*(\d{2,}:\d{2}[:\.,][\d\.,]+)");
        if (tsMatch.Success)
        {
            // A new cue starts: flush whatever the previous one buffered.
            FlushCue();
            currentStart = ParseVttTimestamp(tsMatch.Groups[1].Value);
            currentEnd = ParseVttTimestamp(tsMatch.Groups[2].Value);
            continue;
        }

        // Content line — strip HTML tags and decode
        var stripped = System.Text.RegularExpressions.Regex.Replace(line, @"<[^>]+>", "");
        var decoded = System.Web.HttpUtility.HtmlDecode(stripped).Trim();
        if (!string.IsNullOrEmpty(decoded))
            currentText.Add(decoded);
    }

    // Flush last segment (the file may not end with a blank line)
    FlushCue();

    // Deduplicate consecutive identical text segments (common in VTT)
    var deduped = new List<TimestampedSegment>();
    string? prevText = null;
    foreach (var seg in segments)
    {
        if (seg.Text != prevText)
            deduped.Add(seg);
        prevText = seg.Text;
    }

    var plainText = string.Join(" ", deduped.Select(s => s.Text));
    return (plainText, deduped);
}
|
|
|
|
/// <summary>Parses a VTT/SRT timestamp string into a TimeSpan.</summary>
/// <param name="ts">
/// A timestamp in <c>HH:MM:SS.mmm</c> or <c>MM:SS.mmm</c> form; a comma is accepted as the
/// fraction separator as well.
/// </param>
/// <returns>
/// The timestamp rounded to the nearest millisecond. Components that cannot be parsed count
/// as zero, and input without two or three colon-separated parts yields <c>TimeSpan.Zero</c>.
/// </returns>
private static TimeSpan ParseVttTimestamp(string ts)
{
    // Normalize: VTT uses "." for ms, SRT uses "," — handle both
    var parts = ts.Replace(',', '.').Split(':');
    if (parts.Length != 2 && parts.Length != 3)
        return TimeSpan.Zero;

    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    // Unparseable components deliberately fall back to zero rather than throwing.
    var hours = 0;
    if (parts.Length == 3)
        int.TryParse(parts[0], System.Globalization.NumberStyles.Integer, invariant, out hours);

    int.TryParse(parts[^2], System.Globalization.NumberStyles.Integer, invariant, out var minutes);

    if (!double.TryParse(parts[^1], System.Globalization.NumberStyles.Float, invariant, out var seconds)
        || !double.IsFinite(seconds))
    {
        seconds = 0;
    }

    // Round to whole milliseconds. Truncating the fractional part instead turns values
    // such as 23.456 into 23.455 because of binary floating-point representation error.
    var wholeMilliseconds = (long)Math.Round(seconds * 1000.0, MidpointRounding.AwayFromZero);

    return new TimeSpan(hours, minutes, 0)
        + TimeSpan.FromTicks(wholeMilliseconds * TimeSpan.TicksPerMillisecond);
}
|
|
|
|
/// <summary>
/// Text-only view over <c>ParseVttOrSrtWithTimestamps</c>: parses VTT or SRT content and
/// returns just the plain text, discarding the segments.
/// </summary>
/// <param name="content">The full text of the subtitle file.</param>
/// <returns>The plain-text transcript.</returns>
private static string ParseVttOrSrt(string content) =>
    ParseVttOrSrtWithTimestamps(content).Text;
|
|
|
|
/// <summary>
/// Builds a stand-in transcript from video metadata for use when no captions are available.
/// With a non-blank description the text combines title, channel and description; otherwise
/// it is a one-line notice naming the title. The result has no source track and is marked
/// as <c>TranscriptSource.MetadataOnly</c>.
/// </summary>
/// <param name="metadata">The video metadata to derive the text from.</param>
/// <returns>A transcript whose text comes from metadata only.</returns>
private static VideoTranscript BuildMetadataOnlyTranscript(VideoMetadata metadata)
{
    string fallbackText;
    if (string.IsNullOrWhiteSpace(metadata.Description))
    {
        fallbackText = $"No transcript or description available for: {metadata.Title}";
    }
    else
    {
        fallbackText = $"Video title: {metadata.Title}\n\nChannel: {metadata.ChannelTitle}\n\nDescription:\n{metadata.Description}";
    }

    var transcript = new VideoTranscript
    {
        VideoId = metadata.VideoId,
        Text = fallbackText,
        SourceTrack = null,
        Source = TranscriptSource.MetadataOnly
    };
    return transcript;
}
|
|
}
|