feat: initialize YouTube summarizer project with OpenAI integration and map-reduce processing strategy

This commit is contained in:
Clinton Billedeaux 2026-05-18 11:00:15 -05:00
commit 8f9291883d
12 changed files with 1927 additions and 0 deletions

27
.gitignore vendored Normal file
View file

@ -0,0 +1,27 @@
# Visual Studio / .NET build outputs
[Bb]in/
[Oo]bj/
[Pp]ublish/
*.user
*.userosscache
*.sln.docstates
*.suo
*.cache
# IDEs / Tools
.idea/
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
*.swp
*.~*
# OS metadata
.DS_Store
Thumbs.db
# Project specific / Temporary files
scratch/
*.log

56
AppSettings.cs Normal file
View file

@ -0,0 +1,56 @@
namespace YoutubeSummarizer.Configuration;
/// <summary>
/// Top-level settings object that appsettings.json is bound onto.
/// It groups the <see cref="LLM"/> and <see cref="Summarizer"/> sections;
/// each section starts out with its built-in defaults.
/// </summary>
public sealed class AppSettings
{
    /// <summary>Endpoint, credentials and model options for the LLM API.</summary>
    public LlmSettings LLM { get; init; } = new LlmSettings();

    /// <summary>Options that control how transcripts are summarized.</summary>
    public SummarizerSettings Summarizer { get; init; } = new SummarizerSettings();
}
/// <summary>
/// Connection and generation options for an OpenAI-compatible LLM endpoint
/// (OpenAI itself, or a local Ollama server).
/// </summary>
public sealed class LlmSettings
{
    /// <summary>
    /// Root URL of the API.
    /// OpenAI: https://api.openai.com/v1
    /// Ollama: http://localhost:11434/v1
    /// </summary>
    public string BaseUrl { get; init; } = "https://api.openai.com/v1";

    /// <summary>API key sent with each request. Ollama accepts any value.</summary>
    public string ApiKey { get; init; } = "";

    /// <summary>
    /// Name of the chat model.
    /// OpenAI examples: gpt-4o-mini, gpt-4o
    /// Ollama examples: qwen3:14b, llama3.1
    /// </summary>
    public string Model { get; init; } = "gpt-4o-mini";

    /// <summary>Upper bound on tokens in the generated summary (output only, not the input).</summary>
    public int MaxTokens { get; init; } = 1500;

    /// <summary>How long, in seconds, an API call may run before timing out.</summary>
    public int TimeoutSeconds { get; init; } = 100;
}
/// <summary>
/// Tunables for the summarization pipeline.
/// </summary>
public sealed class SummarizerSettings
{
    /// <summary>
    /// Approximate number of words above which a transcript is split into
    /// chunks and summarized with a final "summary of summaries" pass, so
    /// that each API call stays within the model's context limit.
    /// </summary>
    public int ChunkWordLimit { get; init; } = 3000;

    /// <summary>Set to true to print the full transcript text before summarizing.</summary>
    public bool ShowTranscript { get; init; }
}

197
ConsoleRenderer.cs Normal file
View file

@ -0,0 +1,197 @@
using YoutubeSummarizer.Models;
namespace YoutubeSummarizer.Services;
/// <summary>
/// Handles all console output formatting and interactive prompts.
/// Keeping display logic separate from business logic makes it easy to
/// later add output modes (JSON, Markdown file, HTML report) without
/// touching the service layer.
/// </summary>
public static class ConsoleRenderer
{
    // ANSI escape sequences used for styling. They are written unconditionally,
    // so output piped to a file will contain the raw escape codes (a --no-color
    // switch is not implemented).
    private const string Reset = "\x1b[0m";
    private const string Bold = "\x1b[1m";
    private const string Cyan = "\x1b[36m";
    private const string Yellow = "\x1b[33m";
    private const string Green = "\x1b[32m";
    private const string Red = "\x1b[31m";
    private const string Dim = "\x1b[2m";

    /// <summary>Prints the application banner on startup.</summary>
    public static void PrintBanner()
    {
        Console.WriteLine();
        Console.WriteLine($"{Bold}{Cyan}╔════════════════════════════════════════╗{Reset}");
        Console.WriteLine($"{Bold}{Cyan}║ YouTube Video Summarizer ║{Reset}");
        Console.WriteLine($"{Bold}{Cyan}╚════════════════════════════════════════╝{Reset}");
        Console.WriteLine();
    }

    /// <summary>
    /// Prompts the user for a URL and reads one line of input.
    /// </summary>
    /// <returns>
    /// The trimmed input line, or <c>"q"</c> (the quit command named in the
    /// prompt) when standard input has reached end-of-file.
    /// </returns>
    public static string PromptForUrl()
    {
        Console.Write($"{Bold}Enter YouTube URL (or 'q' to quit):{Reset} ");

        // ReadLine returns null once stdin is closed or exhausted. Returning an
        // empty string there would make a caller that re-prompts on blank input
        // loop forever, so end-of-input is reported as an explicit quit.
        return Console.ReadLine()?.Trim() ?? "q";
    }

    /// <summary>
    /// Asks the user whether they want to save the transcript to a text file.
    /// </summary>
    /// <returns>True only for "y" or "yes" (case-insensitive); false otherwise, including end-of-input.</returns>
    public static bool PromptSaveTranscript()
    {
        Console.Write($"{Bold}Save transcript to file? (y/n):{Reset} ");
        var answer = Console.ReadLine()?.Trim() ?? string.Empty;
        return answer.Equals("y", StringComparison.OrdinalIgnoreCase)
            || answer.Equals("yes", StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>Prints a success message with the saved file path.</summary>
    /// <param name="filePath">Path of the file that was written.</param>
    public static void PrintFileSaved(string filePath)
    {
        Console.WriteLine($" {Green}✓ Transcript saved to:{Reset} {filePath}");
        Console.WriteLine();
    }

    /// <summary>
    /// Prompts the user to choose a summary mode.
    /// </summary>
    /// <returns>
    /// <see cref="SummaryMode.PersonalFilter"/> when the user enters "2";
    /// <see cref="SummaryMode.Standard"/> for any other input.
    /// </returns>
    public static SummaryMode PromptSummaryMode()
    {
        Console.WriteLine($" {Dim}Summary modes:{Reset}");
        Console.WriteLine($" {Bold}1{Reset} Standard (detailed bullet-point summary)");
        Console.WriteLine($" {Bold}2{Reset} Personal Filter (relevance verdict: ACT / MONITOR / IGNORE)");
        Console.Write($"{Bold}Choose summary mode [1]:{Reset} ");
        var choice = Console.ReadLine()?.Trim() ?? string.Empty;
        return choice == "2" ? SummaryMode.PersonalFilter : SummaryMode.Standard;
    }

    /// <summary>Prints a single dimmed "→ message..." status line announcing a step that is about to run.</summary>
    /// <param name="message">Short description of the work being started.</param>
    public static void PrintWorking(string message)
    {
        Console.WriteLine($" {Dim}→ {message}...{Reset}");
    }

    /// <summary>
    /// Renders the full summary result to the console in a structured,
    /// readable format: metadata header, optional quality warning, and
    /// the word-wrapped summary body.
    /// </summary>
    /// <param name="summary">The summary to display.</param>
    /// <param name="showTranscriptSource">When true, includes a badge describing where the transcript came from.</param>
    public static void PrintSummary(VideoSummary summary, bool showTranscriptSource)
    {
        Console.WriteLine();
        PrintDivider();

        // ── Metadata header ──────────────────────────────────────────────────
        Console.WriteLine($"{Bold}{Green} {summary.Metadata.Title}{Reset}");
        Console.WriteLine($" {Dim}Channel:{Reset} {summary.Metadata.ChannelTitle}");
        Console.WriteLine($" {Dim}Published:{Reset} {summary.Metadata.PublishedAt:MMMM d, yyyy}");
        Console.WriteLine($" {Dim}Duration:{Reset} {summary.Metadata.FormattedDuration}");
        Console.WriteLine($" {Dim}URL:{Reset} https://youtu.be/{summary.Metadata.VideoId}");

        // ── Transcript source badge ──────────────────────────────────────────
        if (showTranscriptSource)
        {
            var (badge, color) = summary.TranscriptSource switch
            {
                TranscriptSource.OwnerPublished => ("✓ Owner-published captions", Green),
                TranscriptSource.CommunityContributed => ("✓ Community captions", Green),
                TranscriptSource.AutoGenerated => ("~ Auto-generated (ASR)", Yellow),
                TranscriptSource.MetadataOnly => ("✗ Metadata only", Red),
                _ => ("? Unknown", Dim)
            };
            Console.WriteLine($" {Dim}Transcript:{Reset} {color}{badge}{Reset}");
        }

        Console.WriteLine($" {Dim}Model:{Reset} {summary.ModelUsed}");
        Console.WriteLine($" {Dim}Generated:{Reset} {summary.GeneratedAt:yyyy-MM-dd HH:mm} UTC");
        PrintDivider();

        // ── Quality warning ──────────────────────────────────────────────────
        if (summary.QualityWarning is not null)
        {
            Console.WriteLine();
            Console.WriteLine($" {Yellow}{summary.QualityWarning}{Reset}");
        }

        // ── Summary body ─────────────────────────────────────────────────────
        Console.WriteLine();
        Console.WriteLine($"{Bold} SUMMARY{Reset}");
        Console.WriteLine();

        // Wrap the body at 78 characters so that, with the leading indent, it
        // stays readable in a standard 80-column terminal.
        foreach (var line in WordWrap(summary.SummaryText, maxWidth: 78))
        {
            Console.WriteLine($" {line}");
        }

        Console.WriteLine();
        PrintDivider();
        Console.WriteLine();
    }

    /// <summary>Prints a styled error message surrounded by blank lines.</summary>
    /// <param name="message">Text of the error.</param>
    public static void PrintError(string message)
    {
        Console.WriteLine();
        Console.WriteLine($" {Red}✗ Error: {message}{Reset}");
        Console.WriteLine();
    }

    /// <summary>Prints a styled warning (non-fatal).</summary>
    /// <param name="message">Text of the warning.</param>
    public static void PrintWarning(string message)
    {
        Console.WriteLine($" {Yellow}⚠ {message}{Reset}");
    }

    // ─────────────────────────────────────────────────────────────────────────
    // Private helpers
    // ─────────────────────────────────────────────────────────────────────────

    /// <summary>Prints a dimmed horizontal rule 74 characters wide.</summary>
    private static void PrintDivider()
    {
        Console.WriteLine($" {Dim}{"".PadRight(74, '─')}{Reset}");
    }

    /// <summary>
    /// Splits text into lines no wider than <paramref name="maxWidth"/> characters,
    /// breaking only at spaces. Existing newlines are kept as paragraph breaks and
    /// blank paragraphs are emitted as empty lines. A single word longer than
    /// <paramref name="maxWidth"/> is emitted on its own line without being split.
    /// </summary>
    /// <param name="text">Text to wrap.</param>
    /// <param name="maxWidth">Maximum line width in characters.</param>
    /// <returns>The wrapped lines, in order.</returns>
    private static IEnumerable<string> WordWrap(string text, int maxWidth)
    {
        foreach (var paragraph in text.Split('\n'))
        {
            if (string.IsNullOrWhiteSpace(paragraph))
            {
                yield return string.Empty;
                continue;
            }

            var words = paragraph.Split(' ', StringSplitOptions.RemoveEmptyEntries);
            var current = new System.Text.StringBuilder();

            foreach (var word in words)
            {
                // Only break when the line already holds text. Flushing an empty
                // line would emit a spurious blank line in front of any word
                // that is itself as wide as (or wider than) maxWidth.
                if (current.Length > 0 && current.Length + 1 + word.Length > maxWidth)
                {
                    yield return current.ToString();
                    current.Clear();
                }

                if (current.Length > 0)
                {
                    current.Append(' ');
                }

                current.Append(word);
            }

            if (current.Length > 0)
            {
                yield return current.ToString();
            }
        }
    }
}

227
Program.cs Normal file
View file

@ -0,0 +1,227 @@
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
using YoutubeSummarizer.Configuration;
using YoutubeSummarizer.Models;
using YoutubeSummarizer.Services;
// ═════════════════════════════════════════════════════════════════════════════
// Bootstrap
// ═════════════════════════════════════════════════════════════════════════════
// Build configuration from appsettings.json (required) with optional
// environment variable overrides (useful for CI or containerized deployment).
// Environment variables use "__" as the section separator, matching the
// sections bound below: LLM__ApiKey, Summarizer__ChunkWordLimit, etc.
var config = new ConfigurationBuilder()
    .SetBasePath(AppContext.BaseDirectory)
    .AddJsonFile("appsettings.json", optional: false, reloadOnChange: false)
    .AddEnvironmentVariables() // overrides appsettings values if set
    .Build();

// Bind configuration sections to strongly-typed objects.
var appSettings = new AppSettings();
config.Bind(appSettings);

// Validate required keys up front — fail fast with a clear message rather
// than letting the first API call blow up with a cryptic 401.
ValidateSettings(appSettings);

// Wire up DI container.
// For a console app this is lightweight, but it mirrors the pattern used
// in the LIKA/IKA ASP.NET services so the code is easy to lift into a
// background service or API controller later.
var services = new ServiceCollection();

// Register a typed HttpClient for YouTubeService.
// Using IHttpClientFactory gives us connection pooling and the ability to
// attach Polly retry policies.
services.AddHttpClient<YouTubeService>(client =>
{
    client.DefaultRequestHeaders.Add("User-Agent",
        "Mozilla/5.0 (compatible; YoutubeSummarizer/1.0)");
    client.Timeout = TimeSpan.FromSeconds(30);
});

// Register services with their config dependencies.
services.AddSingleton(appSettings.LLM);
services.AddSingleton(appSettings.Summarizer);
services.AddTransient<SummarizerService>();

// The provider owns the HTTP handler pool and any disposable services it
// creates, so dispose it when the program ends instead of leaking it.
await using var serviceProvider = services.BuildServiceProvider();

// ═════════════════════════════════════════════════════════════════════════════
// Main loop
// ═════════════════════════════════════════════════════════════════════════════
ConsoleRenderer.PrintBanner();

// Handle Ctrl+C gracefully so any in-progress API call can finish or cancel.
using var cts = new CancellationTokenSource();
Console.CancelKeyPress += (_, e) =>
{
    e.Cancel = true; // prevent immediate termination
    cts.Cancel();
    Console.WriteLine("\n Cancellation requested. Finishing current operation...");
};

while (!cts.Token.IsCancellationRequested)
{
    var input = ConsoleRenderer.PromptForUrl();
    if (string.IsNullOrWhiteSpace(input))
    {
        continue;
    }

    if (input.Equals("q", StringComparison.OrdinalIgnoreCase))
    {
        break;
    }

    // Parse the video ID from the URL
    var videoId = YouTubeService.ExtractVideoId(input);
    if (videoId is null)
    {
        ConsoleRenderer.PrintError("Could not extract a valid YouTube video ID from that URL.");
        ConsoleRenderer.PrintWarning("Accepted formats: watch?v=..., youtu.be/..., /shorts/..., /embed/...");
        continue;
    }

    // Ask whether to save transcript to file before processing
    var saveTranscript = ConsoleRenderer.PromptSaveTranscript();

    // Choose summary mode
    var summaryMode = ConsoleRenderer.PromptSummaryMode();

    await ProcessVideoAsync(videoId, serviceProvider, appSettings.Summarizer, saveTranscript, summaryMode, cts.Token);
}

Console.WriteLine(" Goodbye!");
// ═════════════════════════════════════════════════════════════════════════════
// Video processing pipeline
// ═════════════════════════════════════════════════════════════════════════════
/// <summary>
/// Orchestrates the full pipeline for a single video:
/// 1. Fetch metadata
/// 2. Fetch transcript
/// 3. Summarize (one LLM pass per summary that is actually needed)
/// 4. Optionally save the transcript plus the standard summary to a file
/// 5. Display (ConsoleRenderer)
/// Errors are reported on the console rather than propagated to the caller.
/// </summary>
/// <param name="videoId">The YouTube video ID to process.</param>
/// <param name="sp">Service provider used to resolve the YouTube and summarizer services.</param>
/// <param name="summarizerSettings">Summarizer options (only <c>ShowTranscript</c> is read here).</param>
/// <param name="saveTranscript">When true, writes the transcript and the standard summary to a file.</param>
/// <param name="summaryMode">Which summary style to display.</param>
/// <param name="ct">Token that is cancelled when the user presses Ctrl+C.</param>
static async Task ProcessVideoAsync(
    string videoId,
    IServiceProvider sp,
    SummarizerSettings summarizerSettings,
    bool saveTranscript,
    SummaryMode summaryMode,
    CancellationToken ct)
{
    try
    {
        // Resolve the services from the container for this run.
        var youtubeService = sp.GetRequiredService<YouTubeService>();
        var summarizerService = sp.GetRequiredService<SummarizerService>();

        // ── Step 1: Metadata ──────────────────────────────────────────────
        ConsoleRenderer.PrintWorking("Fetching video metadata");
        var metadata = await youtubeService.GetVideoMetadataAsync(videoId, ct);
        if (metadata is null)
        {
            ConsoleRenderer.PrintError($"Video not found or is private: {videoId}");
            return;
        }

        Console.WriteLine($" {metadata.Title}");

        // ── Step 2: Transcript ────────────────────────────────────────────
        ConsoleRenderer.PrintWorking("Fetching transcript");
        var transcript = await youtubeService.GetTranscriptAsync(metadata, ct);

        // Optionally show raw transcript for debugging / inspection
        if (summarizerSettings.ShowTranscript)
        {
            Console.WriteLine();
            Console.WriteLine(" ─── RAW TRANSCRIPT ───");
            Console.WriteLine(transcript.Text);
            Console.WriteLine(" ─── END TRANSCRIPT ───");
            Console.WriteLine();
        }

        Console.WriteLine(
            $" Transcript: {transcript.Source} | {transcript.WordCount:N0} words");

        // ── Step 3: Summarize ─────────────────────────────────────────────
        // The standard summary is what gets written to the saved file and what
        // is displayed in Standard mode. In Personal Filter mode it is only
        // needed when saving, so skip that extra LLM pass otherwise.
        VideoSummary? standardSummary = null;
        VideoSummary displaySummary;
        if (summaryMode == SummaryMode.PersonalFilter)
        {
            if (saveTranscript)
            {
                ConsoleRenderer.PrintWorking("Summarizing with LLM");
                standardSummary = await summarizerService.SummarizeAsync(
                    metadata, transcript, SummaryMode.Standard, ct);
            }

            ConsoleRenderer.PrintWorking("Applying Personal Information Filter");
            displaySummary = await summarizerService.SummarizeAsync(
                metadata, transcript, SummaryMode.PersonalFilter, ct);
        }
        else
        {
            ConsoleRenderer.PrintWorking("Summarizing with LLM");
            standardSummary = await summarizerService.SummarizeAsync(
                metadata, transcript, SummaryMode.Standard, ct);
            displaySummary = standardSummary;
        }

        // ── Step 4: Save transcript + standard summary to file ───────────
        // Done after summarization so the file can include the summary.
        if (saveTranscript)
        {
            var transcriptsDir = Path.Combine(
                Environment.GetFolderPath(Environment.SpecialFolder.UserProfile),
                "Downloads", "transcripts");
            ConsoleRenderer.PrintWorking("Saving transcript to file");
            var savedPath = await TranscriptFileService.SaveAsync(
                metadata, transcript, summaryText: standardSummary?.SummaryText,
                outputDirectory: transcriptsDir, ct: ct);
            ConsoleRenderer.PrintFileSaved(savedPath);
        }

        // ── Step 5: Display ───────────────────────────────────────────────
        ConsoleRenderer.PrintSummary(displaySummary, showTranscriptSource: true);
    }
    catch (OperationCanceledException) when (ct.IsCancellationRequested)
    {
        // User pressed Ctrl+C — nothing to report, the loop will exit.
        // The filter matters: HTTP timeouts also surface as
        // OperationCanceledException (TaskCanceledException) and must reach
        // the general handler below instead of being silently swallowed.
    }
    catch (Exception ex)
    {
        ConsoleRenderer.PrintError(ex.Message);

        // Print the stack trace in dim text for debugging without overwhelming
        // normal users who will rarely see this path.
        Console.WriteLine($"\x1b[2m{ex}\x1b[0m");
    }
}
// ═════════════════════════════════════════════════════════════════════════════
// Configuration validation
// ═════════════════════════════════════════════════════════════════════════════
/// <summary>
/// Checks the bound configuration for problems that would otherwise only
/// surface later inside the pipeline. Prints every problem found and
/// terminates the process with exit code 1 if there is at least one.
/// </summary>
/// <param name="settings">The settings bound from configuration.</param>
static void ValidateSettings(AppSettings settings)
{
    var errors = new List<string>();
    var llm = settings.LLM;

    // A real key is only enforced for OpenAI; other OpenAI-compatible
    // endpoints (e.g. a local Ollama server) are allowed to run without one.
    var keyMissing = string.IsNullOrWhiteSpace(llm.ApiKey)
        || llm.ApiKey == "YOUR_API_KEY_HERE";
    if (keyMissing && llm.BaseUrl.Contains("openai.com", StringComparison.OrdinalIgnoreCase))
    {
        errors.Add("LLM:ApiKey is not set in appsettings.json (Required for OpenAI)");
    }

    // The base URL is later turned into a Uri; reject malformed values here
    // with a readable message instead of failing on the first video.
    if (!Uri.TryCreate(llm.BaseUrl, UriKind.Absolute, out _))
    {
        errors.Add($"LLM:BaseUrl is not a valid absolute URL: '{llm.BaseUrl}'");
    }

    if (llm.MaxTokens <= 0)
    {
        errors.Add("LLM:MaxTokens must be greater than zero");
    }

    if (llm.TimeoutSeconds <= 0)
    {
        errors.Add("LLM:TimeoutSeconds must be greater than zero");
    }

    if (settings.Summarizer.ChunkWordLimit <= 0)
    {
        errors.Add("Summarizer:ChunkWordLimit must be greater than zero");
    }

    if (errors.Count > 0)
    {
        Console.ForegroundColor = ConsoleColor.Red;
        Console.WriteLine("\nConfiguration errors:");
        errors.ForEach(e => Console.WriteLine($" ✗ {e}"));
        Console.ResetColor();
        Console.WriteLine("\nCopy appsettings.example.json → appsettings.json and fill in your keys.\n");
        Environment.Exit(1);
    }
}

113
README.md Normal file
View file

@ -0,0 +1,113 @@
# YouTube Video Summarizer
A .NET 8 console application that fetches YouTube video transcripts and produces structured summaries using an LLM (Ollama or OpenAI).
---
## Prerequisites
- [.NET 8 SDK](https://dotnet.microsoft.com/download)
- A **YouTube Data API v3** key → [Google Cloud Console](https://console.cloud.google.com)
- **Local Ollama** (Recommended) or an **OpenAI API key**.
---
## Setup
```bash
# 1. Clone / copy the project
cd YoutubeSummarizer
# 2. Copy the example config and fill in your keys
cp appsettings.example.json appsettings.json
nano appsettings.json # or your editor of choice
# 3. Restore packages
dotnet restore
# 4. Run
dotnet run
```
---
## Google Cloud Setup (YouTube API Key)
1. Go to [console.cloud.google.com](https://console.cloud.google.com)
2. Create or select a project
3. **APIs & Services → Library** → search "YouTube Data API v3" → Enable
4. **APIs & Services → Credentials → Create Credentials → API key**
5. (Optional but recommended) Restrict the key to only the YouTube Data API v3
> Free quota: **10,000 units/day**. Each video lookup costs ~3 units. You can summarize thousands of videos before hitting the limit.
---
## Configuration Reference
| Key | Description | Default |
|---|---|---|
| `YouTube:ApiKey` | Your YouTube Data API v3 key | *(required)* |
| `LLM:BaseUrl` | API endpoint | `http://localhost:11434/v1` |
| `LLM:ApiKey` | API key (any for Ollama) | `ollama` |
| `LLM:Model` | Chat model to use | `qwen3:14b` |
| `LLM:MaxTokens` | Max tokens in summary response | `1500` |
| `LLM:TimeoutSeconds` | Max time for LLM generation | `300` |
| `Summarizer:ChunkWordLimit` | Words per chunk for long videos | `3000` |
| `Summarizer:ShowTranscript` | Print raw transcript before summary | `false` |
---
## Architecture
```
Program.cs
│ Main loop → parses URL → calls pipeline
├── YouTubeService
│ ├── ExtractVideoId() — URL parsing
│ ├── GetVideoMetadataAsync() — YouTube Data API v3 (Videos.list)
│ └── GetTranscriptAsync() — Caption list + timedtext download
├── SummarizerService
│ ├── SummarizeAsync() — Routes to single-pass or chunked
│ ├── SinglePassSummarizeAsync() — One LLM call for short videos
│ └── ChunkedSummarizeAsync() — Map-reduce for long videos
└── ConsoleRenderer — All terminal output / formatting
```
### Caption Quality Transparency
The app tracks how the transcript was obtained and flags it accordingly:
| Source | Label | Warning shown? |
|---|---|---|
| Owner-published captions | `✓ Owner-published` | No |
| Community-contributed | `✓ Community captions` | Minor note |
| Auto-generated (ASR) | `~ Auto-generated` | Yes — accuracy caveat |
| No captions (metadata only) | `✗ Metadata only` | Yes — limited accuracy |
### Long Video Strategy
Videos with transcripts exceeding `ChunkWordLimit` words use a **map-reduce** approach:
1. **Split** — transcript divided into overlapping chunks (200-word overlap preserves context at boundaries)
2. **Map** — each chunk summarized independently
3. **Reduce** — chunk summaries combined into a final coherent summary
This handles hour-long lectures, conference talks, and podcasts without hitting model context limits.
---
## Environment Variable Overrides
You can override `appsettings.json` values with environment variables, useful for CI or Docker:
```bash
export YouTube__ApiKey="your-key"
export LLM__ApiKey="ollama"
dotnet run
```
Note the double-underscore `__` as the section separator (standard .NET configuration convention).

342
SummarizerService.cs Normal file
View file

@ -0,0 +1,342 @@
using OpenAI;
using OpenAI.Chat;
using YoutubeSummarizer.Configuration;
using YoutubeSummarizer.Models;
namespace YoutubeSummarizer.Services;
/// <summary>
/// Sends transcript text to an OpenAI-compatible Chat Completions API and
/// returns a structured summary.
///
/// Long transcripts (word count > ChunkWordLimit) are handled with a
/// "map-reduce" strategy:
/// 1. Split the transcript into overlapping chunks.
/// 2. Summarize each chunk independently (map phase).
/// 3. Combine chunk summaries into a final cohesive summary (reduce phase).
///
/// This keeps individual API calls within model context limits while still
/// producing an accurate summary of long-form content like hour-long lectures.
/// </summary>
public sealed class SummarizerService
{
    // Number of words repeated at the start of each chunk from the end of the
    // previous one, so context carries across chunk boundaries.
    private const int ChunkOverlapWords = 200;

    // A progress dot is printed for roughly every this-many characters streamed back.
    private const int ProgressDotInterval = 20;

    // Sent instead of a blank API key. ApiKeyCredential does not accept an
    // empty key, while keyless local endpoints (e.g. Ollama) accept any value.
    private const string PlaceholderApiKey = "not-needed";

    private readonly LlmSettings _llmSettings;
    private readonly SummarizerSettings _summarizerSettings;
    private readonly ChatClient _chatClient;

    // System prompt used for single-pass and chunk summarization.
    // Keeping it focused on facts and structure produces better summaries
    // than open-ended "summarize this" prompts.
    private const string ChunkSystemPrompt = """
        You are a precise, factual assistant that summarizes YouTube video transcripts.
        When given a transcript segment, produce a concise summary that:
        - Captures the key points, arguments, and conclusions
        - Preserves any specific facts, names, dates, or statistics mentioned
        - Uses bullet points for individual points, then a short paragraph for the overall gist
        - Omits filler words, repeated phrases, and off-topic tangents
        - Does NOT add information not present in the transcript
        Respond with only the summary text, no preamble.
        """;

    // Personal Information Filter — concise relevance-based summary.
    private const string PersonalFilterSystemPrompt = """
        You are a concise, factual assistant that applies a Personal Information Filter
        to YouTube video transcripts. When given a transcript, respond with EXACTLY
        three sections and nothing else:
        Summary A concise, plain-English summary in 1-2 sentences.
        Why it matters Directly evaluate relevance only against these priorities:
        time, finances, health, family, service to others.
        If none apply, say so clearly.
        Priority tag End with a single word verdict: ACT, MONITOR, or IGNORE.
        Constraints:
        - Do not timestamp or number entries.
        - Do not infer user interest beyond what is explicitly provided.
        - Do not expand or add context unless the user requests it.
        - The burden of interest is on the user.
        - Respond with only the three sections above, no preamble.
        """;

    // Personal Filter combine prompt for long transcripts.
    private const string PersonalFilterCombinePrompt = """
        You are a concise, factual assistant. You will receive several partial summaries
        of consecutive segments of a YouTube video, each formatted with Summary,
        Why it matters, and Priority tag sections. Combine them into a single response
        using the same three-section format:
        Summary A concise, plain-English summary of the entire video in 1-2 sentences.
        Why it matters Directly evaluate relevance only against these priorities:
        time, finances, health, family, service to others.
        If none apply, say so clearly.
        Priority tag A single word verdict: ACT, MONITOR, or IGNORE.
        Respond with only these three sections, no preamble.
        """;

    // Used in the reduce phase to combine chunk summaries coherently.
    private const string CombineSystemPrompt = """
        You are a precise, factual assistant. You will receive several partial summaries
        of consecutive segments of a YouTube video. Your task is to combine them into
        a single, coherent, well-structured summary that:
        - Flows as a unified narrative, not as a list of sub-summaries
        - Preserves all key facts, names, dates, and statistics
        - Uses bullet points for supporting details beneath each main topic
        - Omits redundant information that appears across multiple segments
        - Concludes with a 2-3 sentence takeaway paragraph
        Respond with only the combined summary, no preamble.
        """;

    /// <summary>
    /// Creates the service and its chat client for the configured model and endpoint.
    /// </summary>
    /// <param name="llmSettings">Endpoint, key, model, and timeout settings.</param>
    /// <param name="summarizerSettings">Chunking settings.</param>
    public SummarizerService(LlmSettings llmSettings, SummarizerSettings summarizerSettings)
    {
        _llmSettings = llmSettings;
        _summarizerSettings = summarizerSettings;

        // A blank key is a legitimate configuration for local endpoints, but
        // the credential type does not accept one, so substitute a placeholder.
        var apiKey = string.IsNullOrWhiteSpace(llmSettings.ApiKey)
            ? PlaceholderApiKey
            : llmSettings.ApiKey;

        // Initialize the client with the specified model and endpoint.
        // We use the OpenAI SDK's ability to point to any OpenAI-compatible API (like Ollama).
        _chatClient = new ChatClient(
            model: llmSettings.Model,
            credential: new System.ClientModel.ApiKeyCredential(apiKey),
            options: new OpenAIClientOptions
            {
                Endpoint = new Uri(llmSettings.BaseUrl),
                NetworkTimeout = TimeSpan.FromSeconds(llmSettings.TimeoutSeconds)
            });
    }

    // ─────────────────────────────────────────────────────────────────────────
    // Public API
    // ─────────────────────────────────────────────────────────────────────────

    /// <summary>
    /// Produces a <see cref="VideoSummary"/> from the video's metadata and transcript.
    /// Automatically routes to single-pass or chunked strategy based on word count.
    /// </summary>
    /// <param name="metadata">Metadata of the video being summarized.</param>
    /// <param name="transcript">Transcript text, word count, and source.</param>
    /// <param name="mode">Which prompt set to use.</param>
    /// <param name="ct">Cancellation token passed through to the API calls.</param>
    /// <returns>The summary text together with source, quality warning, and model name.</returns>
    public async Task<VideoSummary> SummarizeAsync(
        VideoMetadata metadata,
        VideoTranscript transcript,
        SummaryMode mode = SummaryMode.Standard,
        CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(metadata);
        ArgumentNullException.ThrowIfNull(transcript);

        // Select prompt set based on mode
        var personalFilter = mode == SummaryMode.PersonalFilter;
        var chunkPrompt = personalFilter ? PersonalFilterSystemPrompt : ChunkSystemPrompt;
        var combinePrompt = personalFilter ? PersonalFilterCombinePrompt : CombineSystemPrompt;

        string summaryText;
        if (transcript.WordCount <= _summarizerSettings.ChunkWordLimit)
        {
            // Short video — single API call is sufficient
            summaryText = await SinglePassSummarizeAsync(transcript.Text, metadata, chunkPrompt, ct);
        }
        else
        {
            // Long video — chunk-and-combine strategy
            summaryText = await ChunkedSummarizeAsync(transcript.Text, metadata, chunkPrompt, combinePrompt, ct);
        }

        return new VideoSummary
        {
            Metadata = metadata,
            SummaryText = summaryText,
            TranscriptSource = transcript.Source,
            // Attach a quality warning when the transcript quality is uncertain
            QualityWarning = BuildQualityWarning(transcript.Source),
            ModelUsed = _llmSettings.Model
        };
    }

    // ─────────────────────────────────────────────────────────────────────────
    // Summarization strategies
    // ─────────────────────────────────────────────────────────────────────────

    /// <summary>
    /// Single-pass: sends the entire transcript in one API call.
    /// Best for videos under ~30 minutes (roughly 3000-4000 words).
    /// </summary>
    private async Task<string> SinglePassSummarizeAsync(
        string transcriptText,
        VideoMetadata metadata,
        string systemPrompt,
        CancellationToken ct)
    {
        var userMessage = BuildUserPrompt(metadata, transcriptText);
        return await CallChatCompletionAsync(systemPrompt, userMessage, ct);
    }

    /// <summary>
    /// Map-reduce: splits long transcripts, summarizes each chunk, then combines.
    ///
    /// Overlap: each chunk after the first begins with the last
    /// <see cref="ChunkOverlapWords"/> words of the previous chunk so the model
    /// retains context across chunk boundaries.
    /// </summary>
    private async Task<string> ChunkedSummarizeAsync(
        string transcriptText,
        VideoMetadata metadata,
        string chunkSystemPrompt,
        string combineSystemPrompt,
        CancellationToken ct)
    {
        var words = transcriptText.Split(' ', StringSplitOptions.RemoveEmptyEntries);
        var chunks = SplitIntoChunks(words, _summarizerSettings.ChunkWordLimit, overlapWords: ChunkOverlapWords);
        Console.WriteLine($"\n [Chunking] Transcript split into {chunks.Count} chunks for processing...");

        // Map phase: summarize each chunk in sequence
        // (Parallel would be faster but could hit rate limits — sequential is safer)
        var chunkSummaries = new List<string>(chunks.Count);
        for (int i = 0; i < chunks.Count; i++)
        {
            Console.Write($" [Chunk {i + 1}/{chunks.Count}] Summarizing");
            var chunkText = string.Join(" ", chunks[i]);
            var prompt = $"This is segment {i + 1} of {chunks.Count} from the video \"{metadata.Title}\":\n\n{chunkText}";
            var summary = await CallChatCompletionAsync(chunkSystemPrompt, prompt, ct);
            chunkSummaries.Add(summary);
        }

        // Reduce phase: combine all chunk summaries into one coherent summary
        Console.Write(" [Combine] Merging chunk summaries into final summary");
        var combinedInput = string.Join("\n\n---\n\n",
            chunkSummaries.Select((s, i) => $"Segment {i + 1} summary:\n{s}"));
        var combinePrompt = $"Video: \"{metadata.Title}\" by {metadata.ChannelTitle}\n\n" +
            $"The following are summaries of {chunks.Count} consecutive segments:\n\n{combinedInput}";
        return await CallChatCompletionAsync(combineSystemPrompt, combinePrompt, ct);
    }

    // ─────────────────────────────────────────────────────────────────────────
    // Helpers
    // ─────────────────────────────────────────────────────────────────────────

    /// <summary>
    /// Streams a system + user message pair through the Chat Completions endpoint,
    /// printing progress to the console, and returns the assistant's reply text.
    /// </summary>
    /// <param name="systemPrompt">Instructions for the model.</param>
    /// <param name="userMessage">The content to summarize.</param>
    /// <param name="ct">Cancellation token for the streaming call.</param>
    /// <returns>The concatenated text of all streamed content parts.</returns>
    private async Task<string> CallChatCompletionAsync(
        string systemPrompt,
        string userMessage,
        CancellationToken ct)
    {
        var messages = new List<ChatMessage>
        {
            new SystemChatMessage(systemPrompt),
            new UserChatMessage(userMessage)
        };
        var options = new ChatCompletionOptions
        {
            MaxOutputTokenCount = _llmSettings.MaxTokens
        };

        var sw = System.Diagnostics.Stopwatch.StartNew();
        var fullContent = new System.Text.StringBuilder();
        var dotsPrinted = 0;
        var completed = false;
        try
        {
            var streamingUpdates = _chatClient.CompleteChatStreamingAsync(messages, options, ct);
            await foreach (var update in streamingUpdates)
            {
                foreach (var part in update.ContentUpdate)
                {
                    if (string.IsNullOrEmpty(part.Text))
                    {
                        continue;
                    }

                    if (fullContent.Length == 0)
                    {
                        // First token received!
                        Console.Write(" (working)");
                    }

                    fullContent.Append(part.Text);

                    // Show liveness: one dot each time the output grows past
                    // another ProgressDotInterval characters. Comparing against
                    // a running count (rather than testing Length % interval)
                    // keeps the dots steady even though parts vary in length.
                    var dotsDue = fullContent.Length / ProgressDotInterval;
                    if (dotsDue > dotsPrinted)
                    {
                        Console.Write(".");
                        dotsPrinted = dotsDue;
                    }
                }
            }

            completed = true;
        }
        finally
        {
            sw.Stop();

            // Always terminate the progress line, but only claim success when
            // the stream actually ran to completion.
            Console.WriteLine(completed
                ? $" Done! ({sw.Elapsed.TotalSeconds:F1}s)"
                : $" Aborted. ({sw.Elapsed.TotalSeconds:F1}s)");
        }

        return fullContent.ToString();
    }

    /// <summary>
    /// Builds the user-turn prompt for a single-pass summarization.
    /// Including the title and channel anchors the model to the subject matter,
    /// which reduces hallucination on ambiguous ASR transcripts.
    /// </summary>
    private static string BuildUserPrompt(VideoMetadata metadata, string transcriptText)
    {
        return $"""
            Video title: {metadata.Title}
            Channel: {metadata.ChannelTitle}
            Published: {metadata.PublishedAt:MMMM d, yyyy}
            Duration: {metadata.FormattedDuration}
            Full transcript:
            {transcriptText}
            """;
    }

    /// <summary>
    /// Splits a word array into overlapping chunks of at most <paramref name="chunkSize"/> words.
    /// Each chunk after the first repeats the trailing <paramref name="overlapWords"/> words of
    /// the previous chunk so the model does not miss context at chunk boundaries.
    /// </summary>
    /// <param name="words">The transcript words, in order.</param>
    /// <param name="chunkSize">Maximum words per chunk; must be positive.</param>
    /// <param name="overlapWords">
    /// Requested overlap. Values outside 0..chunkSize-1 are clamped into that range so
    /// that every chunk advances by at least one word.
    /// </param>
    /// <returns>The chunks in order; empty when <paramref name="words"/> is empty.</returns>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="chunkSize"/> is zero or negative.</exception>
    private static List<string[]> SplitIntoChunks(string[] words, int chunkSize, int overlapWords)
    {
        if (chunkSize <= 0)
        {
            throw new ArgumentOutOfRangeException(
                nameof(chunkSize), chunkSize, "Chunk size must be greater than zero.");
        }

        // An overlap as large as the chunk itself would keep the window in
        // place (or move it backwards), looping forever or indexing before the
        // start of the array.
        int overlap = Math.Clamp(overlapWords, 0, chunkSize - 1);

        var chunks = new List<string[]>();
        int start = 0;
        while (start < words.Length)
        {
            int end = Math.Min(start + chunkSize, words.Length);
            chunks.Add(words[start..end]);

            // The chunk that reaches the last word is the final one.
            if (end == words.Length)
            {
                break;
            }

            // Next chunk starts after current chunk minus the overlap window
            start = end - overlap;
        }

        return chunks;
    }

    /// <summary>
    /// Returns a human-readable warning when transcript quality may affect summary accuracy.
    /// Returns null for sources that need no warning.
    /// </summary>
    private static string? BuildQualityWarning(TranscriptSource source) =>
        source switch
        {
            TranscriptSource.AutoGenerated =>
                "⚠ This summary is based on YouTube's auto-generated captions (ASR). " +
                "The transcript may contain errors, especially for technical terms, names, or accented speech.",
            TranscriptSource.MetadataOnly =>
                "⚠ No captions were available. This summary is based on the video's title " +
                "and description only — it may be incomplete or inaccurate.",
            TranscriptSource.CommunityContributed =>
                " This summary is based on community-contributed captions. " +
                "Quality is generally good but not guaranteed.",
            _ => null // OwnerPublished — no warning needed
        };
}

212
TranscriptFileService.cs Normal file
View file

@ -0,0 +1,212 @@
using System.Text;
using YoutubeSummarizer.Models;
namespace YoutubeSummarizer.Services;
/// <summary>
/// Saves video metadata and timestamped transcript to a plain text file.
/// The file is formatted with metadata at the top followed by the transcript
/// organized by timestamps.
/// </summary>
public static class TranscriptFileService
{
/// <summary>
/// Writes a plain-text report for a video: a metadata header, an optional word-wrapped
/// summary, transcript provenance details, and the transcript itself.
/// </summary>
/// <param name="metadata">Video details for the header and the file name.</param>
/// <param name="transcript">Transcript to write; timestamped segments are used when present.</param>
/// <param name="summaryText">Optional summary; omitted from the file when null or whitespace.</param>
/// <param name="outputDirectory">
/// Target directory, created if missing. Defaults to the current working directory.
/// </param>
/// <param name="ct">Cancellation token passed to the file write.</param>
/// <returns>The full path of the file that was written.</returns>
public static async Task<string> SaveAsync(
    VideoMetadata metadata,
    VideoTranscript transcript,
    string? summaryText = null,
    string? outputDirectory = null,
    CancellationToken ct = default)
{
    const string Rule = "════════════════════════════════════════════════════════════════";

    var targetDirectory = outputDirectory ?? Environment.CurrentDirectory;
    Directory.CreateDirectory(targetDirectory);

    // File name = sanitized title + video ID, so two videos with the same title do not collide.
    var filePath = Path.Combine(
        targetDirectory,
        $"{SanitizeFileName(metadata.Title)}_{metadata.VideoId}.txt");

    var report = new StringBuilder();

    // Appends text wrapped to 72 columns, each line prefixed with a single space.
    void AppendWrapped(string text)
    {
        foreach (var line in WordWrap(text, maxWidth: 72))
        {
            report.AppendLine($" {line}");
        }
    }

    // ── Metadata section ──
    report.AppendLine(Rule);
    report.AppendLine(" VIDEO METADATA");
    report.AppendLine(Rule);
    report.AppendLine();
    report.AppendLine($" Title: {metadata.Title}");
    report.AppendLine($" Channel: {metadata.ChannelTitle}");
    report.AppendLine($" Published: {metadata.PublishedAt:MMMM d, yyyy}");
    report.AppendLine($" Duration: {metadata.FormattedDuration}");
    report.AppendLine($" Video ID: {metadata.VideoId}");
    report.AppendLine($" URL: https://youtu.be/{metadata.VideoId}");

    if (!string.IsNullOrWhiteSpace(summaryText))
    {
        report.AppendLine();
        report.AppendLine(" ── SUMMARY ──────────────────────────────────────────────");
        report.AppendLine();
        AppendWrapped(summaryText);
    }
    report.AppendLine();

    // ── Transcript provenance ──
    var provenance = transcript.Source switch
    {
        TranscriptSource.OwnerPublished => "Owner-published captions",
        TranscriptSource.CommunityContributed => "Community-contributed captions",
        TranscriptSource.AutoGenerated => "Auto-generated (ASR)",
        TranscriptSource.MetadataOnly => "Metadata only (no captions)",
        _ => "Unknown"
    };
    report.AppendLine($" Transcript Source: {provenance}");
    report.AppendLine($" Word Count: {transcript.WordCount:N0}");
    report.AppendLine($" Saved: {DateTimeOffset.UtcNow:yyyy-MM-dd HH:mm} UTC");
    report.AppendLine();

    // ── Transcript section ──
    report.AppendLine(Rule);
    report.AppendLine(" TRANSCRIPT");
    report.AppendLine(Rule);
    report.AppendLine();

    if (transcript.Segments.Count == 0)
    {
        // No timestamps available — write the plain text only.
        report.AppendLine(" (No timestamp data available)");
        report.AppendLine();
        AppendWrapped(transcript.Text);
        report.AppendLine();
    }
    else
    {
        // One paragraph per ~30-second block, headed by the block's first timestamp.
        foreach (var block in GroupSegmentsByInterval(transcript.Segments, intervalSeconds: 30))
        {
            report.AppendLine($" [{block[0].FormattedTimestamp}]");
            AppendWrapped(string.Join(" ", block.Select(segment => segment.Text)));
            report.AppendLine();
        }
    }

    report.AppendLine(Rule);
    report.AppendLine(" END OF TRANSCRIPT");
    report.AppendLine(Rule);

    await File.WriteAllTextAsync(filePath, report.ToString(), ct);
    return filePath;
}
// ─────────────────────────────────────────────────────────────────────────
// Helpers
// ─────────────────────────────────────────────────────────────────────────
/// <summary>
/// Buckets caption segments into consecutive time blocks so the saved transcript reads
/// as paragraphs (e.g. one per 30 seconds) rather than one line per subtitle cue.
/// </summary>
/// <param name="segments">Segments in playback order.</param>
/// <param name="intervalSeconds">
/// A new block starts when a segment begins at least this many seconds after the
/// first segment of the current block.
/// </param>
/// <returns>The blocks in order; empty when <paramref name="segments"/> is empty.</returns>
private static List<List<TimestampedSegment>> GroupSegmentsByInterval(
    IReadOnlyList<TimestampedSegment> segments,
    int intervalSeconds)
{
    var blocks = new List<List<TimestampedSegment>>();
    List<TimestampedSegment>? openBlock = null;
    var anchor = TimeSpan.Zero;

    foreach (var segment in segments)
    {
        // The very first segment always opens a block; later ones do so once the interval has elapsed.
        if (openBlock is null || (segment.Start - anchor).TotalSeconds >= intervalSeconds)
        {
            openBlock = new List<TimestampedSegment>();
            blocks.Add(openBlock);
            anchor = segment.Start;
        }

        openBlock.Add(segment);
    }

    return blocks;
}
/// <summary>
/// Turns a video title into a string that is safe to use as a file name.
/// Characters reported by <see cref="Path.GetInvalidFileNameChars"/> become underscores,
/// runs of whitespace/underscores collapse to a single underscore, and the result is
/// capped at 80 UTF-16 code units to keep paths manageable.
/// </summary>
/// <param name="title">The raw title.</param>
/// <returns>
/// The sanitized name. It may be shorter than 80 units when the cut would otherwise
/// fall in the middle of a surrogate pair (e.g. an emoji).
/// </returns>
private static string SanitizeFileName(string title)
{
    const int MaxLength = 80;

    var invalid = Path.GetInvalidFileNameChars();
    var sb = new StringBuilder(title.Length);
    foreach (var ch in title)
    {
        sb.Append(Array.IndexOf(invalid, ch) < 0 ? ch : '_');
    }

    // Replace runs of spaces/underscores with a single underscore
    var result = System.Text.RegularExpressions.Regex.Replace(
        sb.ToString().Trim(), @"[\s_]+", "_");

    if (result.Length <= MaxLength)
        return result;

    // Cutting right after a high surrogate would leave half a character in the name,
    // so step back one unit in that case.
    int cut = char.IsHighSurrogate(result[MaxLength - 1]) ? MaxLength - 1 : MaxLength;
    return result[..cut];
}
/// <summary>
/// Word-wraps text at the specified width, breaking only at spaces.
/// Input is split into paragraphs on line feeds; a blank paragraph yields one empty line.
/// </summary>
/// <param name="text">Text to wrap. Both LF and CRLF line endings are accepted.</param>
/// <param name="maxWidth">Maximum line length in characters.</param>
/// <returns>
/// The wrapped lines, lazily. A single word longer than <paramref name="maxWidth"/> is
/// emitted on its own line without being split.
/// </returns>
private static IEnumerable<string> WordWrap(string text, int maxWidth)
{
    foreach (var rawParagraph in text.Split('\n'))
    {
        // Drop the carriage return of CRLF input so it does not stick to the last word.
        var paragraph = rawParagraph.TrimEnd('\r');

        if (string.IsNullOrWhiteSpace(paragraph))
        {
            yield return string.Empty;
            continue;
        }

        var current = new StringBuilder();
        foreach (var word in paragraph.Split(' ', StringSplitOptions.RemoveEmptyEntries))
        {
            // Only break when there is something on the line; otherwise an over-long
            // first word would produce a spurious empty line ahead of it.
            if (current.Length > 0 && current.Length + 1 + word.Length > maxWidth)
            {
                yield return current.ToString();
                current.Clear();
            }

            if (current.Length > 0) current.Append(' ');
            current.Append(word);
        }

        if (current.Length > 0)
            yield return current.ToString();
    }
}
}

161
VideoModels.cs Normal file
View file

@ -0,0 +1,161 @@
namespace YoutubeSummarizer.Models;
/// <summary>
/// Descriptive metadata for a single video.
/// This is a slim projection — only the fields the summarization workflow needs.
/// </summary>
public sealed class VideoMetadata
{
    /// <summary>The 11-character YouTube video ID parsed from the URL.</summary>
    public required string VideoId { get; init; }

    /// <summary>Full video title as shown on YouTube.</summary>
    public required string Title { get; init; }

    /// <summary>Channel that published the video.</summary>
    public required string ChannelTitle { get; init; }

    /// <summary>UTC publish date of the video.</summary>
    public DateTimeOffset PublishedAt { get; init; }

    /// <summary>
    /// Video duration in ISO 8601 format (e.g. "PT1H4M32S").
    /// Stored raw and parsed on demand by <see cref="FormattedDuration"/>.
    /// </summary>
    public string? Duration { get; init; }

    /// <summary>Video description text, if any.</summary>
    public string? Description { get; init; }

    /// <summary>
    /// Human-readable duration parsed from <see cref="Duration"/>:
    /// <c>H:MM:SS</c> for one hour or more, otherwise <c>M:SS</c>.
    /// Returns "Unknown" when <see cref="Duration"/> is missing or not a valid ISO 8601 duration.
    /// </summary>
    public string FormattedDuration
    {
        get
        {
            if (string.IsNullOrWhiteSpace(Duration))
                return "Unknown";

            TimeSpan span;
            try
            {
                span = System.Xml.XmlConvert.ToTimeSpan(Duration);
            }
            catch (FormatException)
            {
                // A display property should not throw because of a malformed value.
                return "Unknown";
            }

            // Use total hours so durations of a day or more do not wrap back to zero,
            // and always keep a minutes field so short clips read "0:45" rather than "45".
            int totalHours = (int)span.TotalHours;
            return totalHours > 0
                ? $"{totalHours}:{span.Minutes:D2}:{span.Seconds:D2}"
                : $"{span.Minutes}:{span.Seconds:D2}";
        }
    }
}
/// <summary>
/// Describes one caption track of a video. A video may expose several tracks
/// (different languages, manual vs. auto-generated).
/// </summary>
public sealed class CaptionTrack
{
    /// <summary>Identifier of the track.</summary>
    public required string TrackId { get; init; }

    /// <summary>Language tag in BCP-47 form, e.g. "en".</summary>
    public required string Language { get; init; }

    /// <summary>Kind of track: "standard", "asr" (auto-generated) or "forced".</summary>
    public required string TrackKind { get; init; }

    /// <summary>Display name of the track.</summary>
    public required string Name { get; init; }

    /// <summary>
    /// Whether <see cref="TrackKind"/> is "asr" (compared case-insensitively), i.e. the
    /// track came from automatic speech recognition. Such captions tend to be less
    /// reliable — typos, missing punctuation, run-on sentences.
    /// </summary>
    public bool IsAutoGenerated
    {
        get { return TrackKind.Equals("asr", StringComparison.OrdinalIgnoreCase); }
    }
}
/// <summary>
/// The assembled transcript text of a video together with provenance
/// information describing where the text came from.
/// </summary>
public sealed class VideoTranscript
{
    /// <summary>ID of the video this transcript belongs to.</summary>
    public required string VideoId { get; init; }

    /// <summary>The concatenated, cleaned transcript text.</summary>
    public required string Text { get; init; }

    /// <summary>The caption track the text was taken from, or null when there is none.</summary>
    public CaptionTrack? SourceTrack { get; init; }

    /// <summary>
    /// How the transcript was obtained — context for judging how far the
    /// resulting summary can be trusted.
    /// </summary>
    public TranscriptSource Source { get; init; }

    /// <summary>
    /// Individual timestamped caption segments. Defaults to an empty list, which is what
    /// transcripts without timing information (e.g. metadata-only) carry.
    /// </summary>
    public IReadOnlyList<TimestampedSegment> Segments { get; init; } = Array.Empty<TimestampedSegment>();

    /// <summary>
    /// Approximate word count: the number of non-empty tokens obtained by splitting
    /// <see cref="Text"/> on the space character. Recomputed on every access.
    /// </summary>
    public int WordCount
    {
        get
        {
            string[] tokens = Text.Split(' ', StringSplitOptions.RemoveEmptyEntries);
            return tokens.Length;
        }
    }
}
/// <summary>
/// One caption cue with its position in the video.
/// Used when writing the transcript to a file with timestamps.
/// </summary>
public sealed class TimestampedSegment
{
    /// <summary>Offset of the cue from the beginning of the video.</summary>
    public TimeSpan Start { get; init; }

    /// <summary>How long the cue is shown.</summary>
    public TimeSpan Duration { get; init; }

    /// <summary>The caption text of the cue.</summary>
    public required string Text { get; init; }

    /// <summary>
    /// <see cref="Start"/> rendered as <c>hh:mm:ss</c> once the offset reaches one hour,
    /// and as <c>mm:ss</c> before that.
    /// </summary>
    public string FormattedTimestamp
    {
        get
        {
            string format = Start.TotalHours >= 1 ? @"hh\:mm\:ss" : @"mm\:ss";
            return Start.ToString(format);
        }
    }
}
/// <summary>
/// How a transcript was obtained. Members are listed from most to least reliable.
/// This maps directly to the caption quality transparency layer discussed in LIKA.
/// </summary>
public enum TranscriptSource
{
    /// <summary>Human-reviewed caption track provided by the video owner.</summary>
    OwnerPublished = 0,

    /// <summary>Community-contributed captions (YouTube retired this but tracks may exist).</summary>
    CommunityContributed = 1,

    /// <summary>YouTube's automatic speech recognition — less reliable.</summary>
    AutoGenerated = 2,

    /// <summary>No captions available; summary based on metadata/description only.</summary>
    MetadataOnly = 3
}
/// <summary>
/// Selects the summarization prompt style.
/// </summary>
public enum SummaryMode
{
    /// <summary>Default detailed summary with bullet points and takeaways.</summary>
    Standard = 0,

    /// <summary>
    /// Personal Information Filter — a brief 1–2 sentence summary, a relevance
    /// evaluation against personal priorities (time, finances, health, family,
    /// service to others), and a single-word verdict: ACT, MONITOR, or IGNORE.
    /// </summary>
    PersonalFilter = 1
}
/// <summary>
/// The final deliverable: a structured summary of a YouTube video.
/// All properties are init-only, so an instance is fixed once constructed.
/// </summary>
public sealed class VideoSummary
{
/// <summary>Metadata of the video that was summarized.</summary>
public required VideoMetadata Metadata { get; init; }
/// <summary>The generated summary text.</summary>
public required string SummaryText { get; init; }
/// <summary>How the transcript behind this summary was obtained.</summary>
public required TranscriptSource TranscriptSource { get; init; }
/// <summary>
/// Warning shown when the summary is based on low-quality or missing transcript data.
/// Null when the source is reliable.
/// </summary>
public string? QualityWarning { get; init; }
/// <summary>Model used to generate this summary.</summary>
public required string ModelUsed { get; init; }
/// <summary>
/// When the summary was produced. Defaults to the UTC time at which the object
/// is constructed unless the initializer sets it explicitly.
/// </summary>
public DateTimeOffset GeneratedAt { get; init; } = DateTimeOffset.UtcNow;
}

518
YouTubeService.cs Normal file
View file

@ -0,0 +1,518 @@
using System.Diagnostics;
using System.Text.Json;
using YoutubeSummarizer.Models;
namespace YoutubeSummarizer.Services;
/// <summary>
/// Uses yt-dlp (https://github.com/yt-dlp/yt-dlp) to retrieve video metadata
/// and download caption tracks. No YouTube API key required.
///
/// yt-dlp is the de-facto standard tool for reliably extracting video
/// information and subtitles from YouTube. It must be installed and
/// available on PATH (e.g. pip install yt-dlp).
/// </summary>
public sealed class YouTubeService
{
/// <summary>
/// HTTP client supplied by the caller. No other member of this class currently
/// reads it; metadata and subtitles are fetched by the external yt-dlp process.
/// </summary>
private readonly HttpClient _httpClient;

/// <summary>Creates the service.</summary>
/// <param name="httpClient">HTTP client instance stored by the service.</param>
public YouTubeService(HttpClient httpClient) => _httpClient = httpClient;
// ─────────────────────────────────────────────────────────────────────────
// Public API
// ─────────────────────────────────────────────────────────────────────────
/// <summary>
/// Parses a YouTube video ID from any common URL format, or from a bare ID.
/// Handles: watch?v=, youtu.be/, /embed/, /shorts/, /v/ and /live/.
/// </summary>
/// <param name="url">A video URL or an 11-character video ID; surrounding whitespace is ignored.</param>
/// <returns>
/// The 11-character ID, or <c>null</c> when no well-formed ID is found. Every candidate
/// is validated (11 characters from A–Z, a–z, 0–9, '-' and '_') because the ID is later
/// embedded in file and directory names and in the URL handed to yt-dlp.
/// </returns>
public static string? ExtractVideoId(string url)
{
    // Normalize — strip whitespace the user may have pasted
    url = url.Trim();

    if (Uri.TryCreate(url, UriKind.Absolute, out var uri))
    {
        var segments = uri.AbsolutePath.Split('/', StringSplitOptions.RemoveEmptyEntries);

        // youtu.be short links: https://youtu.be/VIDEO_ID
        // Match the host exactly (or as a subdomain) rather than by substring.
        bool isShortLink =
            uri.Host.Equals("youtu.be", StringComparison.OrdinalIgnoreCase) ||
            uri.Host.EndsWith(".youtu.be", StringComparison.OrdinalIgnoreCase);
        if (isShortLink)
            return segments.Length > 0 && IsValidVideoId(segments[0]) ? segments[0] : null;

        // Standard watch URLs: ?v=VIDEO_ID
        var query = System.Web.HttpUtility.ParseQueryString(uri.Query);
        if (query["v"] is { } vParam && IsValidVideoId(vParam))
            return vParam;

        // Path-based URLs: /embed/VIDEO_ID, /shorts/VIDEO_ID, /v/VIDEO_ID, /live/VIDEO_ID
        for (int i = 0; i < segments.Length - 1; i++)
        {
            if ((segments[i] is "embed" or "shorts" or "v" or "live") && IsValidVideoId(segments[i + 1]))
                return segments[i + 1];
        }
    }

    // Raw ID passed directly
    return IsValidVideoId(url) ? url : null;
}

/// <summary>Checks that a candidate is exactly 11 characters of [A-Za-z0-9_-].</summary>
private static bool IsValidVideoId(string candidate)
{
    if (candidate.Length != 11)
        return false;

    foreach (var c in candidate)
    {
        if (!char.IsAsciiLetterOrDigit(c) && c != '-' && c != '_')
            return false;
    }
    return true;
}
/// <summary>
/// Fetches metadata for a video by running <c>yt-dlp --dump-json --no-download</c>
/// and reading the JSON it prints. No API key is required.
/// </summary>
/// <param name="videoId">The video ID appended to the watch URL passed to yt-dlp.</param>
/// <param name="ct">
/// Cancels the wait for yt-dlp. On cancellation the child process is killed so it is
/// not left running, and the <see cref="OperationCanceledException"/> is rethrown.
/// </param>
/// <returns>
/// The parsed metadata, or <c>null</c> when yt-dlp exits with a non-zero code,
/// prints nothing, or prints output that cannot be interpreted.
/// </returns>
public async Task<VideoMetadata?> GetVideoMetadataAsync(string videoId, CancellationToken ct = default)
{
    var psi = new ProcessStartInfo
    {
        FileName = "yt-dlp",
        RedirectStandardOutput = true,
        RedirectStandardError = true,
        UseShellExecute = false,
        CreateNoWindow = true,
    };
    psi.ArgumentList.Add("--dump-json");
    psi.ArgumentList.Add("--no-download");
    psi.ArgumentList.Add($"https://www.youtube.com/watch?v={videoId}");

    using var process = new Process { StartInfo = psi };
    process.Start();

    // Drain both pipes while waiting so a full pipe buffer cannot stall the child.
    var stdoutTask = process.StandardOutput.ReadToEndAsync(ct);
    var stderrTask = process.StandardError.ReadToEndAsync(ct);

    try
    {
        await process.WaitForExitAsync(ct);
    }
    catch (OperationCanceledException)
    {
        // The caller gave up: do not leave an orphaned yt-dlp behind.
        try
        {
            if (!process.HasExited)
                process.Kill(entireProcessTree: true);
        }
        catch (Exception ex) when (ex is InvalidOperationException
                                      or System.ComponentModel.Win32Exception
                                      or NotSupportedException)
        {
            // Already exited or could not be signalled — nothing more can be done here.
        }
        throw;
    }

    if (process.ExitCode != 0)
        return null;

    var json = await stdoutTask;
    if (string.IsNullOrWhiteSpace(json))
        return null;

    try
    {
        using var doc = JsonDocument.Parse(json);
        var root = doc.RootElement;
        if (root.ValueKind != JsonValueKind.Object)
            return null;

        var title = ReadString(root, "title") ?? "(no title)";
        var channel = ReadString(root, "channel") ?? "(unknown channel)";
        var description = ReadString(root, "description");

        // yt-dlp returns duration in seconds
        TimeSpan? duration = null;
        if (root.TryGetProperty("duration", out var dur) && dur.ValueKind == JsonValueKind.Number)
            duration = TimeSpan.FromSeconds(dur.GetDouble());

        // Upload date comes as "YYYYMMDD". Parse with the invariant culture so a
        // non-Gregorian default calendar on the host cannot shift the year.
        DateTimeOffset publishedAt = DateTimeOffset.MinValue;
        if (ReadString(root, "upload_date") is { } dateStr
            && DateTime.TryParseExact(dateStr, "yyyyMMdd",
                System.Globalization.CultureInfo.InvariantCulture,
                System.Globalization.DateTimeStyles.None, out var parsed))
        {
            publishedAt = new DateTimeOffset(parsed, TimeSpan.Zero);
        }

        // Build ISO 8601 duration string for FormattedDuration compatibility
        string? isoDuration = null;
        if (duration.HasValue)
        {
            var ts = duration.Value;
            isoDuration = $"PT{(int)ts.TotalHours}H{ts.Minutes}M{ts.Seconds}S";
        }

        return new VideoMetadata
        {
            VideoId = videoId,
            Title = title,
            ChannelTitle = channel,
            PublishedAt = publishedAt,
            Duration = isoDuration,
            Description = description
        };
    }
    catch (Exception ex) when (ex is JsonException
                                  or InvalidOperationException
                                  or OverflowException
                                  or ArgumentException)
    {
        // Unparseable or unexpectedly shaped output is reported as "no metadata".
        return null;
    }

    // Reads a string property; a missing, null or non-string value yields null instead of throwing.
    static string? ReadString(JsonElement obj, string name) =>
        obj.TryGetProperty(name, out var value) && value.ValueKind == JsonValueKind.String
            ? value.GetString()
            : null;
}
/// <summary>
/// Retrieves the best available transcript for the video.
///
/// Subtitles are downloaded through yt-dlp (manual captions first, auto-generated
/// as a fallback) and returned as text plus timestamped segments. When no caption
/// text is obtained, a metadata-only transcript built from the video description
/// is returned instead.
/// </summary>
/// <param name="metadata">Metadata of the video whose transcript is wanted.</param>
/// <param name="ct">Cancellation token passed to the subtitle download.</param>
/// <returns>A transcript labelled with how it was obtained.</returns>
public async Task<VideoTranscript> GetTranscriptAsync(
    VideoMetadata metadata,
    CancellationToken ct = default)
{
    var (captionText, cues, autoGenerated) =
        await DownloadSubtitlesWithTimestampsAsync(metadata.VideoId, ct);

    // No captions at all — fall back to the description text
    if (string.IsNullOrWhiteSpace(captionText))
        return BuildMetadataOnlyTranscript(metadata);

    string trackKind;
    string trackName;
    TranscriptSource origin;
    if (autoGenerated)
    {
        trackKind = "asr";
        trackName = "Auto-generated (en)";
        origin = TranscriptSource.AutoGenerated;
    }
    else
    {
        trackKind = "standard";
        trackName = "English";
        origin = TranscriptSource.OwnerPublished;
    }

    return new VideoTranscript
    {
        VideoId = metadata.VideoId,
        Text = captionText,
        Segments = cues,
        SourceTrack = new CaptionTrack
        {
            TrackId = "yt-dlp",
            Language = "en",
            TrackKind = trackKind,
            Name = trackName
        },
        Source = origin
    };
}
// ─────────────────────────────────────────────────────────────────────────
// Private helpers
// ─────────────────────────────────────────────────────────────────────────
/// <summary>
/// Text-only variant of <see cref="DownloadSubtitlesWithTimestampsAsync"/>:
/// runs the same download and drops the timestamped segments.
/// </summary>
/// <param name="videoId">ID of the video whose subtitles are wanted.</param>
/// <param name="ct">Cancellation token passed to the download.</param>
/// <returns>The transcript text (null when none was found) and whether it was auto-generated.</returns>
private static async Task<(string? Text, bool IsAuto)> DownloadSubtitlesWithYtDlpAsync(
    string videoId,
    CancellationToken ct)
{
    var (captionText, _, autoGenerated) = await DownloadSubtitlesWithTimestampsAsync(videoId, ct);
    return (captionText, autoGenerated);
}
/// <summary>
/// Downloads subtitles into a throw-away temp directory and returns both the plain
/// text and the timestamped segments. Manual (human-written) subtitles are tried
/// first; auto-generated ones are tried only if that yields no text.
/// </summary>
/// <param name="videoId">ID of the video; also embedded in the temp directory name.</param>
/// <param name="ct">Cancellation token passed to each yt-dlp run.</param>
/// <returns>
/// Text, segments and an auto-generated flag; text is null and the segment list is
/// empty when neither attempt produced any.
/// </returns>
private static async Task<(string? Text, List<TimestampedSegment> Segments, bool IsAuto)> DownloadSubtitlesWithTimestampsAsync(
    string videoId,
    CancellationToken ct)
{
    var workDir = Path.Combine(Path.GetTempPath(), $"ytsumm_{videoId}_{Guid.NewGuid():N}");
    Directory.CreateDirectory(workDir);

    try
    {
        // First pass: manual subtitles only; second pass: auto-generated only.
        foreach (var useAutoCaptions in new[] { false, true })
        {
            var (captionText, cues) = await RunYtDlpSubtitleWithTimestampsAsync(
                videoId, workDir, writeSub: !useAutoCaptions, writeAutoSub: useAutoCaptions, ct);

            if (!string.IsNullOrWhiteSpace(captionText))
                return (captionText, cues, useAutoCaptions);
        }

        return (null, new List<TimestampedSegment>(), false);
    }
    finally
    {
        // Removing the temp directory is a courtesy; failure to do so must not mask the result.
        try { Directory.Delete(workDir, recursive: true); } catch { /* best effort */ }
    }
}
/// <summary>
/// Runs a single yt-dlp invocation that downloads English subtitles (no video) into
/// <paramref name="tempDir"/>, then parses the first subtitle file found.
/// </summary>
/// <param name="videoId">ID of the video, appended to the watch URL passed to yt-dlp.</param>
/// <param name="tempDir">Existing directory that receives the subtitle files.</param>
/// <param name="writeSub">Adds <c>--write-sub</c> (manual subtitles).</param>
/// <param name="writeAutoSub">Adds <c>--write-auto-sub</c> (auto-generated subtitles).</param>
/// <param name="ct">
/// Cancels the wait for yt-dlp and the file read. On cancellation the child process
/// is killed and the <see cref="OperationCanceledException"/> is rethrown.
/// </param>
/// <returns>
/// The parsed text and segments, or a null text with an empty list when yt-dlp fails,
/// writes no subtitle file, or writes an empty one.
/// </returns>
private static async Task<(string? Text, List<TimestampedSegment> Segments)> RunYtDlpSubtitleWithTimestampsAsync(
    string videoId,
    string tempDir,
    bool writeSub,
    bool writeAutoSub,
    CancellationToken ct)
{
    // Ordinal comparison: file extensions are identifiers, not linguistic text.
    static bool HasExtension(string path, string extension) =>
        path.EndsWith(extension, StringComparison.OrdinalIgnoreCase);

    static bool IsSubtitleFile(string path) =>
        HasExtension(path, ".srv1") || HasExtension(path, ".vtt") || HasExtension(path, ".srt");

    // Remove leftovers from a previous attempt in this directory — every format that
    // is picked up below, so a stale file can never be mistaken for a fresh result.
    foreach (var stale in Directory.GetFiles(tempDir).Where(IsSubtitleFile))
        File.Delete(stale);

    var psi = new ProcessStartInfo
    {
        FileName = "yt-dlp",
        RedirectStandardOutput = true,
        RedirectStandardError = true,
        UseShellExecute = false,
        CreateNoWindow = true,
    };
    psi.ArgumentList.Add("--skip-download");
    psi.ArgumentList.Add("--sub-lang");
    psi.ArgumentList.Add("en,en-US,en-GB,en.*");
    psi.ArgumentList.Add("--sub-format");
    psi.ArgumentList.Add("srv1/vtt/best");
    psi.ArgumentList.Add("-o");
    psi.ArgumentList.Add(Path.Combine(tempDir, "%(id)s"));
    if (writeSub)
        psi.ArgumentList.Add("--write-sub");
    if (writeAutoSub)
        psi.ArgumentList.Add("--write-auto-sub");
    psi.ArgumentList.Add($"https://www.youtube.com/watch?v={videoId}");

    using var process = new Process { StartInfo = psi };
    process.Start();

    // Drain both pipes while waiting so a full pipe buffer cannot stall the child.
    var stdoutTask = process.StandardOutput.ReadToEndAsync(ct);
    var stderrTask = process.StandardError.ReadToEndAsync(ct);

    try
    {
        await process.WaitForExitAsync(ct);
    }
    catch (OperationCanceledException)
    {
        // The caller gave up: do not leave an orphaned yt-dlp behind.
        try
        {
            if (!process.HasExited)
                process.Kill(entireProcessTree: true);
        }
        catch (Exception ex) when (ex is InvalidOperationException
                                      or System.ComponentModel.Win32Exception
                                      or NotSupportedException)
        {
            // Already exited or could not be signalled — nothing more can be done here.
        }
        throw;
    }

    if (process.ExitCode != 0)
        return (null, new List<TimestampedSegment>());

    // Prefer srv1; break ties by path so the choice is deterministic when several
    // language variants were downloaded.
    var subFiles = Directory.GetFiles(tempDir)
        .Where(IsSubtitleFile)
        .OrderBy(f => HasExtension(f, ".srv1") ? 0 : 1)
        .ThenBy(f => f, StringComparer.Ordinal)
        .ToList();
    if (subFiles.Count == 0)
        return (null, new List<TimestampedSegment>());

    var content = await File.ReadAllTextAsync(subFiles[0], ct);
    if (string.IsNullOrWhiteSpace(content))
        return (null, new List<TimestampedSegment>());

    return HasExtension(subFiles[0], ".srv1")
        ? ParseTimedTextXmlWithTimestamps(content)
        : ParseVttOrSrtWithTimestamps(content);
}
/// <summary>
/// Parses YouTube's srv1 timed-text XML into plain text and timestamped segments.
/// Each <c>text</c> element becomes one segment: its content is HTML-decoded and
/// whitespace-collapsed, and its <c>start</c>/<c>dur</c> attributes (seconds) supply the timing.
/// </summary>
/// <param name="xml">The srv1 document.</param>
/// <returns>
/// The space-joined caption text and the segments. If anything fails while parsing,
/// the input is returned unchanged as the text, with an empty segment list.
/// </returns>
private static (string Text, List<TimestampedSegment> Segments) ParseTimedTextXmlWithTimestamps(string xml)
{
    // Reads a seconds-valued attribute; a missing or unparseable value counts as zero.
    static TimeSpan ReadSeconds(System.Xml.Linq.XElement node, string attributeName)
    {
        var raw = node.Attribute(attributeName)?.Value;
        if (double.TryParse(raw, System.Globalization.NumberStyles.Float,
                System.Globalization.CultureInfo.InvariantCulture, out var seconds))
        {
            return TimeSpan.FromSeconds(seconds);
        }
        return TimeSpan.Zero;
    }

    try
    {
        var cues = new List<TimestampedSegment>();

        foreach (var node in System.Xml.Linq.XDocument.Parse(xml).Descendants("text"))
        {
            var caption = System.Text.RegularExpressions.Regex
                .Replace(System.Web.HttpUtility.HtmlDecode(node.Value), @"\s+", " ")
                .Trim();
            if (caption.Length == 0)
                continue;

            cues.Add(new TimestampedSegment
            {
                Start = ReadSeconds(node, "start"),
                Duration = ReadSeconds(node, "dur"),
                Text = caption
            });
        }

        return (string.Join(" ", cues.ConvertAll(cue => cue.Text)), cues);
    }
    catch
    {
        return (xml, new List<TimestampedSegment>());
    }
}
/// <summary>
/// Text-only variant of <see cref="ParseTimedTextXmlWithTimestamps"/>: parses srv1
/// timed-text XML and returns just the plain text.
///
/// The XML structure looks like:
/// &lt;transcript&gt;
/// &lt;text start="0.5" dur="2.1"&gt;Hello world&lt;/text&gt;
/// ...
/// &lt;/transcript&gt;
/// </summary>
/// <param name="xml">The srv1 document.</param>
/// <returns>The caption text.</returns>
private static string ParseTimedTextXml(string xml) =>
    ParseTimedTextXmlWithTimestamps(xml).Text;
/// <summary>
/// Parses VTT or SRT subtitle content into plain text and timestamped segments.
/// Header lines, numeric cue identifiers and inline markup tags are dropped; the lines
/// between one timing line and the next blank line (or next timing line) form one segment.
/// </summary>
/// <param name="content">The full text of a .vtt or .srt file.</param>
/// <returns>
/// The space-joined caption text and the segments, with consecutive segments carrying
/// identical text collapsed into the first one (common in VTT).
/// </returns>
private static (string Text, List<TimestampedSegment> Segments) ParseVttOrSrtWithTimestamps(string content)
{
    var segments = new List<TimestampedSegment>();
    var pendingLines = new List<string>();
    TimeSpan cueStart = TimeSpan.Zero;
    TimeSpan cueEnd = TimeSpan.Zero;

    // Emits the buffered caption lines as one segment, if there are any.
    void FlushCue()
    {
        if (pendingLines.Count == 0)
            return;

        segments.Add(new TimestampedSegment
        {
            Start = cueStart,
            Duration = cueEnd - cueStart,
            Text = string.Join(" ", pendingLines)
        });
        pendingLines.Clear();
    }

    foreach (var rawLine in content.Split('\n'))
    {
        var line = rawLine.Trim();

        // A blank line terminates the current cue.
        if (line.Length == 0)
        {
            FlushCue();
            continue;
        }

        // Skip headers, metadata and numeric cue identifiers
        if (line.StartsWith("WEBVTT", StringComparison.Ordinal) ||
            line.StartsWith("NOTE", StringComparison.Ordinal) ||
            line.StartsWith("Kind:", StringComparison.Ordinal) ||
            line.StartsWith("Language:", StringComparison.Ordinal) ||
            System.Text.RegularExpressions.Regex.IsMatch(line, @"^\d+$"))
        {
            continue;
        }

        // Timing line: "00:01:23.456 --> 00:01:27.890" (VTT) or "00:01:23,456 --> 00:01:27,890" (SRT).
        // The fractional part must accept ',' as well as '.', otherwise SRT timing lines
        // fail to match and end up in the transcript as caption text.
        var tsMatch = System.Text.RegularExpressions.Regex.Match(line,
            @"^(\d{2,}:\d{2}[:\.][\d\.,]+)\s*-->\s*(\d{2,}:\d{2}[:\.][\d\.,]+)");
        if (tsMatch.Success)
        {
            // A new timing line also ends the previous cue (files without blank separators).
            FlushCue();
            cueStart = ParseVttTimestamp(tsMatch.Groups[1].Value);
            cueEnd = ParseVttTimestamp(tsMatch.Groups[2].Value);
            continue;
        }

        // Content line — strip markup tags and decode entities
        var stripped = System.Text.RegularExpressions.Regex.Replace(line, @"<[^>]+>", "");
        var decoded = System.Web.HttpUtility.HtmlDecode(stripped).Trim();
        if (!string.IsNullOrEmpty(decoded))
            pendingLines.Add(decoded);
    }

    // Flush the last cue (file may not end with a blank line)
    FlushCue();

    // Deduplicate consecutive identical text segments (common in VTT)
    var deduped = new List<TimestampedSegment>();
    string? prevText = null;
    foreach (var seg in segments)
    {
        if (seg.Text != prevText)
            deduped.Add(seg);
        prevText = seg.Text;
    }

    var plainText = string.Join(" ", deduped.Select(s => s.Text));
    return (plainText, deduped);
}
/// <summary>
/// Parses a VTT/SRT timestamp (<c>HH:MM:SS.mmm</c>, <c>HH:MM:SS,mmm</c> or <c>MM:SS.mmm</c>)
/// into a <see cref="TimeSpan"/>.
/// </summary>
/// <param name="ts">The timestamp text.</param>
/// <returns>
/// The parsed offset, rounded to the nearest millisecond. Text that does not consist of
/// two or three colon-separated parts yields <see cref="TimeSpan.Zero"/>; an individual
/// part that fails to parse counts as zero.
/// </returns>
private static TimeSpan ParseVttTimestamp(string ts)
{
    // Normalize: VTT uses "." for ms, SRT uses "," — handle both
    var parts = ts.Replace(',', '.').Split(':');
    if (parts.Length != 2 && parts.Length != 3)
        return TimeSpan.Zero;

    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    int hours = 0;
    if (parts.Length == 3)
        int.TryParse(parts[0], System.Globalization.NumberStyles.Integer, invariant, out hours);

    int.TryParse(parts[^2], System.Globalization.NumberStyles.Integer, invariant, out var minutes);
    double.TryParse(parts[^1], System.Globalization.NumberStyles.Float, invariant, out var seconds);
    if (!double.IsFinite(seconds))
        seconds = 0;

    // Round rather than truncate: "05.100" is stored as 5.0999…, and truncating the
    // fractional part would give 99 ms instead of 100.
    long milliseconds = (long)Math.Round(seconds * 1000.0, MidpointRounding.AwayFromZero);

    return new TimeSpan(0, hours, minutes, 0)
         + TimeSpan.FromTicks(milliseconds * TimeSpan.TicksPerMillisecond);
}
/// <summary>
/// Text-only variant of <see cref="ParseVttOrSrtWithTimestamps"/>: parses VTT or SRT
/// content and returns just the plain text, without timing information.
/// </summary>
/// <param name="content">The full text of a .vtt or .srt file.</param>
/// <returns>The caption text.</returns>
private static string ParseVttOrSrt(string content) =>
    ParseVttOrSrtWithTimestamps(content).Text;
/// <summary>
/// Builds a stand-in "transcript" for a video that has no captions, using its title,
/// channel and description. The result is marked <see cref="TranscriptSource.MetadataOnly"/>
/// and has no source track, so consumers can flag the summary as based on far less information.
/// </summary>
/// <param name="metadata">Metadata of the video.</param>
/// <returns>A transcript whose text is derived from the metadata alone.</returns>
private static VideoTranscript BuildMetadataOnlyTranscript(VideoMetadata metadata)
{
    string body;
    if (string.IsNullOrWhiteSpace(metadata.Description))
    {
        // Nothing but the title is known.
        body = $"No transcript or description available for: {metadata.Title}";
    }
    else
    {
        body = $"Video title: {metadata.Title}\n\nChannel: {metadata.ChannelTitle}\n\nDescription:\n{metadata.Description}";
    }

    return new VideoTranscript
    {
        VideoId = metadata.VideoId,
        Text = body,
        SourceTrack = null,
        Source = TranscriptSource.MetadataOnly
    };
}
}

36
YoutubeSummarizer.csproj Normal file
View file

@ -0,0 +1,36 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net10.0</TargetFramework>
<Nullable>enable</Nullable>
<ImplicitUsings>enable</ImplicitUsings>
<RootNamespace>YoutubeSummarizer</RootNamespace>
<AssemblyName>YoutubeSummarizer</AssemblyName>
</PropertyGroup>
<ItemGroup>
<None Update="appsettings.json">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
</ItemGroup>
<ItemGroup>
<!-- OpenAI .NET SDK (official) -->
<PackageReference Include="OpenAI" Version="2.1.0" />
<!-- Microsoft.Extensions for config/DI patterns without full host overhead -->
<PackageReference Include="Microsoft.Extensions.Configuration" Version="10.0.0" />
<PackageReference Include="Microsoft.Extensions.Configuration.Json" Version="10.0.0" />
<PackageReference Include="Microsoft.Extensions.Configuration.EnvironmentVariables" Version="10.0.0" />
<!-- Binder provides the .Bind(object) extension method on IConfiguration -->
<PackageReference Include="Microsoft.Extensions.Configuration.Binder" Version="10.0.0" />
<PackageReference Include="Microsoft.Extensions.DependencyInjection" Version="10.0.0" />
<!-- Http provides AddHttpClient() / IHttpClientFactory -->
<PackageReference Include="Microsoft.Extensions.Http" Version="10.0.0" />
<!-- Polly for resilient HTTP retry logic -->
<PackageReference Include="Polly" Version="8.4.1" />
</ItemGroup>
</Project>

14
appsettings.json Normal file
View file

@ -0,0 +1,14 @@
{
"LLM": {
"BaseUrl": "http://localhost:11434/v1",
"ApiKey": "ollama",
"Model": "qwen3:14b",
"MaxTokens": 1500,
"TimeoutSeconds": 600
},
"Summarizer": {
"ChunkWordLimit": 1500,
"ShowTranscript": false
}
}

24
summarize.sln Normal file
View file

@ -0,0 +1,24 @@
Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 17
VisualStudioVersion = 17.5.2.0
MinimumVisualStudioVersion = 10.0.40219.1
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "YoutubeSummarizer", "YoutubeSummarizer.csproj", "{2364E226-41E1-8549-7D9A-3C959F71FD8A}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Release|Any CPU = Release|Any CPU
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{2364E226-41E1-8549-7D9A-3C959F71FD8A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{2364E226-41E1-8549-7D9A-3C959F71FD8A}.Debug|Any CPU.Build.0 = Debug|Any CPU
{2364E226-41E1-8549-7D9A-3C959F71FD8A}.Release|Any CPU.ActiveCfg = Release|Any CPU
{2364E226-41E1-8549-7D9A-3C959F71FD8A}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {58A79D7B-0ADD-4677-A65B-B4E6E38D9AFE}
EndGlobalSection
EndGlobal