using OpenAI;
using OpenAI.Chat;
using YoutubeSummarizer.Configuration;
using YoutubeSummarizer.Models;
namespace YoutubeSummarizer.Services;
/// <summary>
/// Sends transcript text to OpenAI's Chat Completions API and returns a
/// structured summary.
/// </summary>
/// <remarks>
/// <para>
/// Long transcripts (word count > ChunkWordLimit) are handled with a
/// "map-reduce" strategy:
/// </para>
/// <list type="number">
/// <item><description>Split the transcript into overlapping chunks.</description></item>
/// <item><description>Summarize each chunk independently (map phase).</description></item>
/// <item><description>Combine chunk summaries into a final cohesive summary (reduce phase).</description></item>
/// </list>
/// <para>
/// This keeps individual API calls within model context limits while still
/// producing an accurate summary of long-form content like hour-long lectures.
/// </para>
/// </remarks>
public sealed class SummarizerService
{
// LLM settings: supplies the output-token cap for each request and the model
// name reported in every returned summary.
private readonly LlmSettings _llmSettings;
// Supplies ChunkWordLimit, the word-count threshold that decides between the
// single-pass and the chunked strategy (and the chunk size for the latter).
private readonly SummarizerSettings _summarizerSettings;
// Chat client created once in the constructor and reused for every completion call.
private readonly ChatClient _chatClient;
// System prompt for standard-mode summarization. It is used both for the
// single-pass call and for every per-chunk call of the map phase.
// Keeping it focused on facts and structure produces better summaries
// than open-ended "summarize this" prompts.
private const string ChunkSystemPrompt = """
You are a precise, factual assistant that summarizes YouTube video transcripts.
When given a transcript segment, produce a concise summary that:
- Captures the key points, arguments, and conclusions
- Preserves any specific facts, names, dates, or statistics mentioned
- Uses bullet points for individual points, then a short paragraph for the overall gist
- Omits filler words, repeated phrases, and off-topic tangents
- Does NOT add information not present in the transcript
Respond with only the summary text, no preamble.
""";
// Personal Information Filter — concise relevance-based summary.
// Selected instead of ChunkSystemPrompt when the mode is SummaryMode.PersonalFilter;
// it asks for exactly three sections: Summary, Why it matters, Priority tag.
private const string PersonalFilterSystemPrompt = """
You are a concise, factual assistant that applies a Personal Information Filter
to YouTube video transcripts. When given a transcript, respond with EXACTLY
three sections and nothing else:
Summary – A concise, plain-English summary in 1–2 sentences.
Why it matters – Directly evaluate relevance only against these priorities:
time, finances, health, family, service to others.
If none apply, say so clearly.
Priority tag – End with a single word verdict: ACT, MONITOR, or IGNORE.
Constraints:
- Do not timestamp or number entries.
- Do not infer user interest beyond what is explicitly provided.
- Do not expand or add context unless the user requests it.
- The burden of interest is on the user.
- Respond with only the three sections above, no preamble.
""";
// Personal Filter combine prompt for long transcripts.
// Used in the reduce phase when the mode is SummaryMode.PersonalFilter: it merges
// the per-segment three-section responses into one response of the same format.
private const string PersonalFilterCombinePrompt = """
You are a concise, factual assistant. You will receive several partial summaries
of consecutive segments of a YouTube video, each formatted with Summary,
Why it matters, and Priority tag sections. Combine them into a single response
using the same three-section format:
Summary – A concise, plain-English summary of the entire video in 1–2 sentences.
Why it matters – Directly evaluate relevance only against these priorities:
time, finances, health, family, service to others.
If none apply, say so clearly.
Priority tag – A single word verdict: ACT, MONITOR, or IGNORE.
Respond with only these three sections, no preamble.
""";
// Standard-mode reduce-phase prompt: combines the per-chunk summaries into one
// coherent summary. Only used on the chunked (long transcript) path.
private const string CombineSystemPrompt = """
You are a precise, factual assistant. You will receive several partial summaries
of consecutive segments of a YouTube video. Your task is to combine them into
a single, coherent, well-structured summary that:
- Flows as a unified narrative, not as a list of sub-summaries
- Preserves all key facts, names, dates, and statistics
- Uses bullet points for supporting details beneath each main topic
- Omits redundant information that appears across multiple segments
- Concludes with a 2–3 sentence takeaway paragraph
Respond with only the combined summary, no preamble.
""";
/// <summary>
/// Creates the service and builds the chat client that every request will reuse.
/// </summary>
/// <param name="llmSettings">
/// Model name, API key, base URL and network timeout (in seconds) of the LLM endpoint.
/// </param>
/// <param name="summarizerSettings">
/// Summarization settings; supplies the word limit used for chunking decisions.
/// </param>
/// <exception cref="ArgumentNullException">Either argument is <c>null</c>.</exception>
public SummarizerService(LlmSettings llmSettings, SummarizerSettings summarizerSettings)
{
    // Fail fast with a descriptive exception rather than a NullReferenceException
    // on the first property access (or, for summarizerSettings, on the first request).
    ArgumentNullException.ThrowIfNull(llmSettings);
    ArgumentNullException.ThrowIfNull(summarizerSettings);

    _llmSettings = llmSettings;
    _summarizerSettings = summarizerSettings;

    // Initialize the client with the specified model and endpoint.
    // Overriding the endpoint lets the OpenAI SDK talk to any OpenAI-compatible API (like Ollama).
    _chatClient = new ChatClient(
        model: llmSettings.Model,
        credential: new System.ClientModel.ApiKeyCredential(llmSettings.ApiKey),
        options: new OpenAIClientOptions
        {
            Endpoint = new Uri(llmSettings.BaseUrl),
            NetworkTimeout = TimeSpan.FromSeconds(llmSettings.TimeoutSeconds)
        });
}
// ─────────────────────────────────────────────────────────────────────────
// Public API
// ─────────────────────────────────────────────────────────────────────────
/// <summary>
/// Produces a <see cref="VideoSummary"/> from the video's metadata and transcript.
/// Automatically routes to the single-pass or chunked strategy based on word count.
/// </summary>
/// <param name="metadata">Video metadata; included in the prompts and attached to the result.</param>
/// <param name="transcript">Transcript text together with its word count and source.</param>
/// <param name="mode">
/// Selects the prompt set: <see cref="SummaryMode.PersonalFilter"/> uses the personal-filter
/// prompts, any other value uses the standard prompts.
/// </param>
/// <param name="ct">Cancellation token forwarded to every API call.</param>
/// <returns>
/// The summary text plus the metadata, transcript source, optional quality warning
/// and the name of the model that was used.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="metadata"/> or <paramref name="transcript"/> is <c>null</c>.
/// </exception>
public async Task<VideoSummary> SummarizeAsync(
    VideoMetadata metadata,
    VideoTranscript transcript,
    SummaryMode mode = SummaryMode.Standard,
    CancellationToken ct = default)
{
    ArgumentNullException.ThrowIfNull(metadata);
    ArgumentNullException.ThrowIfNull(transcript);

    // Select prompt set based on mode
    bool usePersonalFilter = mode == SummaryMode.PersonalFilter;
    string chunkPrompt = usePersonalFilter ? PersonalFilterSystemPrompt : ChunkSystemPrompt;
    string combinePrompt = usePersonalFilter ? PersonalFilterCombinePrompt : CombineSystemPrompt;

    string summaryText;
    if (transcript.WordCount <= _summarizerSettings.ChunkWordLimit)
    {
        // Short video — single API call is sufficient
        summaryText = await SinglePassSummarizeAsync(transcript.Text, metadata, chunkPrompt, ct);
    }
    else
    {
        // Long video — chunk-and-combine strategy
        summaryText = await ChunkedSummarizeAsync(transcript.Text, metadata, chunkPrompt, combinePrompt, ct);
    }

    return new VideoSummary
    {
        Metadata = metadata,
        SummaryText = summaryText,
        TranscriptSource = transcript.Source,
        // Attach a quality warning when the transcript quality is uncertain
        QualityWarning = BuildQualityWarning(transcript.Source),
        ModelUsed = _llmSettings.Model
    };
}
// ─────────────────────────────────────────────────────────────────────────
// Summarization strategies
// ─────────────────────────────────────────────────────────────────────────
/// <summary>
/// Single-pass: sends the entire transcript in one API call.
/// Best for videos under ~30 minutes (roughly 3000–4000 words).
/// </summary>
/// <param name="transcriptText">Full transcript text.</param>
/// <param name="metadata">Video metadata placed ahead of the transcript in the user message.</param>
/// <param name="systemPrompt">System prompt for the selected summary mode.</param>
/// <param name="ct">Cancellation token forwarded to the API call.</param>
/// <returns>The assistant's reply text.</returns>
private async Task<string> SinglePassSummarizeAsync(
    string transcriptText,
    VideoMetadata metadata,
    string systemPrompt,
    CancellationToken ct)
{
    string userMessage = BuildUserPrompt(metadata, transcriptText);
    return await CallChatCompletionAsync(systemPrompt, userMessage, ct);
}
/// <summary>
/// Map-reduce: splits a long transcript into chunks, summarizes each chunk, then
/// combines the chunk summaries with one final call.
/// </summary>
/// <remarks>
/// Overlap: every chunk after the first begins with the last ~200 words of the
/// previous chunk, so the model retains context across chunk boundaries and
/// avoids abrupt topic changes in the summaries.
/// </remarks>
/// <param name="transcriptText">Full transcript text.</param>
/// <param name="metadata">Video metadata; the title and channel are quoted in the prompts.</param>
/// <param name="chunkSystemPrompt">System prompt for the per-chunk (map) calls.</param>
/// <param name="combineSystemPrompt">System prompt for the final (reduce) call.</param>
/// <param name="ct">Cancellation token forwarded to every API call.</param>
/// <returns>The combined summary text produced by the reduce call.</returns>
private async Task<string> ChunkedSummarizeAsync(
    string transcriptText,
    VideoMetadata metadata,
    string chunkSystemPrompt,
    string combineSystemPrompt,
    CancellationToken ct)
{
    // Number of words repeated at the start of each chunk from the end of the previous one.
    const int OverlapWords = 200;

    // A null separator splits on every whitespace character. Splitting on ' ' alone would
    // treat newline- or tab-separated caption lines as single oversized "words" and let
    // chunks grow far beyond the configured word limit.
    string[] words = transcriptText.Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries);
    var chunks = SplitIntoChunks(words, _summarizerSettings.ChunkWordLimit, overlapWords: OverlapWords);
    Console.WriteLine($"\n [Chunking] Transcript split into {chunks.Count} chunks for processing...");

    // Map phase: summarize each chunk in sequence
    // (Parallel would be faster but could hit rate limits — sequential is safer)
    var chunkSummaries = new List<string>(chunks.Count);
    for (int i = 0; i < chunks.Count; i++)
    {
        Console.Write($" [Chunk {i + 1}/{chunks.Count}] Summarizing");
        var chunkText = string.Join(" ", chunks[i]);
        var prompt = $"This is segment {i + 1} of {chunks.Count} from the video \"{metadata.Title}\":\n\n{chunkText}";
        chunkSummaries.Add(await CallChatCompletionAsync(chunkSystemPrompt, prompt, ct));
    }

    // Reduce phase: combine all chunk summaries into one coherent summary
    Console.Write(" [Combine] Merging chunk summaries into final summary");
    var combinedInput = string.Join("\n\n---\n\n",
        chunkSummaries.Select((s, i) => $"Segment {i + 1} summary:\n{s}"));
    var combinePrompt = $"Video: \"{metadata.Title}\" by {metadata.ChannelTitle}\n\n" +
                        $"The following are summaries of {chunks.Count} consecutive segments:\n\n{combinedInput}";
    return await CallChatCompletionAsync(combineSystemPrompt, combinePrompt, ct);
}
// ─────────────────────────────────────────────────────────────────────────
// Helpers
// ─────────────────────────────────────────────────────────────────────────
/// <summary>
/// Sends a system + user message pair to the Chat Completions endpoint using the
/// streaming API and returns the assistant's reply text.
/// </summary>
/// <remarks>
/// Writes progress to the console while streaming: " (working)" on the first
/// received text, a dot per 20 characters of output, and a final status line
/// with the elapsed time.
/// </remarks>
/// <param name="systemPrompt">Content of the system message.</param>
/// <param name="userMessage">Content of the user message.</param>
/// <param name="ct">Cancellation token passed to the streaming call.</param>
/// <returns>The concatenated text of all streamed content parts.</returns>
private async Task<string> CallChatCompletionAsync(
    string systemPrompt,
    string userMessage,
    CancellationToken ct)
{
    // One progress dot is printed per this many characters of streamed output.
    const int ProgressDotInterval = 20;

    var messages = new List<ChatMessage>
    {
        new SystemChatMessage(systemPrompt),
        new UserChatMessage(userMessage)
    };
    var options = new ChatCompletionOptions
    {
        MaxOutputTokenCount = _llmSettings.MaxTokens
    };

    var stopwatch = System.Diagnostics.Stopwatch.StartNew();
    var reply = new System.Text.StringBuilder();
    int nextDotAt = ProgressDotInterval;
    bool completed = false;
    try
    {
        await foreach (var update in _chatClient.CompleteChatStreamingAsync(messages, options, ct))
        {
            foreach (var part in update.ContentUpdate)
            {
                if (string.IsNullOrEmpty(part.Text))
                {
                    continue;
                }

                if (reply.Length == 0)
                {
                    // First token received!
                    Console.Write(" (working)");
                }

                reply.Append(part.Text);

                // Track a threshold instead of testing `Length % interval == 0`: streamed
                // parts have arbitrary lengths, so the running total rarely lands exactly
                // on a multiple and the modulo test would skip most dots.
                while (reply.Length >= nextDotAt)
                {
                    Console.Write(".");
                    nextDotAt += ProgressDotInterval;
                }
            }
        }

        completed = true;
    }
    finally
    {
        stopwatch.Stop();
        // Always terminate the progress line, but only claim success when the stream
        // actually finished; exceptions and cancellation are reported as failures.
        string outcome = completed ? "Done!" : "Failed!";
        Console.WriteLine($" {outcome} ({stopwatch.Elapsed.TotalSeconds:F1}s)");
    }

    return reply.ToString();
}
/// <summary>
/// Composes the user-turn message for a single-pass summarization: title, channel,
/// publish date and duration, followed by the full transcript.
/// Including the title and channel anchors the model to the subject matter,
/// which reduces hallucination on ambiguous ASR transcripts.
/// </summary>
/// <param name="metadata">Video metadata rendered in the header lines.</param>
/// <param name="transcriptText">Transcript text appended after the header.</param>
/// <returns>The formatted user message.</returns>
private static string BuildUserPrompt(VideoMetadata metadata, string transcriptText) =>
    $"""
    Video title: {metadata.Title}
    Channel: {metadata.ChannelTitle}
    Published: {metadata.PublishedAt:MMMM d, yyyy}
    Duration: {metadata.FormattedDuration}
    Full transcript:
    {transcriptText}
    """;
/// <summary>
/// Splits a word array into overlapping chunks of at most <paramref name="chunkSize"/> words.
/// Every chunk after the first starts with the last <paramref name="overlapWords"/> words
/// of the previous chunk, which prevents the model from missing context at chunk boundaries.
/// </summary>
/// <param name="words">Transcript words in order.</param>
/// <param name="chunkSize">Maximum number of words per chunk; must be positive.</param>
/// <param name="overlapWords">
/// Number of words shared by consecutive chunks. Values outside
/// <c>0 .. chunkSize - 1</c> are clamped into that range so that each chunk
/// advances by at least one word.
/// </param>
/// <returns>
/// The chunks in transcript order; an empty list when <paramref name="words"/> is empty.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="words"/> is <c>null</c>.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="chunkSize"/> is zero or negative.</exception>
private static List<string[]> SplitIntoChunks(string[] words, int chunkSize, int overlapWords)
{
    ArgumentNullException.ThrowIfNull(words);
    if (chunkSize <= 0)
    {
        throw new ArgumentOutOfRangeException(nameof(chunkSize), chunkSize, "Chunk size must be positive.");
    }

    // An overlap equal to the chunk size would keep the window in place forever, and a
    // larger one would move it backwards to a negative index. Clamping guarantees a
    // stride of at least one word.
    int overlap = Math.Clamp(overlapWords, 0, chunkSize - 1);
    int stride = chunkSize - overlap;

    var chunks = new List<string[]>();
    for (int start = 0; start < words.Length; start += stride)
    {
        int end = start + Math.Min(chunkSize, words.Length - start);
        chunks.Add(words[start..end]);

        // The chunk that reaches the last word is the final one; continuing would only
        // emit extra chunks made entirely of already-covered overlap words.
        if (end == words.Length)
        {
            break;
        }
    }

    return chunks;
}
/// <summary>
/// Maps a transcript source to a human-readable caveat about how reliable the
/// resulting summary is likely to be.
/// </summary>
/// <param name="source">Where the transcript text came from.</param>
/// <returns>
/// A warning for auto-generated, metadata-only and community-contributed sources;
/// <c>null</c> for any other source (no warning needed).
/// </returns>
private static string? BuildQualityWarning(TranscriptSource source)
{
    switch (source)
    {
        case TranscriptSource.AutoGenerated:
            return "⚠ This summary is based on YouTube's auto-generated captions (ASR). " +
                   "The transcript may contain errors, especially for technical terms, names, or accented speech.";

        case TranscriptSource.MetadataOnly:
            return "⚠ No captions were available. This summary is based on the video's title " +
                   "and description only — it may be incomplete or inaccurate.";

        case TranscriptSource.CommunityContributed:
            return "ℹ This summary is based on community-contributed captions. " +
                   "Quality is generally good but not guaranteed.";

        default:
            // OwnerPublished — no warning needed
            return null;
    }
}
}