// summarizer/SummarizerService.cs
//
// 343 lines
// 15 KiB
// C#
// Raw Blame History
//
// This file contains ambiguous Unicode characters
//
// This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

using OpenAI;
using OpenAI.Chat;
using YoutubeSummarizer.Configuration;
using YoutubeSummarizer.Models;
namespace YoutubeSummarizer.Services;
/// <summary>
/// Sends transcript text to an OpenAI-compatible Chat Completions API and returns a
/// structured summary.
///
/// Transcripts whose word count exceeds <c>ChunkWordLimit</c> are handled with a
/// "map-reduce" strategy:
/// 1. Split the transcript into overlapping chunks.
/// 2. Summarize each chunk independently (map phase).
/// 3. Combine chunk summaries into a final cohesive summary (reduce phase).
///
/// This keeps individual API calls within model context limits while still
/// producing an accurate summary of long-form content like hour-long lectures.
/// </summary>
public sealed class SummarizerService
{
    /// <summary>
    /// Number of words from the end of one chunk that are repeated at the start of the next.
    /// </summary>
    private const int ChunkOverlapWords = 200;

    /// <summary>
    /// A progress dot is written to the console each time this many more characters
    /// of the streamed reply have been received.
    /// </summary>
    private const int ProgressDotInterval = 20;

    // System prompt used for single-pass and chunk summarization.
    // Keeping it focused on facts and structure produces better summaries
    // than open-ended "summarize this" prompts.
    private const string ChunkSystemPrompt = """
        You are a precise, factual assistant that summarizes YouTube video transcripts.
        When given a transcript segment, produce a concise summary that:
        - Captures the key points, arguments, and conclusions
        - Preserves any specific facts, names, dates, or statistics mentioned
        - Uses bullet points for individual points, then a short paragraph for the overall gist
        - Omits filler words, repeated phrases, and off-topic tangents
        - Does NOT add information not present in the transcript
        Respond with only the summary text, no preamble.
        """;

    // Personal Information Filter: concise relevance-based summary.
    // Ranges and label separators use plain ASCII so they cannot be lost or
    // confused with look-alike Unicode characters.
    private const string PersonalFilterSystemPrompt = """
        You are a concise, factual assistant that applies a Personal Information Filter
        to YouTube video transcripts. When given a transcript, respond with EXACTLY
        three sections and nothing else:
        Summary - A concise, plain-English summary in 1-2 sentences.
        Why it matters - Directly evaluate relevance only against these priorities:
        time, finances, health, family, service to others.
        If none apply, say so clearly.
        Priority tag - End with a single word verdict: ACT, MONITOR, or IGNORE.
        Constraints:
        - Do not timestamp or number entries.
        - Do not infer user interest beyond what is explicitly provided.
        - Do not expand or add context unless the user requests it.
        - The burden of interest is on the user.
        - Respond with only the three sections above, no preamble.
        """;

    // Personal Filter combine prompt for long transcripts.
    private const string PersonalFilterCombinePrompt = """
        You are a concise, factual assistant. You will receive several partial summaries
        of consecutive segments of a YouTube video, each formatted with Summary,
        Why it matters, and Priority tag sections. Combine them into a single response
        using the same three-section format:
        Summary - A concise, plain-English summary of the entire video in 1-2 sentences.
        Why it matters - Directly evaluate relevance only against these priorities:
        time, finances, health, family, service to others.
        If none apply, say so clearly.
        Priority tag - A single word verdict: ACT, MONITOR, or IGNORE.
        Respond with only these three sections, no preamble.
        """;

    // Used in the reduce phase to combine chunk summaries coherently.
    private const string CombineSystemPrompt = """
        You are a precise, factual assistant. You will receive several partial summaries
        of consecutive segments of a YouTube video. Your task is to combine them into
        a single, coherent, well-structured summary that:
        - Flows as a unified narrative, not as a list of sub-summaries
        - Preserves all key facts, names, dates, and statistics
        - Uses bullet points for supporting details beneath each main topic
        - Omits redundant information that appears across multiple segments
        - Concludes with a 2-3 sentence takeaway paragraph
        Respond with only the combined summary, no preamble.
        """;

    private readonly LlmSettings _llmSettings;
    private readonly SummarizerSettings _summarizerSettings;
    private readonly ChatClient _chatClient;

    /// <summary>
    /// Creates the service and the underlying chat client.
    /// </summary>
    /// <param name="llmSettings">Model name, API key, base URL, timeout and output-token limit.</param>
    /// <param name="summarizerSettings">Summarization settings; <c>ChunkWordLimit</c> is read from it.</param>
    /// <exception cref="ArgumentNullException">Either argument is <c>null</c>.</exception>
    public SummarizerService(LlmSettings llmSettings, SummarizerSettings summarizerSettings)
    {
        ArgumentNullException.ThrowIfNull(llmSettings);
        ArgumentNullException.ThrowIfNull(summarizerSettings);

        _llmSettings = llmSettings;
        _summarizerSettings = summarizerSettings;

        // Initialize the client with the specified model and endpoint.
        // We use the OpenAI SDK's ability to point to any OpenAI-compatible API (like Ollama).
        _chatClient = new ChatClient(
            model: llmSettings.Model,
            credential: new System.ClientModel.ApiKeyCredential(llmSettings.ApiKey),
            options: new OpenAIClientOptions
            {
                Endpoint = new Uri(llmSettings.BaseUrl),
                NetworkTimeout = TimeSpan.FromSeconds(llmSettings.TimeoutSeconds)
            });
    }

    // ─────────────────────────────────────────────────────────────────────────
    // Public API
    // ─────────────────────────────────────────────────────────────────────────

    /// <summary>
    /// Produces a <see cref="VideoSummary"/> from the video's metadata and transcript.
    /// Automatically routes to single-pass or chunked strategy based on word count.
    /// </summary>
    /// <param name="metadata">Video metadata included in the prompts and returned on the result.</param>
    /// <param name="transcript">Transcript text, its word count and its source.</param>
    /// <param name="mode">Selects which prompt set (standard or personal filter) is used.</param>
    /// <param name="ct">Token forwarded to every chat-completion call.</param>
    /// <returns>The summary text plus metadata, transcript source, optional quality warning and model name.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="metadata"/> or <paramref name="transcript"/> is <c>null</c>.</exception>
    public async Task<VideoSummary> SummarizeAsync(
        VideoMetadata metadata,
        VideoTranscript transcript,
        SummaryMode mode = SummaryMode.Standard,
        CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(metadata);
        ArgumentNullException.ThrowIfNull(transcript);

        // Select prompt set based on mode
        bool usePersonalFilter = mode == SummaryMode.PersonalFilter;
        string chunkPrompt = usePersonalFilter ? PersonalFilterSystemPrompt : ChunkSystemPrompt;
        string combinePrompt = usePersonalFilter ? PersonalFilterCombinePrompt : CombineSystemPrompt;

        // Short video: a single API call is sufficient. Long video: chunk-and-combine.
        string summaryText = transcript.WordCount <= _summarizerSettings.ChunkWordLimit
            ? await SinglePassSummarizeAsync(transcript.Text, metadata, chunkPrompt, ct)
            : await ChunkedSummarizeAsync(transcript.Text, metadata, chunkPrompt, combinePrompt, ct);

        return new VideoSummary
        {
            Metadata = metadata,
            SummaryText = summaryText,
            TranscriptSource = transcript.Source,
            // Attach a quality warning when the transcript quality is uncertain
            QualityWarning = BuildQualityWarning(transcript.Source),
            ModelUsed = _llmSettings.Model
        };
    }

    // ─────────────────────────────────────────────────────────────────────────
    // Summarization strategies
    // ─────────────────────────────────────────────────────────────────────────

    /// <summary>
    /// Single-pass: sends the entire transcript in one API call.
    /// Best for videos under ~30 minutes (roughly 3000-4000 words).
    /// </summary>
    private async Task<string> SinglePassSummarizeAsync(
        string transcriptText,
        VideoMetadata metadata,
        string systemPrompt,
        CancellationToken ct)
    {
        var userMessage = BuildUserPrompt(metadata, transcriptText);
        return await CallChatCompletionAsync(systemPrompt, userMessage, ct);
    }

    /// <summary>
    /// Map-reduce: splits long transcripts, summarizes each chunk, then combines.
    ///
    /// Overlap: every chunk after the first begins with the last
    /// <see cref="ChunkOverlapWords"/> words of the previous chunk so the model retains
    /// context across chunk boundaries and avoids abrupt topic changes in the summaries.
    ///
    /// If the transcript turns out to fit in a single chunk, the single-pass strategy
    /// is used instead, since there is nothing to combine.
    /// </summary>
    private async Task<string> ChunkedSummarizeAsync(
        string transcriptText,
        VideoMetadata metadata,
        string chunkSystemPrompt,
        string combineSystemPrompt,
        CancellationToken ct)
    {
        // A null separator splits on any whitespace, so words separated by newlines or
        // tabs (common between caption lines) are counted individually.
        var words = transcriptText.Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries);
        var chunks = SplitIntoChunks(words, _summarizerSettings.ChunkWordLimit, ChunkOverlapWords);

        if (chunks.Count <= 1)
        {
            // Nothing to map/reduce: avoid a pointless second "combine" API call.
            return await SinglePassSummarizeAsync(transcriptText, metadata, chunkSystemPrompt, ct);
        }

        Console.WriteLine($"\n [Chunking] Transcript split into {chunks.Count} chunks for processing...");

        // Map phase: summarize each chunk in sequence
        // (Parallel would be faster but could hit rate limits — sequential is safer)
        var chunkSummaries = new List<string>(chunks.Count);
        for (int i = 0; i < chunks.Count; i++)
        {
            // Stop before starting another request (and before writing a dangling progress line).
            ct.ThrowIfCancellationRequested();

            Console.Write($" [Chunk {i + 1}/{chunks.Count}] Summarizing");
            var chunkText = string.Join(" ", chunks[i]);
            var prompt = $"This is segment {i + 1} of {chunks.Count} from the video \"{metadata.Title}\":\n\n{chunkText}";
            chunkSummaries.Add(await CallChatCompletionAsync(chunkSystemPrompt, prompt, ct));
        }

        // Reduce phase: combine all chunk summaries into one coherent summary
        Console.Write(" [Combine] Merging chunk summaries into final summary");
        var combinedInput = string.Join("\n\n---\n\n",
            chunkSummaries.Select((s, i) => $"Segment {i + 1} summary:\n{s}"));
        var combinePrompt = $"Video: \"{metadata.Title}\" by {metadata.ChannelTitle}\n\n" +
                            $"The following are summaries of {chunks.Count} consecutive segments:\n\n{combinedInput}";
        return await CallChatCompletionAsync(combineSystemPrompt, combinePrompt, ct);
    }

    // ─────────────────────────────────────────────────────────────────────────
    // Helpers
    // ─────────────────────────────────────────────────────────────────────────

    /// <summary>
    /// Sends a system + user message pair to the Chat Completions endpoint using the
    /// streaming API and returns the assistant's reply text. Progress is written to
    /// the console while the reply streams in, followed by a completion or failure line.
    /// </summary>
    /// <param name="systemPrompt">System message text.</param>
    /// <param name="userMessage">User message text.</param>
    /// <param name="ct">Token passed to the streaming call.</param>
    /// <returns>The concatenated text of all streamed content parts (may be empty).</returns>
    private async Task<string> CallChatCompletionAsync(
        string systemPrompt,
        string userMessage,
        CancellationToken ct)
    {
        var messages = new List<ChatMessage>
        {
            new SystemChatMessage(systemPrompt),
            new UserChatMessage(userMessage)
        };
        var options = new ChatCompletionOptions
        {
            MaxOutputTokenCount = _llmSettings.MaxTokens
        };

        var stopwatch = System.Diagnostics.Stopwatch.StartNew();
        var reply = new System.Text.StringBuilder();
        int nextDotAt = ProgressDotInterval;
        bool completed = false;
        try
        {
            await foreach (var update in _chatClient.CompleteChatStreamingAsync(messages, options, ct))
            {
                foreach (var part in update.ContentUpdate)
                {
                    if (string.IsNullOrEmpty(part.Text))
                    {
                        continue;
                    }

                    if (reply.Length == 0)
                    {
                        // First token received!
                        Console.Write(" (working)");
                    }

                    reply.Append(part.Text);

                    // Streamed parts vary in length, so compare against a running
                    // threshold instead of testing for an exact multiple of the interval.
                    while (reply.Length >= nextDotAt)
                    {
                        Console.Write(".");
                        nextDotAt += ProgressDotInterval;
                    }
                }
            }

            completed = true;
        }
        finally
        {
            stopwatch.Stop();
            // Always terminate the progress line, but only claim success when the
            // stream actually ran to completion (not on an exception or cancellation).
            Console.WriteLine(completed
                ? $" Done! ({stopwatch.Elapsed.TotalSeconds:F1}s)"
                : $" Failed after {stopwatch.Elapsed.TotalSeconds:F1}s");
        }

        return reply.ToString();
    }

    /// <summary>
    /// Builds the user-turn prompt for a single-pass summarization.
    /// Including the title and channel anchors the model to the subject matter,
    /// which reduces hallucination on ambiguous ASR transcripts.
    /// Values are formatted with the invariant culture so the prompt text does not
    /// depend on the machine's regional settings.
    /// </summary>
    private static string BuildUserPrompt(VideoMetadata metadata, string transcriptText)
    {
        return string.Create(System.Globalization.CultureInfo.InvariantCulture, $"""
            Video title: {metadata.Title}
            Channel: {metadata.ChannelTitle}
            Published: {metadata.PublishedAt:MMMM d, yyyy}
            Duration: {metadata.FormattedDuration}
            Full transcript:
            {transcriptText}
            """);
    }

    /// <summary>
    /// Splits a word array into overlapping chunks of at most <paramref name="chunkSize"/> words.
    /// The overlap prevents the model from missing context at chunk boundaries.
    /// </summary>
    /// <param name="words">Words to split; an empty array yields an empty list.</param>
    /// <param name="chunkSize">Maximum number of words per chunk; must be positive.</param>
    /// <param name="overlapWords">
    /// Number of words shared between consecutive chunks. Negative values are treated as 0.
    /// A value that is not smaller than <paramref name="chunkSize"/> cannot make progress,
    /// so it is replaced by half the chunk size.
    /// </param>
    /// <returns>The chunks in order; the last chunk always ends at the final word.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="words"/> is <c>null</c>.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="chunkSize"/> is zero or negative.</exception>
    private static List<string[]> SplitIntoChunks(string[] words, int chunkSize, int overlapWords)
    {
        ArgumentNullException.ThrowIfNull(words);
        if (chunkSize <= 0)
        {
            throw new ArgumentOutOfRangeException(nameof(chunkSize), chunkSize, "Chunk size must be positive.");
        }

        var chunks = new List<string[]>();
        if (words.Length == 0)
        {
            return chunks;
        }

        int overlap = Math.Max(0, overlapWords);
        if (overlap >= chunkSize)
        {
            // An overlap this large would move the window backwards (or not at all).
            overlap = chunkSize / 2;
        }

        // Distance between the starts of consecutive chunks; always at least 1.
        int stride = chunkSize - overlap;

        for (int start = 0; ; start += stride)
        {
            // Written as a subtraction so a very large chunkSize cannot overflow.
            bool isLastChunk = words.Length - start <= chunkSize;
            int end = isLastChunk ? words.Length : start + chunkSize;
            chunks.Add(words[start..end]);

            if (isLastChunk)
            {
                // The tail is fully covered; a further chunk would only repeat the overlap.
                break;
            }
        }

        return chunks;
    }

    /// <summary>
    /// Returns a human-readable warning when transcript quality may affect summary accuracy.
    /// Returns null for sources that have no dedicated warning.
    /// </summary>
    private static string? BuildQualityWarning(TranscriptSource source) =>
        source switch
        {
            TranscriptSource.AutoGenerated =>
                "⚠ This summary is based on YouTube's auto-generated captions (ASR). " +
                "The transcript may contain errors, especially for technical terms, names, or accented speech.",
            TranscriptSource.MetadataOnly =>
                "⚠ No captions were available. This summary is based on the video's title " +
                "and description only — it may be incomplete or inaccurate.",
            TranscriptSource.CommunityContributed =>
                // Information marker (U+2139), written as an escape so the glyph
                // cannot be dropped or flagged as an ambiguous character.
                "\u2139 This summary is based on community-contributed captions. " +
                "Quality is generally good but not guaranteed.",
            _ => null // OwnerPublished — no warning needed
        };
}