using OpenAI;
using OpenAI.Chat;
using YoutubeSummarizer.Configuration;
using YoutubeSummarizer.Models;

namespace YoutubeSummarizer.Services;

/// <summary>
/// Sends transcript text to an OpenAI-compatible Chat Completions API and returns a
/// structured summary.
///
/// Long transcripts (word count above <c>ChunkWordLimit</c>) are handled with a
/// "map-reduce" strategy:
///   1. Split the transcript into overlapping chunks.
///   2. Summarize each chunk independently (map phase).
///   3. Combine chunk summaries into a final cohesive summary (reduce phase).
///
/// The intent is to keep each individual API call within model context limits
/// while still summarizing long-form content.
/// </summary>
public sealed class SummarizerService
{
    // Number of words repeated at the start of each chunk (taken from the end of
    // the previous chunk) so the model keeps context across chunk boundaries.
    private const int ChunkOverlapWords = 200;

    // A progress dot is written roughly once per this many characters of streamed output.
    private const int ProgressDotInterval = 20;

    private readonly LlmSettings _llmSettings;
    private readonly SummarizerSettings _summarizerSettings;
    private readonly ChatClient _chatClient;

    // System prompt used for single-pass and chunk summarization.
    // Keeping it focused on facts and structure produces better summaries
    // than open-ended "summarize this" prompts.
    private const string ChunkSystemPrompt = """
        You are a precise, factual assistant that summarizes YouTube video transcripts.
        When given a transcript segment, produce a concise summary that:
        - Captures the key points, arguments, and conclusions
        - Preserves any specific facts, names, dates, or statistics mentioned
        - Uses bullet points for individual points, then a short paragraph for the overall gist
        - Omits filler words, repeated phrases, and off-topic tangents
        - Does NOT add information not present in the transcript
        Respond with only the summary text, no preamble.
        """;

    // Personal Information Filter — concise relevance-based summary.
    private const string PersonalFilterSystemPrompt = """
        You are a concise, factual assistant that applies a Personal Information Filter to YouTube video transcripts.
        When given a transcript, respond with EXACTLY three sections and nothing else:
        Summary – A concise, plain-English summary in 1–2 sentences.
        Why it matters – Directly evaluate relevance only against these priorities: time, finances, health, family, service to others. If none apply, say so clearly.
        Priority tag – End with a single word verdict: ACT, MONITOR, or IGNORE.
        Constraints:
        - Do not timestamp or number entries.
        - Do not infer user interest beyond what is explicitly provided.
        - Do not expand or add context unless the user requests it.
        - The burden of interest is on the user.
        - Respond with only the three sections above, no preamble.
        """;

    // Personal Filter combine prompt for long transcripts.
    private const string PersonalFilterCombinePrompt = """
        You are a concise, factual assistant. You will receive several partial summaries of consecutive segments of a YouTube video, each formatted with Summary, Why it matters, and Priority tag sections.
        Combine them into a single response using the same three-section format:
        Summary – A concise, plain-English summary of the entire video in 1–2 sentences.
        Why it matters – Directly evaluate relevance only against these priorities: time, finances, health, family, service to others. If none apply, say so clearly.
        Priority tag – A single word verdict: ACT, MONITOR, or IGNORE.
        Respond with only these three sections, no preamble.
        """;

    // Used in the reduce phase to combine chunk summaries coherently.
    private const string CombineSystemPrompt = """
        You are a precise, factual assistant. You will receive several partial summaries of consecutive segments of a YouTube video.
        Your task is to combine them into a single, coherent, well-structured summary that:
        - Flows as a unified narrative, not as a list of sub-summaries
        - Preserves all key facts, names, dates, and statistics
        - Uses bullet points for supporting details beneath each main topic
        - Omits redundant information that appears across multiple segments
        - Concludes with a 2–3 sentence takeaway paragraph
        Respond with only the combined summary, no preamble.
        """;

    /// <summary>
    /// Creates the service and its chat client from the supplied settings.
    /// </summary>
    /// <param name="llmSettings">Model name, API key, base URL, timeout and output-token limit.</param>
    /// <param name="summarizerSettings">Summarization settings; <c>ChunkWordLimit</c> decides when chunking is used.</param>
    /// <exception cref="ArgumentNullException">Either argument is <c>null</c>.</exception>
    public SummarizerService(LlmSettings llmSettings, SummarizerSettings summarizerSettings)
    {
        // Fail with a clear argument error instead of a NullReferenceException further down.
        ArgumentNullException.ThrowIfNull(llmSettings);
        ArgumentNullException.ThrowIfNull(summarizerSettings);

        _llmSettings = llmSettings;
        _summarizerSettings = summarizerSettings;

        // Initialize the client with the specified model and endpoint.
        // We use the OpenAI SDK's ability to point to any OpenAI-compatible API (like Ollama).
        _chatClient = new ChatClient(
            model: llmSettings.Model,
            credential: new System.ClientModel.ApiKeyCredential(llmSettings.ApiKey),
            options: new OpenAIClientOptions
            {
                Endpoint = new Uri(llmSettings.BaseUrl),
                NetworkTimeout = TimeSpan.FromSeconds(llmSettings.TimeoutSeconds)
            });
    }

    // ─────────────────────────────────────────────────────────────────────────
    // Public API
    // ─────────────────────────────────────────────────────────────────────────

    /// <summary>
    /// Produces a <see cref="VideoSummary"/> from the video's metadata and transcript.
    /// Automatically routes to single-pass or chunked strategy based on word count.
    /// </summary>
    /// <param name="metadata">Video metadata; title, channel, publish date and duration are embedded in prompts.</param>
    /// <param name="transcript">Transcript text together with its word count and source.</param>
    /// <param name="mode">
    /// <see cref="SummaryMode.PersonalFilter"/> selects the personal-filter prompts;
    /// any other value selects the standard prompts.
    /// </param>
    /// <param name="ct">Token passed through to every chat completion call.</param>
    /// <returns>The summary text plus the transcript source, an optional quality warning and the model name.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="metadata"/> or <paramref name="transcript"/> is <c>null</c>.</exception>
    public async Task<VideoSummary> SummarizeAsync(
        VideoMetadata metadata,
        VideoTranscript transcript,
        SummaryMode mode = SummaryMode.Standard,
        CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(metadata);
        ArgumentNullException.ThrowIfNull(transcript);

        // Select prompt set based on mode
        bool usePersonalFilter = mode == SummaryMode.PersonalFilter;
        string chunkPrompt = usePersonalFilter ? PersonalFilterSystemPrompt : ChunkSystemPrompt;
        string combinePrompt = usePersonalFilter ? PersonalFilterCombinePrompt : CombineSystemPrompt;

        string summaryText;
        if (transcript.WordCount <= _summarizerSettings.ChunkWordLimit)
        {
            // Short video — single API call is sufficient
            summaryText = await SinglePassSummarizeAsync(transcript.Text, metadata, chunkPrompt, ct);
        }
        else
        {
            // Long video — chunk-and-combine strategy
            summaryText = await ChunkedSummarizeAsync(transcript.Text, metadata, chunkPrompt, combinePrompt, ct);
        }

        return new VideoSummary
        {
            Metadata = metadata,
            SummaryText = summaryText,
            TranscriptSource = transcript.Source,
            // Attach a quality warning when the transcript quality is uncertain
            QualityWarning = BuildQualityWarning(transcript.Source),
            ModelUsed = _llmSettings.Model
        };
    }

    // ─────────────────────────────────────────────────────────────────────────
    // Summarization strategies
    // ─────────────────────────────────────────────────────────────────────────

    /// <summary>
    /// Single-pass: sends the entire transcript in one API call.
    /// Used when the transcript's word count does not exceed <c>ChunkWordLimit</c>.
    /// </summary>
    private async Task<string> SinglePassSummarizeAsync(
        string transcriptText,
        VideoMetadata metadata,
        string systemPrompt,
        CancellationToken ct)
    {
        var userMessage = BuildUserPrompt(metadata, transcriptText);
        return await CallChatCompletionAsync(systemPrompt, userMessage, ct);
    }

    /// <summary>
    /// Map-reduce: splits long transcripts, summarizes each chunk, then combines.
    ///
    /// Overlap: every chunk after the first begins with the last
    /// <see cref="ChunkOverlapWords"/> words of the previous chunk, so the model
    /// retains context across chunk boundaries.
    /// </summary>
    private async Task<string> ChunkedSummarizeAsync(
        string transcriptText,
        VideoMetadata metadata,
        string chunkSystemPrompt,
        string combineSystemPrompt,
        CancellationToken ct)
    {
        var words = transcriptText.Split(' ', StringSplitOptions.RemoveEmptyEntries);
        var chunks = SplitIntoChunks(words, _summarizerSettings.ChunkWordLimit, overlapWords: ChunkOverlapWords);

        Console.WriteLine($"\n [Chunking] Transcript split into {chunks.Count} chunks for processing...");

        // Map phase: summarize each chunk in sequence
        // (Parallel would be faster but could hit rate limits — sequential is safer)
        var chunkSummaries = new List<string>(chunks.Count);
        for (int i = 0; i < chunks.Count; i++)
        {
            Console.Write($" [Chunk {i + 1}/{chunks.Count}] Summarizing");

            var chunkText = string.Join(" ", chunks[i]);
            var prompt = $"This is segment {i + 1} of {chunks.Count} from the video \"{metadata.Title}\":\n\n{chunkText}";

            var summary = await CallChatCompletionAsync(chunkSystemPrompt, prompt, ct);
            chunkSummaries.Add(summary);
        }

        // Reduce phase: combine all chunk summaries into one coherent summary
        Console.Write(" [Combine] Merging chunk summaries into final summary");

        var combinedInput = string.Join(
            "\n\n---\n\n",
            chunkSummaries.Select((s, i) => $"Segment {i + 1} summary:\n{s}"));

        var combinePrompt =
            $"Video: \"{metadata.Title}\" by {metadata.ChannelTitle}\n\n" +
            $"The following are summaries of {chunks.Count} consecutive segments:\n\n{combinedInput}";

        return await CallChatCompletionAsync(combineSystemPrompt, combinePrompt, ct);
    }

    // ─────────────────────────────────────────────────────────────────────────
    // Helpers
    // ─────────────────────────────────────────────────────────────────────────

    /// <summary>
    /// Sends a system + user message pair to the Chat Completions endpoint
    /// and returns the assistant's reply text, streaming it so progress can be
    /// shown on the console while the reply is generated.
    /// </summary>
    /// <returns>The concatenated text of all streamed content parts (empty if none arrived).</returns>
    private async Task<string> CallChatCompletionAsync(
        string systemPrompt,
        string userMessage,
        CancellationToken ct)
    {
        var messages = new List<ChatMessage>
        {
            new SystemChatMessage(systemPrompt),
            new UserChatMessage(userMessage)
        };

        var options = new ChatCompletionOptions
        {
            MaxOutputTokenCount = _llmSettings.MaxTokens
        };

        var sw = System.Diagnostics.Stopwatch.StartNew();
        var fullContent = new System.Text.StringBuilder();

        // Output length at which the next progress dot is due. A threshold is used
        // because streamed parts vary in size, so the total length rarely lands on
        // an exact multiple of the interval.
        int nextDotAt = ProgressDotInterval;

        try
        {
            var streamingUpdates = _chatClient.CompleteChatStreamingAsync(messages, options, ct);

            await foreach (var update in streamingUpdates)
            {
                foreach (var part in update.ContentUpdate)
                {
                    if (string.IsNullOrEmpty(part.Text))
                    {
                        continue;
                    }

                    if (fullContent.Length == 0)
                    {
                        // First token received!
                        Console.Write(" (working)");
                    }

                    fullContent.Append(part.Text);

                    // Show the call is alive: at most one dot per streamed part,
                    // roughly every ProgressDotInterval characters of output.
                    if (fullContent.Length >= nextDotAt)
                    {
                        Console.Write(".");
                        nextDotAt = fullContent.Length + ProgressDotInterval;
                    }
                }
            }
        }
        finally
        {
            // Runs on failure or cancellation too, so the progress line is always terminated.
            sw.Stop();
            Console.WriteLine($" Done! ({sw.Elapsed.TotalSeconds:F1}s)");
        }

        return fullContent.ToString();
    }

    /// <summary>
    /// Builds the user-turn prompt for a single-pass summarization.
    /// Including the title and channel is meant to anchor the model to the
    /// subject matter when the transcript itself is ambiguous.
    /// </summary>
    private static string BuildUserPrompt(VideoMetadata metadata, string transcriptText)
    {
        return $"""
            Video title: {metadata.Title}
            Channel: {metadata.ChannelTitle}
            Published: {metadata.PublishedAt:MMMM d, yyyy}
            Duration: {metadata.FormattedDuration}
            Full transcript:
            {transcriptText}
            """;
    }

    /// <summary>
    /// Splits a word array into overlapping chunks of at most
    /// <paramref name="chunkSize"/> words. Every chunk after the first starts
    /// <paramref name="overlapWords"/> words before the end of the previous one.
    /// </summary>
    /// <param name="words">Words to split; an empty array yields an empty list.</param>
    /// <param name="chunkSize">Maximum words per chunk; must be positive.</param>
    /// <param name="overlapWords">
    /// Words shared between consecutive chunks. Negative values are treated as 0.
    /// A value that is not smaller than <paramref name="chunkSize"/> would prevent
    /// the split from advancing, so it is replaced by half the chunk size.
    /// </param>
    /// <returns>The chunks in order; the last chunk ends at the final word.</returns>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="chunkSize"/> is zero or negative.</exception>
    private static List<string[]> SplitIntoChunks(string[] words, int chunkSize, int overlapWords)
    {
        if (chunkSize <= 0)
        {
            throw new ArgumentOutOfRangeException(
                nameof(chunkSize), chunkSize, "Chunk size must be a positive number of words.");
        }

        int overlap = Math.Max(0, overlapWords);
        if (overlap >= chunkSize)
        {
            // With overlap >= chunkSize the start index would stall or move backwards
            // (endless loop or negative index). Fall back to a half-chunk overlap.
            overlap = chunkSize / 2;
        }

        // Distance between consecutive chunk starts; always at least 1.
        int stride = chunkSize - overlap;

        var chunks = new List<string[]>();
        for (int start = 0; start < words.Length; start += stride)
        {
            // Computed from the remaining length so start + chunkSize cannot overflow.
            int length = Math.Min(chunkSize, words.Length - start);
            int end = start + length;
            chunks.Add(words[start..end]);

            // Once a chunk reaches the last word, a further chunk would only repeat overlap.
            if (end == words.Length)
            {
                break;
            }
        }

        return chunks;
    }

    /// <summary>
    /// Returns a human-readable warning when transcript quality may affect summary accuracy.
    /// Returns null for sources that have no warning defined here.
    /// </summary>
    private static string? BuildQualityWarning(TranscriptSource source) => source switch
    {
        TranscriptSource.AutoGenerated =>
            "⚠ This summary is based on YouTube's auto-generated captions (ASR). " +
            "The transcript may contain errors, especially for technical terms, names, or accented speech.",

        TranscriptSource.MetadataOnly =>
            "⚠ No captions were available. This summary is based on the video's title " +
            "and description only — it may be incomplete or inaccurate.",

        TranscriptSource.CommunityContributed =>
            "ℹ This summary is based on community-contributed captions. " +
            "Quality is generally good but not guaranteed.",

        _ => null // OwnerPublished — no warning needed
    };
}