// summarizer/SummarizerService.cs

using OpenAI;
using OpenAI.Chat;
using YoutubeSummarizer.Configuration;
using YoutubeSummarizer.Models;

namespace YoutubeSummarizer.Services;

/// <summary>
/// Sends transcript text to an OpenAI-compatible Chat Completions API and
/// returns a structured summary.
///
/// Long transcripts (word count > ChunkWordLimit) are handled with a
/// "map-reduce" strategy:
///   1. Split the transcript into overlapping chunks.
///   2. Summarize each chunk independently (map phase).
///   3. Combine chunk summaries into a final cohesive summary (reduce phase).
///
/// This keeps individual API calls within model context limits while still
/// producing an accurate summary of long-form content like hour-long lectures.
///
/// Progress is reported by writing directly to <see cref="Console"/>.
/// </summary>
public sealed class SummarizerService
{
    /// <summary>
    /// Number of words repeated at the start of each chunk from the end of the
    /// previous one, so the model keeps context across chunk boundaries.
    /// </summary>
    private const int ChunkOverlapWords = 200;

    /// <summary>
    /// A progress dot is written to the console for every this-many characters
    /// of streamed model output.
    /// </summary>
    private const int ProgressDotInterval = 20;

    private readonly LlmSettings _llmSettings;
    private readonly SummarizerSettings _summarizerSettings;
    private readonly ChatClient _chatClient;

    // System prompt used for single-pass and chunk summarization.
    // Keeping it focused on facts and structure produces better summaries
    // than open-ended "summarize this" prompts.
    private const string ChunkSystemPrompt = """
        You are a precise, factual assistant that summarizes YouTube video transcripts.
        When given a transcript segment, produce a concise summary that:
        - Captures the key points, arguments, and conclusions
        - Preserves any specific facts, names, dates, or statistics mentioned
        - Uses bullet points for individual points, then a short paragraph for the overall gist
        - Omits filler words, repeated phrases, and off-topic tangents
        - Does NOT add information not present in the transcript
        Respond with only the summary text, no preamble.
        """;

    // Personal Information Filter — concise relevance-based summary.
    private const string PersonalFilterSystemPrompt = """
        You are a concise, factual assistant that applies a Personal Information Filter
        to YouTube video transcripts. When given a transcript, respond with EXACTLY
        three sections and nothing else:

        Summary – A concise, plain-English summary in 1–2 sentences.

        Why it matters – Directly evaluate relevance only against these priorities:
        time, finances, health, family, service to others.
        If none apply, say so clearly.

        Priority tag – End with a single word verdict: ACT, MONITOR, or IGNORE.

        Constraints:
        - Do not timestamp or number entries.
        - Do not infer user interest beyond what is explicitly provided.
        - Do not expand or add context unless the user requests it.
        - The burden of interest is on the user.
        - Respond with only the three sections above, no preamble.
        """;

    // Personal Filter combine prompt for long transcripts.
    private const string PersonalFilterCombinePrompt = """
        You are a concise, factual assistant. You will receive several partial summaries
        of consecutive segments of a YouTube video, each formatted with Summary,
        Why it matters, and Priority tag sections. Combine them into a single response
        using the same three-section format:

        Summary – A concise, plain-English summary of the entire video in 1–2 sentences.

        Why it matters – Directly evaluate relevance only against these priorities:
        time, finances, health, family, service to others.
        If none apply, say so clearly.

        Priority tag – A single word verdict: ACT, MONITOR, or IGNORE.

        Respond with only these three sections, no preamble.
        """;

    // Used in the reduce phase to combine chunk summaries coherently.
    private const string CombineSystemPrompt = """
        You are a precise, factual assistant. You will receive several partial summaries
        of consecutive segments of a YouTube video. Your task is to combine them into
        a single, coherent, well-structured summary that:
        - Flows as a unified narrative, not as a list of sub-summaries
        - Preserves all key facts, names, dates, and statistics
        - Uses bullet points for supporting details beneath each main topic
        - Omits redundant information that appears across multiple segments
        - Concludes with a 2–3 sentence takeaway paragraph
        Respond with only the combined summary, no preamble.
        """;

    /// <summary>
    /// Creates the service and its underlying <see cref="ChatClient"/>.
    /// </summary>
    /// <param name="llmSettings">
    /// Model name, API key, base URL, timeout and output-token limit for the LLM endpoint.
    /// </param>
    /// <param name="summarizerSettings">Summarization tuning, notably the chunk word limit.</param>
    /// <exception cref="ArgumentNullException">Either argument is <c>null</c>.</exception>
    public SummarizerService(LlmSettings llmSettings, SummarizerSettings summarizerSettings)
    {
        ArgumentNullException.ThrowIfNull(llmSettings);
        ArgumentNullException.ThrowIfNull(summarizerSettings);

        _llmSettings = llmSettings;
        _summarizerSettings = summarizerSettings;

        // Initialize the client with the specified model and endpoint.
        // We use the OpenAI SDK's ability to point to any OpenAI-compatible API (like Ollama).
        _chatClient = new ChatClient(
            model: llmSettings.Model,
            credential: new System.ClientModel.ApiKeyCredential(llmSettings.ApiKey),
            options: new OpenAIClientOptions
            {
                Endpoint = new Uri(llmSettings.BaseUrl),
                NetworkTimeout = TimeSpan.FromSeconds(llmSettings.TimeoutSeconds)
            });
    }

    // ─────────────────────────────────────────────────────────────────────────
    // Public API
    // ─────────────────────────────────────────────────────────────────────────

    /// <summary>
    /// Produces a <see cref="VideoSummary"/> from the video's metadata and transcript.
    /// Automatically routes to single-pass or chunked strategy based on word count.
    /// </summary>
    /// <param name="metadata">Video metadata; title, channel, date and duration feed the prompts.</param>
    /// <param name="transcript">Transcript text together with its word count and source.</param>
    /// <param name="mode">
    /// Selects the prompt set: <see cref="SummaryMode.PersonalFilter"/> uses the
    /// three-section personal-filter prompts; any other value uses the standard prompts.
    /// </param>
    /// <param name="ct">Token used to cancel the underlying API calls.</param>
    /// <returns>The summary text plus metadata, transcript source, optional quality warning and model name.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="metadata"/> or <paramref name="transcript"/> is <c>null</c>.
    /// </exception>
    public async Task<VideoSummary> SummarizeAsync(
        VideoMetadata metadata,
        VideoTranscript transcript,
        SummaryMode mode = SummaryMode.Standard,
        CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(metadata);
        ArgumentNullException.ThrowIfNull(transcript);

        string summaryText;

        // Select prompt set based on mode
        var chunkPrompt = mode == SummaryMode.PersonalFilter
            ? PersonalFilterSystemPrompt : ChunkSystemPrompt;
        var combinePrompt = mode == SummaryMode.PersonalFilter
            ? PersonalFilterCombinePrompt : CombineSystemPrompt;

        if (transcript.WordCount <= _summarizerSettings.ChunkWordLimit)
        {
            // Short video — single API call is sufficient
            summaryText = await SinglePassSummarizeAsync(transcript.Text, metadata, chunkPrompt, ct);
        }
        else
        {
            // Long video — chunk-and-combine strategy
            summaryText = await ChunkedSummarizeAsync(transcript.Text, metadata, chunkPrompt, combinePrompt, ct);
        }

        // Attach a quality warning when the transcript quality is uncertain
        var warning = BuildQualityWarning(transcript.Source);

        return new VideoSummary
        {
            Metadata = metadata,
            SummaryText = summaryText,
            TranscriptSource = transcript.Source,
            QualityWarning = warning,
            ModelUsed = _llmSettings.Model
        };
    }

    // ─────────────────────────────────────────────────────────────────────────
    // Summarization strategies
    // ─────────────────────────────────────────────────────────────────────────

    /// <summary>
    /// Single-pass: sends the entire transcript in one API call.
    /// Best for videos under ~30 minutes (roughly 3000–4000 words).
    /// </summary>
    private async Task<string> SinglePassSummarizeAsync(
        string transcriptText,
        VideoMetadata metadata,
        string systemPrompt,
        CancellationToken ct)
    {
        var userMessage = BuildUserPrompt(metadata, transcriptText);
        return await CallChatCompletionAsync(systemPrompt, userMessage, ct);
    }

    /// <summary>
    /// Map-reduce: splits long transcripts, summarizes each chunk, then combines.
    ///
    /// Overlap: each chunk after the first begins with a brief overlap window
    /// (up to the last <see cref="ChunkOverlapWords"/> words of the previous chunk)
    /// so the model retains context across chunk boundaries and avoids abrupt
    /// topic changes in the summaries.
    ///
    /// If the transcript yields at most one chunk, the reduce phase is skipped
    /// and the transcript is summarized in a single pass instead.
    /// </summary>
    private async Task<string> ChunkedSummarizeAsync(
        string transcriptText,
        VideoMetadata metadata,
        string chunkSystemPrompt,
        string combineSystemPrompt,
        CancellationToken ct)
    {
        // A null separator splits on every Unicode white-space character, so
        // newline- or tab-separated caption text is counted word by word
        // instead of being glued into oversized "words".
        var words = transcriptText.Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries);
        var chunks = SplitIntoChunks(words, _summarizerSettings.ChunkWordLimit, overlapWords: ChunkOverlapWords);

        if (chunks.Count <= 1)
        {
            // Nothing to combine — a "summary of one summary" would only lose detail.
            return await SinglePassSummarizeAsync(transcriptText, metadata, chunkSystemPrompt, ct);
        }

        Console.WriteLine($"\n  [Chunking] Transcript split into {chunks.Count} chunks for processing...");

        // Map phase: summarize each chunk in sequence
        // (Parallel would be faster but could hit rate limits — sequential is safer)
        var chunkSummaries = new List<string>(chunks.Count);
        for (int i = 0; i < chunks.Count; i++)
        {
            Console.Write($"  [Chunk {i + 1}/{chunks.Count}] Summarizing");
            var chunkText = string.Join(" ", chunks[i]);
            var prompt = $"This is segment {i + 1} of {chunks.Count} from the video \"{metadata.Title}\":\n\n{chunkText}";
            var summary = await CallChatCompletionAsync(chunkSystemPrompt, prompt, ct);
            chunkSummaries.Add(summary);
        }

        // Reduce phase: combine all chunk summaries into one coherent summary
        Console.Write("  [Combine] Merging chunk summaries into final summary");
        var combinedInput = string.Join("\n\n---\n\n",
            chunkSummaries.Select((s, i) => $"Segment {i + 1} summary:\n{s}"));

        var combinePrompt = $"Video: \"{metadata.Title}\" by {metadata.ChannelTitle}\n\n" +
                            $"The following are summaries of {chunks.Count} consecutive segments:\n\n{combinedInput}";

        return await CallChatCompletionAsync(combineSystemPrompt, combinePrompt, ct);
    }

    // ─────────────────────────────────────────────────────────────────────────
    // Helpers
    // ─────────────────────────────────────────────────────────────────────────

    /// <summary>
    /// Sends a system + user message pair to the Chat Completions endpoint using
    /// the streaming API and returns the assistant's full reply text.
    /// Writes progress markers and the elapsed time to the console.
    /// </summary>
    /// <param name="systemPrompt">Instructions placed in the system message.</param>
    /// <param name="userMessage">Content placed in the user message.</param>
    /// <param name="ct">Token used to cancel the streaming request.</param>
    /// <returns>
    /// The concatenated text of all streamed content parts; empty if the model
    /// produced no text.
    /// </returns>
    private async Task<string> CallChatCompletionAsync(
        string systemPrompt,
        string userMessage,
        CancellationToken ct)
    {
        var messages = new List<ChatMessage>
        {
            new SystemChatMessage(systemPrompt),
            new UserChatMessage(userMessage)
        };

        var options = new ChatCompletionOptions
        {
            MaxOutputTokenCount = _llmSettings.MaxTokens
        };

        var sw = System.Diagnostics.Stopwatch.StartNew();
        var fullContent = new System.Text.StringBuilder();
        int nextDotAt = ProgressDotInterval;
        bool completed = false;

        try
        {
            var streamingUpdates = _chatClient.CompleteChatStreamingAsync(messages, options, ct);

            await foreach (var update in streamingUpdates)
            {
                foreach (var part in update.ContentUpdate)
                {
                    if (string.IsNullOrEmpty(part.Text))
                    {
                        continue;
                    }

                    if (fullContent.Length == 0)
                    {
                        // First token received!
                        Console.Write(" (working)");
                    }

                    fullContent.Append(part.Text);

                    // Show progress: one dot per ProgressDotInterval characters of
                    // output. A threshold counter is used because streamed parts
                    // have arbitrary lengths, so the running total rarely lands
                    // exactly on a multiple of the interval.
                    while (fullContent.Length >= nextDotAt)
                    {
                        Console.Write(".");
                        nextDotAt += ProgressDotInterval;
                    }
                }
            }

            completed = true;
        }
        finally
        {
            sw.Stop();

            // Always terminate the progress line, but only claim success when
            // the stream was consumed to the end (not on error or cancellation).
            Console.WriteLine(completed
                ? $" Done! ({sw.Elapsed.TotalSeconds:F1}s)"
                : $" Failed! ({sw.Elapsed.TotalSeconds:F1}s)");
        }

        return fullContent.ToString();
    }

    /// <summary>
    /// Builds the user-turn prompt for a single-pass summarization.
    /// Including the title and channel anchors the model to the subject matter,
    /// which reduces hallucination on ambiguous ASR transcripts.
    /// </summary>
    private static string BuildUserPrompt(VideoMetadata metadata, string transcriptText)
    {
        return $"""
            Video title: {metadata.Title}
            Channel: {metadata.ChannelTitle}
            Published: {metadata.PublishedAt:MMMM d, yyyy}
            Duration: {metadata.FormattedDuration}

            Full transcript:
            {transcriptText}
            """;
    }

    /// <summary>
    /// Splits a word array into overlapping chunks of at most <paramref name="chunkSize"/> words.
    /// Each chunk after the first starts with the trailing words of the previous
    /// chunk, which prevents the model from missing context at chunk boundaries.
    /// </summary>
    /// <param name="words">The transcript split into individual words.</param>
    /// <param name="chunkSize">Maximum number of words per chunk; must be positive.</param>
    /// <param name="overlapWords">
    /// Requested number of words shared between consecutive chunks. The effective
    /// overlap is limited to the range 0 to half of <paramref name="chunkSize"/>,
    /// so every chunk advances through the transcript and the loop always terminates.
    /// </param>
    /// <returns>The chunks in transcript order; empty when <paramref name="words"/> is empty.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="words"/> is <c>null</c>.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="chunkSize"/> is zero or negative.</exception>
    private static List<string[]> SplitIntoChunks(string[] words, int chunkSize, int overlapWords)
    {
        ArgumentNullException.ThrowIfNull(words);
        if (chunkSize <= 0)
        {
            throw new ArgumentOutOfRangeException(
                nameof(chunkSize), chunkSize, "Chunk size must be a positive number of words.");
        }

        // An overlap >= chunkSize would make the start index stall or move
        // backwards (infinite loop / negative range). Capping it at half the
        // chunk guarantees a stride of at least one word and keeps the number
        // of chunks reasonable for small chunk sizes.
        int overlap = Math.Clamp(overlapWords, 0, chunkSize / 2);
        int stride = chunkSize - overlap;

        var chunks = new List<string[]>();

        for (int start = 0; start < words.Length; start += stride)
        {
            int end = Math.Min(start + chunkSize, words.Length);
            chunks.Add(words[start..end]);

            // The chunk that reaches the end of the transcript is the last one;
            // continuing would only re-emit words that are already covered.
            if (end == words.Length)
            {
                break;
            }
        }

        return chunks;
    }

    /// <summary>
    /// Returns a human-readable warning when transcript quality may affect summary accuracy.
    /// Returns null for high-confidence sources (no warning needed).
    /// </summary>
    private static string? BuildQualityWarning(TranscriptSource source) =>
        source switch
        {
            TranscriptSource.AutoGenerated =>
                "⚠  This summary is based on YouTube's auto-generated captions (ASR). " +
                "The transcript may contain errors, especially for technical terms, names, or accented speech.",

            TranscriptSource.MetadataOnly =>
                "⚠  No captions were available. This summary is based on the video's title " +
                "and description only — it may be incomplete or inaccurate.",

            TranscriptSource.CommunityContributed =>
                "ℹ  This summary is based on community-contributed captions. " +
                "Quality is generally good but not guaranteed.",

            _ => null  // OwnerPublished — no warning needed
        };
}