343 lines
15 KiB
C#
343 lines
15 KiB
C#
using OpenAI;
|
||
using OpenAI.Chat;
|
||
using YoutubeSummarizer.Configuration;
|
||
using YoutubeSummarizer.Models;
|
||
|
||
namespace YoutubeSummarizer.Services;
|
||
|
||
/// <summary>
/// Sends transcript text to an OpenAI-compatible Chat Completions API and returns a
/// structured summary.
///
/// Long transcripts (word count > ChunkWordLimit) are handled with a
/// "map-reduce" strategy:
///   1. Split the transcript into overlapping chunks.
///   2. Summarize each chunk independently (map phase).
///   3. Combine chunk summaries into a final cohesive summary (reduce phase).
///
/// This keeps individual API calls within model context limits while still
/// producing an accurate summary of long-form content like hour-long lectures.
/// </summary>
public sealed class SummarizerService
{
    /// <summary>Number of words shared between two consecutive chunks.</summary>
    private const int ChunkOverlapWords = 200;

    /// <summary>A progress dot is printed each time this many more characters have streamed in.</summary>
    private const int ProgressDotInterval = 20;

    private readonly LlmSettings _llmSettings;
    private readonly SummarizerSettings _summarizerSettings;
    private readonly ChatClient _chatClient;

    // System prompt used for single-pass and chunk summarization.
    // Keeping it focused on facts and structure produces better summaries
    // than open-ended "summarize this" prompts.
    private const string ChunkSystemPrompt = """
        You are a precise, factual assistant that summarizes YouTube video transcripts.
        When given a transcript segment, produce a concise summary that:
        - Captures the key points, arguments, and conclusions
        - Preserves any specific facts, names, dates, or statistics mentioned
        - Uses bullet points for individual points, then a short paragraph for the overall gist
        - Omits filler words, repeated phrases, and off-topic tangents
        - Does NOT add information not present in the transcript
        Respond with only the summary text, no preamble.
        """;

    // Personal Information Filter — concise relevance-based summary.
    private const string PersonalFilterSystemPrompt = """
        You are a concise, factual assistant that applies a Personal Information Filter
        to YouTube video transcripts. When given a transcript, respond with EXACTLY
        three sections and nothing else:

        Summary – A concise, plain-English summary in 1–2 sentences.

        Why it matters – Directly evaluate relevance only against these priorities:
        time, finances, health, family, service to others.
        If none apply, say so clearly.

        Priority tag – End with a single word verdict: ACT, MONITOR, or IGNORE.

        Constraints:
        - Do not timestamp or number entries.
        - Do not infer user interest beyond what is explicitly provided.
        - Do not expand or add context unless the user requests it.
        - The burden of interest is on the user.
        - Respond with only the three sections above, no preamble.
        """;

    // Personal Filter combine prompt for long transcripts.
    private const string PersonalFilterCombinePrompt = """
        You are a concise, factual assistant. You will receive several partial summaries
        of consecutive segments of a YouTube video, each formatted with Summary,
        Why it matters, and Priority tag sections. Combine them into a single response
        using the same three-section format:

        Summary – A concise, plain-English summary of the entire video in 1–2 sentences.

        Why it matters – Directly evaluate relevance only against these priorities:
        time, finances, health, family, service to others.
        If none apply, say so clearly.

        Priority tag – A single word verdict: ACT, MONITOR, or IGNORE.

        Respond with only these three sections, no preamble.
        """;

    // Used in the reduce phase to combine chunk summaries coherently.
    private const string CombineSystemPrompt = """
        You are a precise, factual assistant. You will receive several partial summaries
        of consecutive segments of a YouTube video. Your task is to combine them into
        a single, coherent, well-structured summary that:
        - Flows as a unified narrative, not as a list of sub-summaries
        - Preserves all key facts, names, dates, and statistics
        - Uses bullet points for supporting details beneath each main topic
        - Omits redundant information that appears across multiple segments
        - Concludes with a 2–3 sentence takeaway paragraph
        Respond with only the combined summary, no preamble.
        """;

    /// <summary>
    /// Creates the service and its chat client from the supplied settings.
    /// </summary>
    /// <param name="llmSettings">Model name, API key, base URL, timeout and output-token limit for the LLM endpoint.</param>
    /// <param name="summarizerSettings">Summarization settings; <c>ChunkWordLimit</c> decides when chunking is used.</param>
    /// <exception cref="ArgumentNullException">Either argument is null.</exception>
    public SummarizerService(LlmSettings llmSettings, SummarizerSettings summarizerSettings)
    {
        ArgumentNullException.ThrowIfNull(llmSettings);
        ArgumentNullException.ThrowIfNull(summarizerSettings);

        _llmSettings = llmSettings;
        _summarizerSettings = summarizerSettings;

        // Initialize the client with the specified model and endpoint.
        // We use the OpenAI SDK's ability to point to any OpenAI-compatible API (like Ollama).
        _chatClient = new ChatClient(
            model: llmSettings.Model,
            credential: new System.ClientModel.ApiKeyCredential(llmSettings.ApiKey),
            options: new OpenAIClientOptions
            {
                Endpoint = new Uri(llmSettings.BaseUrl),
                NetworkTimeout = TimeSpan.FromSeconds(llmSettings.TimeoutSeconds)
            });
    }

    // ─────────────────────────────────────────────────────────────────────────
    // Public API
    // ─────────────────────────────────────────────────────────────────────────

    /// <summary>
    /// Produces a <see cref="VideoSummary"/> from the video's metadata and transcript.
    /// Automatically routes to single-pass or chunked strategy based on word count.
    /// </summary>
    /// <param name="metadata">Video metadata embedded in the prompts and copied to the result.</param>
    /// <param name="transcript">Transcript text together with its word count and source.</param>
    /// <param name="mode">Selects the prompt set (standard summary or personal filter).</param>
    /// <param name="ct">Token used to cancel the underlying API calls.</param>
    /// <returns>The summary text plus metadata, transcript source, quality warning and model name.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="metadata"/> or <paramref name="transcript"/> is null.</exception>
    public async Task<VideoSummary> SummarizeAsync(
        VideoMetadata metadata,
        VideoTranscript transcript,
        SummaryMode mode = SummaryMode.Standard,
        CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(metadata);
        ArgumentNullException.ThrowIfNull(transcript);

        // Select prompt set based on mode
        var (chunkPrompt, combinePrompt) = mode == SummaryMode.PersonalFilter
            ? (PersonalFilterSystemPrompt, PersonalFilterCombinePrompt)
            : (ChunkSystemPrompt, CombineSystemPrompt);

        // Short video — a single API call is sufficient.
        // Long video — chunk-and-combine strategy.
        string summaryText = transcript.WordCount <= _summarizerSettings.ChunkWordLimit
            ? await SinglePassSummarizeAsync(transcript.Text, metadata, chunkPrompt, ct)
            : await ChunkedSummarizeAsync(transcript.Text, metadata, chunkPrompt, combinePrompt, ct);

        return new VideoSummary
        {
            Metadata = metadata,
            SummaryText = summaryText,
            TranscriptSource = transcript.Source,
            // Attach a quality warning when the transcript quality is uncertain
            QualityWarning = BuildQualityWarning(transcript.Source),
            ModelUsed = _llmSettings.Model
        };
    }

    // ─────────────────────────────────────────────────────────────────────────
    // Summarization strategies
    // ─────────────────────────────────────────────────────────────────────────

    /// <summary>
    /// Single-pass: sends the entire transcript in one API call.
    /// Best for videos under ~30 minutes (roughly 3000–4000 words).
    /// </summary>
    private async Task<string> SinglePassSummarizeAsync(
        string transcriptText,
        VideoMetadata metadata,
        string systemPrompt,
        CancellationToken ct)
    {
        var userMessage = BuildUserPrompt(metadata, transcriptText);
        return await CallChatCompletionAsync(systemPrompt, userMessage, ct);
    }

    /// <summary>
    /// Map-reduce: splits long transcripts, summarizes each chunk, then combines.
    ///
    /// Overlap: each chunk after the first begins with the last
    /// <see cref="ChunkOverlapWords"/> words of the previous chunk, so the model
    /// retains context across chunk boundaries and avoids abrupt topic changes
    /// in the summaries.
    /// </summary>
    private async Task<string> ChunkedSummarizeAsync(
        string transcriptText,
        VideoMetadata metadata,
        string chunkSystemPrompt,
        string combineSystemPrompt,
        CancellationToken ct)
    {
        // A null separator splits on any whitespace, so newline- or tab-separated
        // tokens count as separate words instead of being glued together.
        var words = transcriptText.Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries);
        var chunks = SplitIntoChunks(words, _summarizerSettings.ChunkWordLimit, ChunkOverlapWords);

        Console.WriteLine($"\n [Chunking] Transcript split into {chunks.Count} chunks for processing...");

        // Map phase: summarize each chunk in sequence
        // (Parallel would be faster but could hit rate limits — sequential is safer)
        var chunkSummaries = new List<string>(chunks.Count);
        for (int i = 0; i < chunks.Count; i++)
        {
            Console.Write($" [Chunk {i + 1}/{chunks.Count}] Summarizing");
            var chunkText = string.Join(" ", chunks[i]);
            var prompt = $"This is segment {i + 1} of {chunks.Count} from the video \"{metadata.Title}\":\n\n{chunkText}";
            var summary = await CallChatCompletionAsync(chunkSystemPrompt, prompt, ct);
            chunkSummaries.Add(summary);
        }

        // Reduce phase: combine all chunk summaries into one coherent summary
        Console.Write(" [Combine] Merging chunk summaries into final summary");
        var combinedInput = string.Join("\n\n---\n\n",
            chunkSummaries.Select((s, i) => $"Segment {i + 1} summary:\n{s}"));

        var combinePrompt = $"Video: \"{metadata.Title}\" by {metadata.ChannelTitle}\n\n" +
                            $"The following are summaries of {chunks.Count} consecutive segments:\n\n{combinedInput}";

        return await CallChatCompletionAsync(combineSystemPrompt, combinePrompt, ct);
    }

    // ─────────────────────────────────────────────────────────────────────────
    // Helpers
    // ─────────────────────────────────────────────────────────────────────────

    /// <summary>
    /// Sends a system + user message pair to the Chat Completions endpoint
    /// (streaming) and returns the assistant's full reply text. Progress is
    /// written to the console while tokens arrive.
    /// </summary>
    /// <param name="systemPrompt">System-role instructions.</param>
    /// <param name="userMessage">User-role content.</param>
    /// <param name="ct">Token used to cancel the streaming request.</param>
    /// <returns>The concatenated text of all streamed content parts.</returns>
    private async Task<string> CallChatCompletionAsync(
        string systemPrompt,
        string userMessage,
        CancellationToken ct)
    {
        var messages = new List<ChatMessage>
        {
            new SystemChatMessage(systemPrompt),
            new UserChatMessage(userMessage)
        };

        var options = new ChatCompletionOptions
        {
            MaxOutputTokenCount = _llmSettings.MaxTokens
        };

        var sw = System.Diagnostics.Stopwatch.StartNew();
        var fullContent = new System.Text.StringBuilder();
        int nextDotAt = ProgressDotInterval;
        bool succeeded = false;

        try
        {
            var streamingUpdates = _chatClient.CompleteChatStreamingAsync(messages, options, ct);

            await foreach (var update in streamingUpdates)
            {
                foreach (var part in update.ContentUpdate)
                {
                    if (string.IsNullOrEmpty(part.Text))
                    {
                        continue;
                    }

                    if (fullContent.Length == 0)
                    {
                        // First token received!
                        Console.Write(" (working)");
                    }

                    fullContent.Append(part.Text);

                    // Show liveness: one dot each time the output has grown by at least
                    // ProgressDotInterval characters. A threshold is used rather than
                    // "Length % interval == 0" because streamed parts have arbitrary
                    // lengths and rarely land exactly on a multiple.
                    if (fullContent.Length >= nextDotAt)
                    {
                        Console.Write(".");
                        nextDotAt = fullContent.Length + ProgressDotInterval;
                    }
                }
            }

            succeeded = true;
        }
        finally
        {
            sw.Stop();

            // Always terminate the progress line, but do not claim success when the
            // request threw or was cancelled.
            Console.WriteLine(succeeded
                ? $" Done! ({sw.Elapsed.TotalSeconds:F1}s)"
                : $" Failed! ({sw.Elapsed.TotalSeconds:F1}s)");
        }

        return fullContent.ToString();
    }

    /// <summary>
    /// Builds the user-turn prompt for a single-pass summarization.
    /// Including the title and channel anchors the model to the subject matter,
    /// which reduces hallucination on ambiguous ASR transcripts.
    /// </summary>
    private static string BuildUserPrompt(VideoMetadata metadata, string transcriptText)
    {
        // Format with the invariant culture so the publish date is rendered with
        // English month names regardless of the machine's regional settings.
        return string.Create(
            System.Globalization.CultureInfo.InvariantCulture,
            $"""
            Video title: {metadata.Title}
            Channel: {metadata.ChannelTitle}
            Published: {metadata.PublishedAt:MMMM d, yyyy}
            Duration: {metadata.FormattedDuration}

            Full transcript:
            {transcriptText}
            """);
    }

    /// <summary>
    /// Splits a word array into overlapping chunks of roughly <paramref name="chunkSize"/> words.
    /// The overlap prevents the model from missing context at chunk boundaries.
    /// </summary>
    /// <param name="words">The transcript words, in order.</param>
    /// <param name="chunkSize">Maximum number of words per chunk; must be positive.</param>
    /// <param name="overlapWords">
    /// Number of words each chunk shares with the previous one. Negative values are
    /// treated as 0. A value that is not smaller than <paramref name="chunkSize"/>
    /// cannot make progress, so half a chunk is used instead.
    /// </param>
    /// <returns>The chunks in order; empty when <paramref name="words"/> is empty.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="words"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="chunkSize"/> is zero or negative.</exception>
    private static List<string[]> SplitIntoChunks(string[] words, int chunkSize, int overlapWords)
    {
        ArgumentNullException.ThrowIfNull(words);
        if (chunkSize <= 0)
        {
            throw new ArgumentOutOfRangeException(nameof(chunkSize), chunkSize, "Chunk size must be positive.");
        }

        // The window has to advance by at least one word per iteration. With
        // overlap >= chunkSize the start index would stand still (endless loop)
        // or move backwards (negative index), so such an overlap is replaced.
        int overlap = overlapWords;
        if (overlap < 0)
        {
            overlap = 0;
        }
        else if (overlap >= chunkSize)
        {
            overlap = chunkSize / 2;
        }

        int stride = chunkSize - overlap;
        var chunks = new List<string[]>();

        for (int start = 0; start < words.Length; start += stride)
        {
            int end = Math.Min(start + chunkSize, words.Length);
            chunks.Add(words[start..end]);

            // Once a chunk reaches the last word, everything is covered; a further
            // chunk would consist only of overlap.
            if (end == words.Length)
            {
                break;
            }
        }

        return chunks;
    }

    /// <summary>
    /// Returns a human-readable warning when transcript quality may affect summary accuracy.
    /// Returns null for every source not listed below (no warning needed).
    /// </summary>
    private static string? BuildQualityWarning(TranscriptSource source) =>
        source switch
        {
            TranscriptSource.AutoGenerated =>
                "⚠ This summary is based on YouTube's auto-generated captions (ASR). " +
                "The transcript may contain errors, especially for technical terms, names, or accented speech.",

            TranscriptSource.MetadataOnly =>
                "⚠ No captions were available. This summary is based on the video's title " +
                "and description only — it may be incomplete or inaccurate.",

            TranscriptSource.CommunityContributed =>
                "ℹ This summary is based on community-contributed captions. " +
                "Quality is generally good but not guaranteed.",

            _ => null // OwnerPublished — no warning needed
        };
}
|