summarizer/TranscriptFileService.cs

213 lines
9.3 KiB
C#

using System.Text;
using YoutubeSummarizer.Models;
namespace YoutubeSummarizer.Services;
/// <summary>
/// Saves video metadata and a timestamped transcript to a plain text file.
/// The file starts with a metadata header (optionally followed by a summary),
/// then lists the transcript grouped into timestamped blocks.
/// </summary>
public static class TranscriptFileService
{
    /// <summary>Maximum width, in characters, of wrapped body text (excluding the leading indent).</summary>
    private const int WrapWidth = 72;

    /// <summary>Segments starting within this many seconds of a block's first segment share that block.</summary>
    private const int BlockIntervalSeconds = 30;

    /// <summary>Upper bound, in UTF-16 code units, on the title portion of the file name.</summary>
    private const int MaxTitleLength = 80;

    /// <summary>Title portion used in the file name when the video title is null or blank.</summary>
    private const string FallbackTitle = "untitled";

    /// <summary>Horizontal rule framing each section heading.</summary>
    private const string HeavyRule = "════════════════════════════════════════════════════════════════";

    /// <summary>
    /// Characters treated as word breaks when wrapping. '\r' is included so that
    /// CRLF input does not leave a stray carriage return attached to the last word.
    /// </summary>
    private static readonly char[] WordSeparators = { ' ', '\t', '\r' };

    /// <summary>
    /// Writes the metadata, optional summary and transcript to
    /// <c>{sanitized title}_{video id}.txt</c> inside <paramref name="outputDirectory"/>,
    /// creating the directory if needed.
    /// </summary>
    /// <param name="metadata">Video metadata; its title and video id also form the file name.</param>
    /// <param name="transcript">Transcript to write, either as timestamped segments or as plain text.</param>
    /// <param name="summaryText">Optional summary placed in the header; skipped when null or blank.</param>
    /// <param name="outputDirectory">
    /// Target directory. Null, empty or whitespace falls back to <see cref="Environment.CurrentDirectory"/>.
    /// </param>
    /// <param name="ct">Token passed to the asynchronous file write.</param>
    /// <returns>
    /// The path of the written file: <paramref name="outputDirectory"/> combined with the generated file name.
    /// </returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="metadata"/> or <paramref name="transcript"/> is null.
    /// </exception>
    public static async Task<string> SaveAsync(
        VideoMetadata metadata,
        VideoTranscript transcript,
        string? summaryText = null,
        string? outputDirectory = null,
        CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(metadata);
        ArgumentNullException.ThrowIfNull(transcript);

        // Directory.CreateDirectory rejects empty/whitespace paths, so treat them like null.
        if (string.IsNullOrWhiteSpace(outputDirectory))
        {
            outputDirectory = Environment.CurrentDirectory;
        }

        Directory.CreateDirectory(outputDirectory);

        // Build a safe filename from the video title
        var fileName = $"{SanitizeFileName(metadata.Title)}_{metadata.VideoId}.txt";
        var filePath = Path.Combine(outputDirectory, fileName);

        var sb = new StringBuilder();
        AppendMetadataSection(sb, metadata, transcript, summaryText);
        AppendTranscriptSection(sb, transcript);

        await File.WriteAllTextAsync(filePath, sb.ToString(), ct);
        return filePath;
    }

    // ─────────────────────────────────────────────────────────────────────────
    // Helpers
    // ─────────────────────────────────────────────────────────────────────────

    /// <summary>
    /// Appends the header: video details, the optional summary, then the
    /// transcript source, word count and save time.
    /// </summary>
    private static void AppendMetadataSection(
        StringBuilder sb,
        VideoMetadata metadata,
        VideoTranscript transcript,
        string? summaryText)
    {
        sb.AppendLine(HeavyRule);
        sb.AppendLine(" VIDEO METADATA");
        sb.AppendLine(HeavyRule);
        sb.AppendLine();
        sb.AppendLine($" Title: {metadata.Title}");
        sb.AppendLine($" Channel: {metadata.ChannelTitle}");
        sb.AppendLine($" Published: {metadata.PublishedAt:MMMM d, yyyy}");
        sb.AppendLine($" Duration: {metadata.FormattedDuration}");
        sb.AppendLine($" Video ID: {metadata.VideoId}");
        sb.AppendLine($" URL: https://youtu.be/{metadata.VideoId}");

        if (!string.IsNullOrWhiteSpace(summaryText))
        {
            sb.AppendLine();
            sb.AppendLine(" ── SUMMARY ──────────────────────────────────────────────");
            sb.AppendLine();
            AppendWrapped(sb, summaryText);
        }

        sb.AppendLine();

        var sourceLabel = transcript.Source switch
        {
            TranscriptSource.OwnerPublished => "Owner-published captions",
            TranscriptSource.CommunityContributed => "Community-contributed captions",
            TranscriptSource.AutoGenerated => "Auto-generated (ASR)",
            TranscriptSource.MetadataOnly => "Metadata only (no captions)",
            _ => "Unknown"
        };
        sb.AppendLine($" Transcript Source: {sourceLabel}");
        sb.AppendLine($" Word Count: {transcript.WordCount:N0}");
        sb.AppendLine($" Saved: {DateTimeOffset.UtcNow:yyyy-MM-dd HH:mm} UTC");
        sb.AppendLine();
    }

    /// <summary>
    /// Appends the transcript body and the closing footer. Segments, when present,
    /// are grouped into timestamped blocks; otherwise the plain transcript text is written.
    /// </summary>
    private static void AppendTranscriptSection(StringBuilder sb, VideoTranscript transcript)
    {
        sb.AppendLine(HeavyRule);
        sb.AppendLine(" TRANSCRIPT");
        sb.AppendLine(HeavyRule);
        sb.AppendLine();

        if (transcript.Segments.Count > 0)
        {
            // One heading per time block instead of one line per subtitle cue.
            foreach (var block in GroupSegmentsByInterval(transcript.Segments, BlockIntervalSeconds))
            {
                sb.AppendLine($" [{block[0].FormattedTimestamp}]");
                AppendWrapped(sb, string.Join(" ", block.Select(s => s.Text)));
                sb.AppendLine();
            }
        }
        else
        {
            // No timestamps available — write plain text
            sb.AppendLine(" (No timestamp data available)");
            sb.AppendLine();
            AppendWrapped(sb, transcript.Text);
            sb.AppendLine();
        }

        sb.AppendLine(HeavyRule);
        sb.AppendLine(" END OF TRANSCRIPT");
        sb.AppendLine(HeavyRule);
    }

    /// <summary>
    /// Word-wraps <paramref name="text"/> to <see cref="WrapWidth"/> and appends each
    /// resulting line with a one-space indent.
    /// </summary>
    private static void AppendWrapped(StringBuilder sb, string? text)
    {
        foreach (var line in WordWrap(text, WrapWidth))
        {
            sb.AppendLine($" {line}");
        }
    }

    /// <summary>
    /// Groups timestamped segments into blocks based on a time interval.
    /// A new block starts once a segment begins at least
    /// <paramref name="intervalSeconds"/> after the first segment of the current block.
    /// </summary>
    /// <param name="segments">Segments in the order they should appear in the output.</param>
    /// <param name="intervalSeconds">Minimum span, in seconds, before a new block is opened.</param>
    /// <returns>The blocks in input order; empty when <paramref name="segments"/> is empty.</returns>
    private static List<List<TimestampedSegment>> GroupSegmentsByInterval(
        IReadOnlyList<TimestampedSegment> segments,
        int intervalSeconds)
    {
        var blocks = new List<List<TimestampedSegment>>();
        if (segments.Count == 0)
        {
            return blocks;
        }

        var currentBlock = new List<TimestampedSegment> { segments[0] };
        var blockStartTime = segments[0].Start;

        for (var i = 1; i < segments.Count; i++)
        {
            var segment = segments[i];
            if ((segment.Start - blockStartTime).TotalSeconds >= intervalSeconds)
            {
                blocks.Add(currentBlock);
                currentBlock = new List<TimestampedSegment>();
                blockStartTime = segment.Start;
            }

            currentBlock.Add(segment);
        }

        // The block in progress always holds at least one segment at this point.
        blocks.Add(currentBlock);
        return blocks;
    }

    /// <summary>
    /// Replaces characters that are invalid in file names with underscores, collapses
    /// runs of whitespace/underscores into a single underscore, and truncates the result
    /// to <see cref="MaxTitleLength"/> to avoid path-length issues.
    /// </summary>
    /// <param name="title">Raw video title; may be null or blank.</param>
    /// <returns>
    /// The sanitized title, or <see cref="FallbackTitle"/> when <paramref name="title"/>
    /// is null or blank.
    /// </returns>
    private static string SanitizeFileName(string? title)
    {
        if (string.IsNullOrWhiteSpace(title))
        {
            return FallbackTitle;
        }

        var invalid = Path.GetInvalidFileNameChars();
        var sb = new StringBuilder(title.Length);
        foreach (var ch in title)
        {
            sb.Append(Array.IndexOf(invalid, ch) < 0 ? ch : '_');
        }

        // Replace runs of spaces/underscores with a single underscore
        var result = System.Text.RegularExpressions.Regex.Replace(
            sb.ToString().Trim(), @"[\s_]+", "_");

        if (result.Length <= MaxTitleLength)
        {
            return result;
        }

        // Do not cut a surrogate pair in half: a lone high surrogate is not valid text.
        var length = char.IsHighSurrogate(result[MaxTitleLength - 1])
            ? MaxTitleLength - 1
            : MaxTitleLength;
        return result[..length];
    }

    /// <summary>
    /// Word-wraps text at the specified width, breaking at word boundaries.
    /// Each '\n'-separated paragraph is wrapped independently; a blank paragraph
    /// yields one empty line. A single word longer than <paramref name="maxWidth"/>
    /// is emitted on its own line without being split.
    /// </summary>
    /// <param name="text">Text to wrap; null is treated as empty.</param>
    /// <param name="maxWidth">Maximum line length in characters.</param>
    private static IEnumerable<string> WordWrap(string? text, int maxWidth)
    {
        foreach (var paragraph in (text ?? string.Empty).Split('\n'))
        {
            if (string.IsNullOrWhiteSpace(paragraph))
            {
                yield return string.Empty;
                continue;
            }

            var current = new StringBuilder();
            foreach (var word in paragraph.Split(WordSeparators, StringSplitOptions.RemoveEmptyEntries))
            {
                // Only flush a line that has content; the separator is counted
                // only when there is already text for it to follow.
                if (current.Length > 0 && current.Length + 1 + word.Length > maxWidth)
                {
                    yield return current.ToString();
                    current.Clear();
                }

                if (current.Length > 0)
                {
                    current.Append(' ');
                }

                current.Append(word);
            }

            if (current.Length > 0)
            {
                yield return current.ToString();
            }
        }
    }
}