// Source listing: C#, 213 lines, 9.3 KiB
using System.Text;
|
|
using YoutubeSummarizer.Models;
|
|
|
|
namespace YoutubeSummarizer.Services;
|
|
|
|
/// <summary>
|
|
/// Saves video metadata and timestamped transcript to a plain text file.
|
|
/// The file is formatted with metadata at the top followed by the transcript
|
|
/// organized by timestamps.
|
|
/// </summary>
|
|
public static class TranscriptFileService
|
|
{
|
|
/// <summary>
/// Saves the video metadata, an optional summary, and the transcript to a plain
/// text file and returns the full path of the file that was written.
/// </summary>
/// <remarks>
/// The file name is built from the sanitized video title and the video ID
/// (<c>{title}_{videoId}.txt</c>). The whole document is composed in memory and
/// written with a single call, so an existing file with the same name is overwritten.
/// </remarks>
/// <param name="metadata">Video details written to the header (title, channel, publish date, duration, video ID).</param>
/// <param name="transcript">Transcript to write. Timestamped segments are used when there are any; otherwise the plain transcript text is written.</param>
/// <param name="summaryText">Optional summary placed under the metadata; skipped when null or whitespace.</param>
/// <param name="outputDirectory">Target directory, created if it does not exist. The current directory is used when null or empty.</param>
/// <param name="ct">Cancellation token passed to the file write.</param>
/// <returns>The output directory combined with the generated file name.</returns>
/// <exception cref="ArgumentNullException"><paramref name="metadata"/> or <paramref name="transcript"/> is null.</exception>
public static async Task<string> SaveAsync(
    VideoMetadata metadata,
    VideoTranscript transcript,
    string? summaryText = null,
    string? outputDirectory = null,
    CancellationToken ct = default)
{
    ArgumentNullException.ThrowIfNull(metadata);
    ArgumentNullException.ThrowIfNull(transcript);

    // An empty path makes Directory.CreateDirectory throw, so treat it the same as "not specified".
    if (string.IsNullOrEmpty(outputDirectory))
    {
        outputDirectory = Environment.CurrentDirectory;
    }

    Directory.CreateDirectory(outputDirectory);

    // Build a safe filename from the video title
    var fileName = $"{SanitizeFileName(metadata.Title)}_{metadata.VideoId}.txt";
    var filePath = Path.Combine(outputDirectory, fileName);

    var sb = new StringBuilder();
    AppendMetadataSection(sb, metadata, summaryText);
    AppendTranscriptDetails(sb, transcript);
    AppendTranscriptSection(sb, transcript);

    await File.WriteAllTextAsync(filePath, sb.ToString(), ct);
    return filePath;
}

/// <summary>Horizontal rule that frames the section headings.</summary>
private const string SectionRule = "════════════════════════════════════════════════════════════════";

/// <summary>Maximum width of wrapped summary and transcript lines.</summary>
private const int WrapWidth = 72;

/// <summary>Length in seconds of the time window used to group transcript segments into blocks.</summary>
private const int BlockIntervalSeconds = 30;

/// <summary>Writes the metadata header, the optional summary, and a trailing blank line.</summary>
private static void AppendMetadataSection(StringBuilder sb, VideoMetadata metadata, string? summaryText)
{
    sb.AppendLine(SectionRule);
    sb.AppendLine(" VIDEO METADATA");
    sb.AppendLine(SectionRule);
    sb.AppendLine();
    sb.AppendLine($" Title: {metadata.Title}");
    sb.AppendLine($" Channel: {metadata.ChannelTitle}");
    sb.AppendLine($" Published: {metadata.PublishedAt:MMMM d, yyyy}");
    sb.AppendLine($" Duration: {metadata.FormattedDuration}");
    sb.AppendLine($" Video ID: {metadata.VideoId}");
    sb.AppendLine($" URL: https://youtu.be/{metadata.VideoId}");

    if (!string.IsNullOrWhiteSpace(summaryText))
    {
        sb.AppendLine();
        sb.AppendLine(" ── SUMMARY ──────────────────────────────────────────────");
        sb.AppendLine();
        AppendWrappedLines(sb, summaryText);
    }

    sb.AppendLine();
}

/// <summary>Writes the transcript source label, the word count, and the UTC save time.</summary>
private static void AppendTranscriptDetails(StringBuilder sb, VideoTranscript transcript)
{
    var sourceLabel = transcript.Source switch
    {
        TranscriptSource.OwnerPublished => "Owner-published captions",
        TranscriptSource.CommunityContributed => "Community-contributed captions",
        TranscriptSource.AutoGenerated => "Auto-generated (ASR)",
        TranscriptSource.MetadataOnly => "Metadata only (no captions)",
        _ => "Unknown"
    };

    sb.AppendLine($" Transcript Source: {sourceLabel}");
    sb.AppendLine($" Word Count: {transcript.WordCount:N0}");
    sb.AppendLine($" Saved: {DateTimeOffset.UtcNow:yyyy-MM-dd HH:mm} UTC");
    sb.AppendLine();
}

/// <summary>Writes the transcript heading, the transcript body, and the closing footer.</summary>
private static void AppendTranscriptSection(StringBuilder sb, VideoTranscript transcript)
{
    sb.AppendLine(SectionRule);
    sb.AppendLine(" TRANSCRIPT");
    sb.AppendLine(SectionRule);
    sb.AppendLine();

    if (transcript.Segments.Count > 0)
    {
        // Group the cues into time blocks so the file reads as short passages
        // instead of one line per subtitle cue.
        var blocks = GroupSegmentsByInterval(transcript.Segments, intervalSeconds: BlockIntervalSeconds);

        foreach (var block in blocks)
        {
            // Each block is labelled with the timestamp of its first segment.
            sb.AppendLine($" [{block[0].FormattedTimestamp}]");
            AppendWrappedLines(sb, string.Join(" ", block.Select(s => s.Text)));
            sb.AppendLine();
        }
    }
    else
    {
        // No timestamps available — write plain text
        sb.AppendLine(" (No timestamp data available)");
        sb.AppendLine();
        AppendWrappedLines(sb, transcript.Text);
        sb.AppendLine();
    }

    sb.AppendLine(SectionRule);
    sb.AppendLine(" END OF TRANSCRIPT");
    sb.AppendLine(SectionRule);
}

/// <summary>Word-wraps <paramref name="text"/> and appends each resulting line with a leading space.</summary>
private static void AppendWrappedLines(StringBuilder sb, string text)
{
    foreach (var wrappedLine in WordWrap(text, maxWidth: WrapWidth))
    {
        sb.AppendLine($" {wrappedLine}");
    }
}
|
|
|
|
// ─────────────────────────────────────────────────────────────────────────
|
|
// Helpers
|
|
// ─────────────────────────────────────────────────────────────────────────
|
|
|
|
/// <summary>
/// Splits an ordered list of timestamped segments into consecutive blocks.
/// A new block begins whenever a segment starts at least
/// <paramref name="intervalSeconds"/> seconds after the first segment of the
/// current block, which yields readable chunks rather than one entry per cue.
/// </summary>
/// <param name="segments">Segments to group, processed in the order given.</param>
/// <param name="intervalSeconds">Length of the time window that opens a new block.</param>
/// <returns>The blocks in order; an empty list when there are no segments.</returns>
private static List<List<TimestampedSegment>> GroupSegmentsByInterval(
    IReadOnlyList<TimestampedSegment> segments,
    int intervalSeconds)
{
    var grouped = new List<List<TimestampedSegment>>();
    if (segments.Count == 0)
    {
        return grouped;
    }

    // The first segment always opens the first block and is never compared
    // against the interval itself.
    var bucket = new List<TimestampedSegment> { segments[0] };
    grouped.Add(bucket);
    var anchor = segments[0].Start;

    for (var index = 1; index < segments.Count; index++)
    {
        var segment = segments[index];
        var startsNewBlock = (segment.Start - anchor).TotalSeconds >= intervalSeconds;

        if (startsNewBlock)
        {
            // The window is measured from the first segment of each block.
            bucket = new List<TimestampedSegment>();
            grouped.Add(bucket);
            anchor = segment.Start;
        }

        bucket.Add(segment);
    }

    return grouped;
}
|
|
|
|
/// <summary>
/// Turns a video title into a string that is safe to use as part of a file name.
/// </summary>
/// <remarks>
/// Characters reported by <see cref="Path.GetInvalidFileNameChars"/> are replaced
/// with underscores, runs of whitespace and underscores collapse into a single
/// underscore, and the result is limited to 80 UTF-16 code units to keep paths
/// manageable.
/// </remarks>
/// <param name="title">The raw title; null is treated like an empty title.</param>
/// <returns>The sanitized title, which is empty when the input is null, empty, or whitespace.</returns>
private static string SanitizeFileName(string title)
{
    if (string.IsNullOrEmpty(title))
    {
        return string.Empty;
    }

    var invalid = Path.GetInvalidFileNameChars();
    var sb = new StringBuilder(title.Length);

    foreach (var ch in title)
    {
        sb.Append(Array.IndexOf(invalid, ch) < 0 ? ch : '_');
    }

    // Replace runs of spaces/underscores with a single underscore
    var result = System.Text.RegularExpressions.Regex.Replace(
        sb.ToString().Trim(), @"[\s_]+", "_");

    // Truncate to keep file paths manageable
    const int MaxLength = 80;
    if (result.Length <= MaxLength)
    {
        return result;
    }

    // Never cut between the two halves of a surrogate pair (e.g. an emoji):
    // a lone high surrogate is not valid text and makes a broken file name.
    var cut = char.IsHighSurrogate(result[MaxLength - 1]) ? MaxLength - 1 : MaxLength;
    return result[..cut];
}
|
|
|
|
/// <summary>
/// Word-wraps text so that lines do not exceed <paramref name="maxWidth"/>
/// characters, breaking only between words.
/// </summary>
/// <remarks>
/// Each '\n'-separated paragraph is wrapped on its own, and a blank paragraph is
/// returned as one empty string so paragraph spacing is kept. A single word that
/// is longer than <paramref name="maxWidth"/> is not split; it is returned on a
/// line of its own. Null text is treated as empty text.
/// </remarks>
/// <param name="text">The text to wrap.</param>
/// <param name="maxWidth">Maximum line length in characters.</param>
/// <returns>The wrapped lines in order, without line terminators.</returns>
private static IEnumerable<string> WordWrap(string text, int maxWidth)
{
    // '\r' counts as a separator so CRLF input does not leak carriage returns
    // into the output; tabs separate words just like spaces.
    var separators = new[] { ' ', '\t', '\r' };

    foreach (var paragraph in (text ?? string.Empty).Split('\n'))
    {
        if (string.IsNullOrWhiteSpace(paragraph))
        {
            yield return string.Empty;
            continue;
        }

        var line = new StringBuilder();

        foreach (var word in paragraph.Split(separators, StringSplitOptions.RemoveEmptyEntries))
        {
            // Only flush a line that has content: flushing an empty buffer would
            // emit a spurious blank line in front of a word wider than maxWidth.
            if (line.Length > 0 && line.Length + 1 + word.Length > maxWidth)
            {
                yield return line.ToString();
                line.Clear();
            }

            if (line.Length > 0)
            {
                line.Append(' ');
            }

            line.Append(word);
        }

        if (line.Length > 0)
        {
            yield return line.ToString();
        }
    }
}
|
|
}
|