import * as fs from "node:fs/promises";
import * as os from "node:os";
import * as path from "node:path";
import { ptree, Snowflake } from "@oh-my-pi/pi-utils";
import { settings } from "../../config/settings";
import { throwIfAborted } from "../../tools/tool-errors";
import { ensureTool } from "../../utils/tools-manager";
import { extractWithParallel, findParallelApiKey, getParallelExtractContent } from "../parallel";
import type { RenderResult, SpecialHandler } from "./types";
import { buildResult, formatMediaDuration, formatNumber } from "./types";
/** Identifiers extracted from a YouTube URL by parseYouTubeUrl. */
interface YouTubeUrl {
// Video identifier; used to rebuild the canonical watch URL and to name temp subtitle files.
videoId: string;
// Value of the `list` query parameter; only populated for /watch URLs that carry one.
playlistId?: string;
}
/**
 * Parse a YouTube URL into its components.
 *
 * Recognised forms (a leading `www.` on the host is ignored):
 * - `youtu.be/VIDEO_ID`
 * - `youtube.com/watch?v=VIDEO_ID` (optionally with `list=PLAYLIST_ID`)
 * - `youtube.com/v/VIDEO_ID` and `youtube.com/embed/VIDEO_ID`
 * - `youtube.com/shorts/VIDEO_ID`
 *
 * `m.youtube.com` is accepted wherever `youtube.com` is.
 *
 * @param url - Candidate URL; any string is accepted.
 * @returns The extracted identifiers, or `null` when `url` is not a valid URL,
 *   is not on a recognised host/path, or does not carry a well-formed video ID.
 */
function parseYouTubeUrl(url: string): YouTubeUrl | null {
	// Every branch enforces this shape. The ID is later interpolated into a URL,
	// a temp-file path and a glob pattern, so anything outside this alphabet
	// (path separators, `&`, glob metacharacters, ...) must be rejected here.
	const videoIdPattern = /^[a-zA-Z0-9_-]{11}$/;

	let parsed: URL;
	try {
		parsed = new URL(url);
	} catch {
		// Not a parseable absolute URL.
		return null;
	}

	const hostname = parsed.hostname.replace(/^www\./, "");

	// youtu.be/VIDEO_ID
	if (hostname === "youtu.be") {
		const videoId = parsed.pathname.slice(1).split("/")[0];
		return videoId && videoIdPattern.test(videoId) ? { videoId } : null;
	}

	if (hostname !== "youtube.com" && hostname !== "m.youtube.com") {
		return null;
	}

	// youtube.com/watch?v=VIDEO_ID
	if (parsed.pathname === "/watch") {
		const videoId = parsed.searchParams.get("v");
		if (videoId && videoIdPattern.test(videoId)) {
			const playlistId = parsed.searchParams.get("list") || undefined;
			return { videoId, playlistId };
		}
		return null;
	}

	// youtube.com/v/VIDEO_ID or youtube.com/embed/VIDEO_ID
	const embedMatch = parsed.pathname.match(/^\/(v|embed)\/([a-zA-Z0-9_-]{11})/);
	if (embedMatch) {
		return { videoId: embedMatch[2] };
	}

	// youtube.com/shorts/VIDEO_ID
	const shortsPrefix = "/shorts/";
	if (parsed.pathname.startsWith(shortsPrefix)) {
		const videoId = parsed.pathname.slice(shortsPrefix.length).split("/")[0];
		if (videoId && videoIdPattern.test(videoId)) {
			return { videoId };
		}
	}

	return null;
}
/**
 * Convert WebVTT subtitle content to plain text.
 *
 * Header/metadata lines, cue identifiers, cue timing lines and blank lines are
 * dropped; inline tags are stripped from the remaining lines; a line identical
 * to the one kept immediately before it is skipped; the surviving lines are
 * joined with single spaces.
 *
 * @param vtt - Raw WebVTT file content. LF, CRLF and CR line endings are all
 *   accepted, as is a leading byte-order mark.
 * @returns The caption text on a single line with whitespace collapsed, or an
 *   empty string when no caption text is found.
 */
function cleanVttToText(vtt: string): string {
	// Drop a leading BOM so the "WEBVTT" header check still matches, and split on
	// every line-terminator form. Splitting on "\n" alone leaves a trailing "\r"
	// on CRLF input, which defeats the `$`-anchored identifier checks below and
	// leaks cue numbers into the transcript.
	const lines = vtt.replace(/^\uFEFF/, "").split(/\r\n|\r|\n/);
	const textLines: string[] = [];
	let lastLine = "";

	for (const line of lines) {
		// Skip WEBVTT header, timestamps, and metadata
		const isNonText =
			line.startsWith("WEBVTT") ||
			line.startsWith("Kind:") ||
			line.startsWith("Language:") ||
			/^\d{2}:\d{2}/.test(line) || // Timestamp lines
			/^[a-f0-9-]{36}$/.test(line) || // UUID cue identifiers
			/^\d+$/.test(line) || // Numeric cue identifiers
			line.includes("-->") ||
			line.trim() === "";
		if (isNonText) continue;

		// Remove inline tags: timestamp tags like <00:00:01.520> as well as
		// markup tags such as <c>, </c> or <v Speaker>.
		const cleaned = line.replace(/<\/?[^>]+>/g, "").trim();

		// Skip duplicates (auto-generated captions often repeat)
		if (cleaned && cleaned !== lastLine) {
			textLines.push(cleaned);
			lastLine = cleaned;
		}
	}

	return textLines.join(" ").replace(/\s+/g, " ").trim();
}
/**
 * Handle YouTube URLs - fetch metadata and transcript.
 *
 * Flow:
 * 1. Return `null` when `url` is not a recognised YouTube video URL.
 * 2. If the `providers.parallelFetch` setting is on and a Parallel API key is
 *    found, try Parallel extract first and return its content when it is longer
 *    than 100 characters after trimming.
 * 3. Otherwise use yt-dlp: dump metadata, list subtitles, then download manual
 *    subtitles (preferred) or auto-generated captions in English, and render
 *    everything as markdown.
 *
 * @param url - The URL that was requested.
 * @param timeout - Timeout in seconds; combined with `userSignal` into one abort signal.
 * @param userSignal - Optional caller-supplied abort signal.
 * @returns The rendered result, or `null` when `url` is not a YouTube video URL.
 * @throws Whatever `throwIfAborted` raises once the combined signal has been
 *   aborted (presumably on user cancellation or timeout — confirm in tool-errors).
 */
export const handleYouTube: SpecialHandler = async (
	url: string,
	timeout: number,
	userSignal?: AbortSignal,
): Promise<RenderResult | null> => {
	throwIfAborted(userSignal);

	const yt = parseYouTubeUrl(url);
	if (!yt) return null;

	const signal = ptree.combineSignals(userSignal, timeout * 1000);
	const fetchedAt = new Date().toISOString();
	const notes: string[] = [];
	// Canonical watch URL: strips playlist/timestamp parameters and normalises
	// short/embed/shorts links to one form.
	const videoUrl = `https://www.youtube.com/watch?v=${yt.videoId}`;

	// Prefer Parallel extract when credentials are available
	if (settings.get("providers.parallelFetch") && (await findParallelApiKey())) {
		try {
			const parallelResult = await extractWithParallel([videoUrl], {
				objective: "Extract the main content of this YouTube video page",
				excerpts: true,
				fullContent: false,
				signal,
			});
			const firstDocument = parallelResult.results[0];
			if (firstDocument) {
				const content = getParallelExtractContent(firstDocument);
				if (content.trim().length > 100) {
					return buildResult(content, {
						url,
						finalUrl: videoUrl,
						method: "parallel",
						fetchedAt,
						notes: ["Used Parallel extract for YouTube"],
					});
				}
			}
		} catch {
			// Parallel is best-effort: fall through to yt-dlp unless the failure
			// was caused by cancellation/timeout.
			throwIfAborted(signal);
		}
	}

	// Ensure yt-dlp is available (auto-download if missing)
	const ytdlp = await ensureTool("yt-dlp", { signal, silent: true });
	if (!ytdlp) {
		return {
			url,
			finalUrl: url,
			contentType: "text/plain",
			method: "youtube-no-ytdlp",
			content: "YouTube video detected but yt-dlp could not be installed.",
			fetchedAt,
			truncated: false,
			notes: ["yt-dlp installation failed"],
		};
	}

	// Non-zero exits and aborts are reported through the result object rather
	// than thrown, so every call site checks `ok` / inspects stdout itself.
	const execOptions = {
		mode: "group" as const,
		signal,
		allowNonZero: true,
		allowAbort: true,
		stderr: "full" as const,
	};

	// Fetch video metadata
	const metaResult = await ptree.exec(
		[ytdlp, "--dump-json", "--no-warnings", "--no-playlist", "--skip-download", videoUrl],
		execOptions,
	);

	let title = "YouTube Video";
	let channel = "";
	let description = "";
	let duration = 0;
	let uploadDate = "";
	let viewCount = 0;

	if (metaResult.ok && metaResult.stdout.trim()) {
		try {
			const meta = JSON.parse(metaResult.stdout) as {
				title?: string;
				channel?: string;
				uploader?: string;
				description?: string;
				duration?: number;
				upload_date?: string;
				view_count?: number;
			};
			title = meta.title || title;
			channel = meta.channel || meta.uploader || "";
			description = meta.description || "";
			duration = meta.duration || 0;
			uploadDate = meta.upload_date || "";
			viewCount = meta.view_count || 0;
		} catch {
			// Unparseable metadata is not fatal; the defaults above are used.
		}
	}

	// Format upload date (yt-dlp reports YYYYMMDD)
	let formattedDate = "";
	if (uploadDate && uploadDate.length === 8) {
		formattedDate = `${uploadDate.slice(0, 4)}-${uploadDate.slice(4, 6)}-${uploadDate.slice(6, 8)}`;
	}

	// First, list available subtitles
	const listResult = await ptree.exec(
		[ytdlp, "--list-subs", "--no-warnings", "--no-playlist", "--skip-download", videoUrl],
		execOptions,
	);
	const hasManualSubs = listResult.stdout.includes("[info] Available subtitles");
	const hasAutoSubs = listResult.stdout.includes("[info] Available automatic captions");

	// Unique output prefix inside the temp directory for subtitle downloads
	const tmpDir = os.tmpdir();
	const tmpBase = path.join(tmpDir, `yt-${yt.videoId}-${Snowflake.next()}`);

	// Download English subtitles of the requested kind and return them as plain
	// text; returns "" when the download fails or produces no .vtt file.
	const downloadTranscript = async (subFlag: "--write-sub" | "--write-auto-sub"): Promise<string> => {
		const subResult = await ptree.exec(
			[
				ytdlp,
				subFlag,
				"--sub-lang",
				"en,en-US,en-GB",
				"--sub-format",
				"vtt",
				"--skip-download",
				"--no-warnings",
				"--no-playlist",
				"-o",
				tmpBase,
				videoUrl,
			],
			execOptions,
		);
		if (!subResult.ok) return "";

		// Find the downloaded subtitle file using glob
		const subFiles = await Array.fromAsync(new Bun.Glob(`${tmpBase}*.vtt`).scan({ absolute: true }));
		const firstFile = subFiles[0];
		if (firstFile === undefined) return "";
		return cleanVttToText(await Bun.file(firstFile).text());
	};

	let transcript = "";
	let transcriptSource = "";

	try {
		// Try manual subtitles first (English preferred)
		if (hasManualSubs) {
			transcript = await downloadTranscript("--write-sub");
			// Only record the source when text was actually obtained, so the notes
			// never claim subtitles were used next to "No subtitles available".
			if (transcript) {
				transcriptSource = "manual";
				notes.push("Using manual subtitles");
			}
		}

		// Fall back to auto-generated captions
		if (!transcript && hasAutoSubs) {
			transcript = await downloadTranscript("--write-auto-sub");
			if (transcript) {
				transcriptSource = "auto-generated";
				notes.push("Using auto-generated captions");
			}
		}
	} catch (error) {
		// Report cancellation/timeout in preference to the failure it caused.
		throwIfAborted(signal);
		throw error;
	} finally {
		// Cleanup temp files (fire-and-forget with error suppression). This runs
		// before any abort check so cancelled requests do not leave files behind.
		void Array.fromAsync(new Bun.Glob(`${tmpBase}*`).scan({ absolute: true }))
			.then(tmpFiles => Promise.all(tmpFiles.map(f => fs.unlink(f).catch(() => {}))))
			.catch(() => {});
	}
	// yt-dlp aborts are swallowed by `allowAbort`, so surface them here.
	throwIfAborted(signal);

	// Build markdown output
	let md = `# ${title}\n\n`;
	if (channel) md += `**Channel:** ${channel}\n`;
	if (formattedDate) md += `**Uploaded:** ${formattedDate}\n`;
	if (duration > 0) md += `**Duration:** ${formatMediaDuration(duration)}\n`;
	if (viewCount > 0) md += `**Views:** ${formatNumber(viewCount)}\n`;
	md += `**Video ID:** ${yt.videoId}\n\n`;

	if (description) {
		// Truncate long descriptions
		const descPreview = description.length > 1000 ? `${description.slice(0, 1000)}…` : description;
		md += `---\n\n## Description\n\n${descPreview}\n\n`;
	}

	if (transcript) {
		md += `---\n\n## Transcript (${transcriptSource})\n\n${transcript}\n`;
	} else {
		notes.push("No subtitles/captions available");
		md += `---\n\n*No transcript available for this video.*\n`;
	}

	return buildResult(md, { url, finalUrl: videoUrl, method: "youtube", fetchedAt, notes });
};