import { tool } from "ai";
import z from "zod";
import { NodeHtmlMarkdown } from "node-html-markdown";
const DEFAULT_USER_AGENT =
"Chara-Codes (+https://github.com/chara-codes/chara)";
const DEFAULT_MAX_LENGTH = 5000;
const DEFAULT_TIMEOUT = 30000; // 30 seconds
interface FetchResult {
content: string;
contentType: string;
statusCode: number;
url: string;
isHtml: boolean;
}
interface FetchError {
error: string;
code?: string;
statusCode?: number;
}
/**
* Convert HTML to Markdown using node-html-markdown
*/
function htmlToMarkdown(html: string): string {
return NodeHtmlMarkdown.translate(html);
}
/**
* Check if content appears to be HTML
*/
function isHtmlContent(content: string, contentType: string): boolean {
const lowerContent = content.toLowerCase();
const lowerContentType = contentType.toLowerCase();
return (
lowerContentType.includes("text/html") ||
lowerContent.includes(" {
try {
const response = await globalThis.fetch(url, {
headers: { "User-Agent": userAgent },
signal: AbortSignal.timeout(timeout),
redirect: "follow",
});
if (!response.ok) {
return {
error: `Failed to fetch ${url} - HTTP ${response.status}: ${response.statusText}`,
code: "HTTP_ERROR",
statusCode: response.status,
};
}
const contentType = response.headers.get("content-type") || "";
const content = await response.text();
const isHtml = isHtmlContent(content, contentType);
return {
content,
contentType,
statusCode: response.status,
url: response.url, // This will be the final URL after redirects
isHtml,
};
} catch (error) {
if (error instanceof Error) {
if (error.name === "AbortError" || error.message.includes("AbortError")) {
return {
error: `Request timeout: Failed to fetch ${url} within ${timeout}ms`,
code: "TIMEOUT",
};
}
return {
error: `Network error: ${error.message}`,
code: "NETWORK_ERROR",
};
}
return {
error: `Unknown error occurred while fetching ${url}`,
code: "UNKNOWN_ERROR",
};
}
}
export const fetchTool = tool({
description: `Fetches a URL from the internet and optionally extracts its contents as markdown.
This tool grants you internet access. You can fetch the most up-to-date information from websites.
HTML content is automatically converted to markdown for better readability, but you can request raw HTML if needed.`,
inputSchema: z.object({
url: z.string().describe("URL to fetch (must be a valid HTTP/HTTPS URL)"),
maxLength: z
.number()
.int()
.positive()
.max(1000000)
.default(DEFAULT_MAX_LENGTH)
.describe("Maximum number of characters to return"),
startIndex: z
.number()
.int()
.min(0)
.default(0)
.describe(
"Starting character index for pagination - useful if previous fetch was truncated"
),
raw: z
.boolean()
.default(false)
.describe("Return raw HTML content without markdown conversion"),
timeout: z
.number()
.int()
.positive()
.max(60000)
.default(DEFAULT_TIMEOUT)
.describe("Request timeout in milliseconds (max 60 seconds)"),
}),
execute: async ({
url,
maxLength = DEFAULT_MAX_LENGTH,
startIndex = 0,
raw = false,
timeout = DEFAULT_TIMEOUT,
}) => {
// Validate URL
try {
new URL(url);
} catch {
return `Invalid URL: ${url}`;
}
// Fetch the URL
const result = await fetchUrl(url, DEFAULT_USER_AGENT, timeout);
// Handle fetch errors
if ("error" in result) {
return `${result.error}`;
}
// Process content
let processedContent = result.content;
let prefix = "";
// Convert HTML to markdown unless raw is requested
if (result.isHtml && !raw) {
try {
processedContent = htmlToMarkdown(result.content);
} catch (error) {
// If markdown conversion fails, fall back to raw HTML
processedContent = result.content;
prefix = `Warning: HTML to markdown conversion failed, showing raw HTML\nContent type: ${result.contentType}\n\n`;
}
} else if (!result.isHtml) {
prefix = `Content type: ${result.contentType}\n\n`;
}
// Handle pagination
const originalLength = processedContent.length;
if (startIndex >= originalLength) {
return `${prefix}Contents of ${result.url}:\n\nNo more content available. Start index ${startIndex} exceeds content length ${originalLength}.`;
}
const endIndex = Math.min(startIndex + maxLength, originalLength);
const truncatedContent = processedContent.slice(startIndex, endIndex);
// Add pagination info if content was truncated
let paginationInfo = "";
if (endIndex < originalLength) {
const remainingChars = originalLength - endIndex;
paginationInfo = `\n\nContent truncated. Showing characters ${startIndex}-${endIndex} of ${originalLength}. ${remainingChars} characters remaining. Use startIndex=${endIndex} to continue.`;
}
return `${prefix}Contents of ${result.url}:\n\n${truncatedContent}${paginationInfo}`;
},
});