import * as cheerio from 'cheerio';
import { Logger } from './Logger';

const logger = Logger.getInstance().getLogger();

/**
 * Utility class for parsing the HTML of a Terabox share page and pulling out
 * the parameters embedded in one of its inline `<script>` tags.
 */
export class DataParser {
  /**
   * Extracts the share parameters (share ID, uk, sign, timestamp and file list)
   * from the share page HTML.
   *
   * The first `<script>` tag whose content contains the text `file_list` is
   * used. Inside it, the method looks for `"file_list":[...]` plus the
   * string-valued keys `"shareid"`, `"uk"`, `"sign"` and `"timestamp"`.
   *
   * This method never throws: any parsing error is logged and reported as null.
   *
   * @param html The HTML content of the Terabox share page.
   * @returns The extracted parameters, or null if no matching script tag is
   *          found, any of the five values is missing, or the file list is not
   *          valid JSON.
   */
  public static extractShareParams(html: string): { shareId: string, uk: string, sign: string, timestamp: string, fileList: any[] } | null {
    logger.debug('Starting HTML content parsing for share parameters...');
    try {
      const scriptContent = DataParser.findShareDataScript(html);
      if (!scriptContent) {
        logger.warn('Could not find the script tag containing file_list data.');
        return null;
      }

      // The array is located with a bracket-balanced scan rather than a lazy
      // regex: /\[.*?\]/ stops at the first ']' and therefore truncates the
      // JSON whenever an entry contains a nested array or a ']' inside a string.
      const fileListJson = DataParser.sliceFileListJson(scriptContent);
      const shareIdMatch = scriptContent.match(/"shareid":"(.*?)"/);
      const ukMatch = scriptContent.match(/"uk":"(.*?)"/);
      const signMatch = scriptContent.match(/"sign":"(.*?)"/);
      const timestampMatch = scriptContent.match(/"timestamp":"(.*?)"/);

      if (fileListJson !== null && shareIdMatch && ukMatch && signMatch && timestampMatch) {
        const parsed: unknown = JSON.parse(fileListJson);
        // The slice always starts with '[', so a successful parse yields an
        // array; the check exists to narrow the type without an assertion.
        if (Array.isArray(parsed)) {
          const params = {
            shareId: shareIdMatch[1],
            uk: ukMatch[1],
            sign: signMatch[1],
            timestamp: timestampMatch[1],
            fileList: parsed,
          };
          // NOTE(review): this logs the full parameter set, including `sign`
          // and every file entry, at info level - confirm that is acceptable.
          logger.info(params, 'Successfully extracted all share parameters.');
          return params;
        }
      }

      logger.warn('Failed to extract all necessary parameters from the script content.');
      return null;
    } catch (error: unknown) {
      // Prefer the message of a real Error; otherwise log the thrown value
      // itself. Narrowing first avoids a second throw when the value is
      // null or undefined.
      const detail = error instanceof Error && error.message ? error.message : error;
      logger.error({ error: detail }, 'Error during HTML parsing in DataParser.extractShareParams');
      return null;
    }
  }

  /**
   * Extracts the fs_id of the first entry in the file list.
   *
   * @param fileList The list of files extracted from the share page.
   * @returns The first entry's `fs_id` converted to a string, or null if the
   *          list is missing or empty, the first entry is null/undefined, or
   *          its `fs_id` is falsy.
   */
  public static extractFsId(fileList: any[]): string | null {
    const firstFile = fileList && fileList.length > 0 ? fileList[0] : undefined;
    // Guard the entry itself so a list such as [null] yields null instead of
    // throwing a TypeError on the property access.
    if (firstFile != null && firstFile.fs_id) {
      return firstFile.fs_id.toString();
    }
    logger.warn('Could not extract fs_id from the file list.');
    return null;
  }

  /**
   * Returns the content of the first `<script>` tag that contains the text
   * `file_list`, or null if no script tag does.
   */
  private static findShareDataScript(html: string): string | null {
    const $ = cheerio.load(html);
    let found: string | null = null;
    $('script').each((_index, element) => {
      const content = $(element).html();
      if (content && content.includes('file_list')) {
        found = content;
        return false; // Returning false stops the iteration.
      }
      return undefined;
    });
    return found;
  }

  /**
   * Returns the JSON text of the array that follows the first `"file_list":[`
   * in `scriptContent`, from its opening '[' to the matching ']'.
   * Brackets inside JSON string literals are ignored. Returns null if the key
   * is absent or the array is never closed.
   */
  private static sliceFileListJson(scriptContent: string): string | null {
    const marker = '"file_list":[';
    const markerIndex = scriptContent.indexOf(marker);
    if (markerIndex === -1) {
      return null;
    }

    const start = markerIndex + marker.length - 1; // index of the opening '['
    let depth = 0;
    let inString = false;
    let escaped = false;

    for (let i = start; i < scriptContent.length; i++) {
      const ch = scriptContent.charAt(i);

      if (inString) {
        if (escaped) {
          escaped = false; // this character was escaped by the preceding backslash
        } else if (ch === '\\') {
          escaped = true;
        } else if (ch === '"') {
          inString = false;
        }
        continue;
      }

      if (ch === '"') {
        inString = true;
      } else if (ch === '[') {
        depth++;
      } else if (ch === ']') {
        depth--;
        if (depth === 0) {
          return scriptContent.slice(start, i + 1);
        }
      }
    }

    return null;
  }
}