{"version":3,"file":"content/extract-content.mjs","sources":["webpack://@agent-infra/browser-context/./src/content/extract-content.ts"],"sourcesContent":["/*\n * Copyright (c) 2025 Bytedance, Inc. and its affiliates.\n * SPDX-License-Identifier: Apache-2.0\n */\nimport type { Page } from 'puppeteer-core';\nimport { Defuddle, DefuddleOptions } from 'defuddle/node';\nimport { READABILITY_SCRIPT } from './readability-script.js';\nimport { toMarkdown } from './to-markdown.js';\n\nexport const extractWithReadability = async (\n  page: Page,\n  options: {\n    markdown?: boolean;\n  } = {},\n): Promise<{ title: string; content: string }> => {\n  // Extract content using Readability algorithm on a document clone to prevent DOM flickering\n  const extractionResult = await page.evaluate((readabilityScript) => {\n    // Initialize Readability from script\n    const Readability = new Function(\n      'module',\n      `${readabilityScript}\\nreturn module.exports`,\n    )({});\n\n    // Create a deep clone of the document to avoid modifying the visible DOM\n    const documentClone = document.cloneNode(true) as Document;\n\n    // Clean up the cloned document\n    documentClone\n      .querySelectorAll(\n        'script,noscript,style,link,svg,img,video,iframe,canvas,.reflist',\n      )\n      .forEach((el) => el.remove());\n\n    // Parse content from the clone\n    const article = new Readability(documentClone).parse();\n    const content = article?.content || '';\n    const title = document.title;\n\n    return {\n      content,\n      title: article?.title || title,\n    };\n  }, READABILITY_SCRIPT);\n\n  return options?.markdown\n    ? {\n        title: extractionResult.title,\n        content: toMarkdown(extractionResult.content),\n      }\n    : extractionResult;\n};\n\nexport const extractWithDefuddle = async (\n  html: string,\n  url: string,\n  options: DefuddleOptions,\n): Promise<{ title: string; content: string }> => {\n  const { title, content } = await Defuddle(html, url, options);\n\n  return {\n    title,\n    content,\n  };\n};\n\n/**\n * Extract content from a page using Defuddle or Readability\n * page html -> markdown\n * @param page - The page to extract content from\n * @returns The title and content of the page\n */\nexport const extractContent = async (\n  page: Page,\n): Promise<{ title: string; content: string }> => {\n  const pageSourceHTML = await page.content();\n\n  try {\n    return await extractWithDefuddle(pageSourceHTML, page.url(), {\n      markdown: true,\n    });\n  } catch (e) {\n    return await extractWithReadability(page as any, {\n      markdown: true,\n    });\n  }\n};\n"],"names":["extractWithReadability","page","options","extractionResult","readabilityScript","Readability","Function","documentClone","document","el","article","content","title","READABILITY_SCRIPT","toMarkdown","extractWithDefuddle","html","url","Defuddle","extractContent","pageSourceHTML","e"],"mappings":";;;;;;;AASO,MAAMA,yBAAyB,OACpCC,MACAC,UAEI,CAAC,CAAC;IAGN,MAAMC,mBAAmB,MAAMF,KAAK,QAAQ,CAAC,CAACG;QAE5C,MAAMC,cAAc,IAAIC,SACtB,UACA,GAAGF,kBAAkB,uBAAuB,CAAC,EAC7C,CAAC;QAGH,MAAMG,gBAAgBC,SAAS,SAAS,CAAC;QAGzCD,cACG,gBAAgB,CACf,mEAED,OAAO,CAAC,CAACE,KAAOA,GAAG,MAAM;QAG5B,MAAMC,UAAU,IAAIL,YAAYE,eAAe,KAAK;QACpD,MAAMI,UAAUD,AAAAA,CAAAA,QAAAA,UAAAA,KAAAA,IAAAA,QAAS,OAAO,AAAD,KAAK;QACpC,MAAME,QAAQJ,SAAS,KAAK;QAE5B,OAAO;YACLG;YACA,OAAOD,AAAAA,CAAAA,QAAAA,UAAAA,KAAAA,IAAAA,QAAS,KAAK,AAAD,KAAKE;QAC3B;IACF,GAAGC;IAEH,OAAOX,AAAAA,CAAAA,QAAAA,UAAAA,KAAAA,IAAAA,QAAS,QAAQ,AAAD,IACnB;QACE,OAAOC,iBAAiB,KAAK;QAC7B,SAASW,WAAWX,iBAAiB,OAAO;IAC9C,IACAA;AACN;AAEO,MAAMY,sBAAsB,OACjCC,MACAC,KACAf;IAEA,MAAM,EAAEU,KAAK,EAAED,OAAO,EAAE,GAAG,MAAMO,SAASF,MAAMC,KAAKf;IAErD,OAAO;QACLU;QACAD;IACF;AACF;AAQO,MAAMQ,iBAAiB,OAC5BlB;IAEA,MAAMmB,iBAAiB,MAAMnB,KAAK,OAAO;IAEzC,IAAI;QACF,OAAO,MAAMc,oBAAoBK,gBAAgBnB,KAAK,GAAG,IAAI;YAC3D,UAAU;QACZ;IACF,EAAE,OAAOoB,GAAG;QACV,OAAO,MAAMrB,uBAAuBC,MAAa;YAC/C,UAAU;QACZ;IACF;AACF"}