{"version":3,"file":"webbrowser.cjs","names":["Tool","RecursiveCharacterTextSplitter","Document","formatDocumentsAsString","MemoryVectorStore","RunnableSequence","StringOutputParser"],"sources":["../../src/tools/webbrowser.ts"],"sourcesContent":["import type { BaseLanguageModelInterface } from \"@langchain/core/language_models/base\";\nimport { Document } from \"@langchain/core/documents\";\nimport type { EmbeddingsInterface } from \"@langchain/core/embeddings\";\n\nimport * as cheerio from \"cheerio\";\nimport {\n  CallbackManager,\n  CallbackManagerForToolRun,\n} from \"@langchain/core/callbacks/manager\";\n\nimport { Tool, ToolParams } from \"@langchain/core/tools\";\nimport { RunnableSequence } from \"@langchain/core/runnables\";\nimport { StringOutputParser } from \"@langchain/core/output_parsers\";\nimport {\n  RecursiveCharacterTextSplitter,\n  TextSplitter,\n} from \"@langchain/textsplitters\";\nimport { MemoryVectorStore } from \"../vectorstores/memory.js\";\n\nimport { formatDocumentsAsString } from \"../util/document.js\";\n\nexport const parseInputs = (inputs: string): [string, string] => {\n  const [baseUrl, task] = inputs.split(\",\").map((input) => {\n    let t = input.trim();\n    t = t.startsWith('\"') ? t.slice(1) : t;\n    t = t.endsWith('\"') ? t.slice(0, -1) : t;\n    // it likes to put / at the end of urls, wont matter for task\n    t = t.endsWith(\"/\") ? t.slice(0, -1) : t;\n    return t.trim();\n  });\n\n  return [baseUrl, task];\n};\n\nexport const getText = (\n  html: string,\n  baseUrl: string,\n  summary: boolean\n): string => {\n  // scriptingEnabled so noscript elements are parsed\n  const $ = cheerio.load(html, { scriptingEnabled: true });\n\n  let text = \"\";\n\n  // lets only get the body if its a summary, dont need to summarize header or footer etc\n  const rootElement = summary ? \"body \" : \"*\";\n\n  // oxlint-disable-next-line @typescript-eslint/no-explicit-any\n  $(`${rootElement}:not(style):not(script):not(svg)`).each((_i, elem: any) => {\n    // we dont want duplicated content as we drill down so remove children\n    let content = $(elem).clone().children().remove().end().text().trim();\n    const $el = $(elem);\n\n    // if its an ahref, print the content and url\n    let href = $el.attr(\"href\");\n    if ($el.prop(\"tagName\")?.toLowerCase() === \"a\" && href) {\n      if (!href.startsWith(\"http\")) {\n        try {\n          href = new URL(href, baseUrl).toString();\n        } catch {\n          // if this fails thats fine, just no url for this\n          href = \"\";\n        }\n      }\n\n      const imgAlt = $el.find(\"img[alt]\").attr(\"alt\")?.trim();\n      if (imgAlt) {\n        content += ` ${imgAlt}`;\n      }\n\n      text += ` [${content}](${href})`;\n    }\n    // otherwise just print the content\n    else if (content !== \"\") {\n      text += ` ${content}`;\n    }\n  });\n\n  return text.trim().replace(/\\n+/g, \" \");\n};\n\nconst getHtml = async (baseUrl: string, h: Headers, config: RequestConfig) => {\n  const domain = new URL(baseUrl).hostname;\n\n  const headers = { ...h };\n  // these appear to be positional, which means they have to exist in the headers passed in\n  headers.Host = domain;\n  headers[\"Alt-Used\"] = domain;\n\n  let htmlResponse;\n  try {\n    const fetchOptions: RequestInit = {\n      method: \"GET\",\n      headers,\n      credentials: config.withCredentials ? \"include\" : \"same-origin\",\n      ...config,\n    };\n\n    htmlResponse = await fetch(baseUrl, fetchOptions);\n\n    if (!htmlResponse.ok) {\n      throw new Error(`http response ${htmlResponse.status}`);\n    }\n  } catch (e) {\n    if (\n      e &&\n      typeof e === \"object\" &&\n      \"message\" in e &&\n      typeof e.message === \"string\" &&\n      e.message.includes(\"http response\")\n    ) {\n      throw e;\n    }\n    throw e;\n  }\n\n  const allowedContentTypes = [\n    \"text/html\",\n    \"application/json\",\n    \"application/xml\",\n    \"application/javascript\",\n    \"text/plain\",\n  ];\n\n  const contentType = htmlResponse.headers.get(\"content-type\") || \"\";\n  const contentTypeArray = contentType.split(\";\");\n  if (\n    contentTypeArray[0] &&\n    !allowedContentTypes.includes(contentTypeArray[0])\n  ) {\n    throw new Error(\"returned page was not utf8\");\n  }\n  return htmlResponse.text();\n};\n\nconst DEFAULT_HEADERS = {\n  Accept:\n    \"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8\",\n  \"Accept-Encoding\": \"gzip, deflate\",\n  \"Accept-Language\": \"en-US,en;q=0.5\",\n  \"Alt-Used\": \"LEAVE-THIS-KEY-SET-BY-TOOL\",\n  Connection: \"keep-alive\",\n  Host: \"LEAVE-THIS-KEY-SET-BY-TOOL\",\n  Referer: \"https://www.google.com/\",\n  \"Sec-Fetch-Dest\": \"document\",\n  \"Sec-Fetch-Mode\": \"navigate\",\n  \"Sec-Fetch-Site\": \"cross-site\",\n  \"Upgrade-Insecure-Requests\": \"1\",\n  \"User-Agent\":\n    \"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0\",\n};\n\n// oxlint-disable-next-line @typescript-eslint/no-explicit-any\ntype Headers = Record<string, any>;\n\n/**\n * Configuration options for fetch requests, similar to axios config but for fetch\n */\nexport interface RequestConfig extends Omit<RequestInit, \"headers\"> {\n  withCredentials?: boolean;\n}\n\n/**\n * Defines the arguments that can be passed to the WebBrowser constructor.\n * It extends the ToolParams interface and includes properties for a\n * language model, embeddings, HTTP headers, an Axios configuration, a\n * callback manager, and a text splitter.\n */\nexport interface WebBrowserArgs extends ToolParams {\n  model: BaseLanguageModelInterface;\n\n  embeddings: EmbeddingsInterface;\n\n  headers?: Headers;\n\n  requestConfig?: RequestConfig;\n\n  /** @deprecated */\n  callbackManager?: CallbackManager;\n\n  textSplitter?: TextSplitter;\n}\n\n/**\n * A class designed to interact with web pages, either to extract\n * information from them or to summarize their content. It uses the native\n * fetch API to send HTTP requests and the cheerio library to parse the\n * returned HTML.\n * @example\n * ```typescript\n * const browser = new WebBrowser({\n *   model: new ChatOpenAI({ model: \"gpt-4o-mini\", temperature: 0 }),\n *   embeddings: new OpenAIEmbeddings({}),\n * });\n * const result = await browser.invoke(\"https:exampleurl.com\");\n * ```\n */\nexport class WebBrowser extends Tool {\n  static lc_name() {\n    return \"WebBrowser\";\n  }\n\n  get lc_namespace() {\n    return [...super.lc_namespace, \"webbrowser\"];\n  }\n\n  private model: BaseLanguageModelInterface;\n\n  private embeddings: EmbeddingsInterface;\n\n  private headers: Headers;\n\n  private requestConfig: RequestConfig;\n\n  private textSplitter: TextSplitter;\n\n  constructor({\n    model,\n    headers,\n    embeddings,\n    requestConfig,\n    textSplitter,\n  }: WebBrowserArgs) {\n    super(...arguments);\n\n    this.model = model;\n    this.embeddings = embeddings;\n    this.headers = headers ?? DEFAULT_HEADERS;\n    this.requestConfig = {\n      withCredentials: true,\n      ...requestConfig,\n    };\n    this.textSplitter =\n      textSplitter ??\n      new RecursiveCharacterTextSplitter({\n        chunkSize: 2000,\n        chunkOverlap: 200,\n      });\n  }\n\n  /** @ignore */\n  async _call(inputs: string, runManager?: CallbackManagerForToolRun) {\n    const [baseUrl, task] = parseInputs(inputs);\n    const doSummary = !task;\n\n    let text;\n    try {\n      const html = await getHtml(baseUrl, this.headers, this.requestConfig);\n      text = getText(html, baseUrl, doSummary);\n    } catch (e) {\n      if (e) {\n        return e.toString();\n      }\n      return \"There was a problem connecting to the site\";\n    }\n\n    const texts = await this.textSplitter.splitText(text);\n\n    let context;\n    // if we want a summary grab first 4\n    if (doSummary) {\n      context = texts.slice(0, 4).join(\"\\n\");\n    }\n    // search term well embed and grab top 4\n    else {\n      const docs = texts.map(\n        (pageContent) =>\n          new Document({\n            pageContent,\n            metadata: [],\n          })\n      );\n\n      const vectorStore = await MemoryVectorStore.fromDocuments(\n        docs,\n        this.embeddings\n      );\n      const results = await vectorStore.similaritySearch(\n        task,\n        4,\n        undefined,\n        runManager?.getChild(\"vectorstore\")\n      );\n      context = formatDocumentsAsString(results);\n    }\n\n    const input = `Text:${context}\\n\\nI need ${\n      doSummary ? \"a summary\" : task\n    } from the above text, also provide up to 5 markdown links from within that would be of interest (always including URL and text). Links should be provided, if present, in markdown syntax as a list under the heading \"Relevant Links:\".`;\n\n    const chain = RunnableSequence.from([this.model, new StringOutputParser()]);\n    return chain.invoke(input, runManager?.getChild());\n  }\n\n  name = \"web-browser\";\n\n  description = `useful for when you need to find something on or summarize a webpage. input should be a comma separated list of \"ONE valid http URL including protocol\",\"what you want to find on the page or empty string for a summary\".`;\n}\n"],"mappings":";;;;;;;;;;;;;;;;;AAqBA,MAAa,eAAe,WAAqC;CAC/D,MAAM,CAAC,SAAS,QAAQ,OAAO,MAAM,IAAI,CAAC,KAAK,UAAU;EACvD,IAAI,IAAI,MAAM,MAAM;AACpB,MAAI,EAAE,WAAW,KAAI,GAAG,EAAE,MAAM,EAAE,GAAG;AACrC,MAAI,EAAE,SAAS,KAAI,GAAG,EAAE,MAAM,GAAG,GAAG,GAAG;AAEvC,MAAI,EAAE,SAAS,IAAI,GAAG,EAAE,MAAM,GAAG,GAAG,GAAG;AACvC,SAAO,EAAE,MAAM;GACf;AAEF,QAAO,CAAC,SAAS,KAAK;;AAGxB,MAAa,WACX,MACA,SACA,YACW;CAEX,MAAM,IAAI,QAAQ,KAAK,MAAM,EAAE,kBAAkB,MAAM,CAAC;CAExD,IAAI,OAAO;AAMX,GAAE,GAHkB,UAAU,UAAU,IAGvB,kCAAkC,CAAC,MAAM,IAAI,SAAc;EAE1E,IAAI,UAAU,EAAE,KAAK,CAAC,OAAO,CAAC,UAAU,CAAC,QAAQ,CAAC,KAAK,CAAC,MAAM,CAAC,MAAM;EACrE,MAAM,MAAM,EAAE,KAAK;EAGnB,IAAI,OAAO,IAAI,KAAK,OAAO;AAC3B,MAAI,IAAI,KAAK,UAAU,EAAE,aAAa,KAAK,OAAO,MAAM;AACtD,OAAI,CAAC,KAAK,WAAW,OAAO,CAC1B,KAAI;AACF,WAAO,IAAI,IAAI,MAAM,QAAQ,CAAC,UAAU;WAClC;AAEN,WAAO;;GAIX,MAAM,SAAS,IAAI,KAAK,WAAW,CAAC,KAAK,MAAM,EAAE,MAAM;AACvD,OAAI,OACF,YAAW,IAAI;AAGjB,WAAQ,KAAK,QAAQ,IAAI,KAAK;aAGvB,YAAY,GACnB,SAAQ,IAAI;GAEd;AAEF,QAAO,KAAK,MAAM,CAAC,QAAQ,QAAQ,IAAI;;AAGzC,MAAM,UAAU,OAAO,SAAiB,GAAY,WAA0B;CAC5E,MAAM,SAAS,IAAI,IAAI,QAAQ,CAAC;CAEhC,MAAM,UAAU,EAAE,GAAG,GAAG;AAExB,SAAQ,OAAO;AACf,SAAQ,cAAc;CAEtB,IAAI;AACJ,KAAI;EACF,MAAM,eAA4B;GAChC,QAAQ;GACR;GACA,aAAa,OAAO,kBAAkB,YAAY;GAClD,GAAG;GACJ;AAED,iBAAe,MAAM,MAAM,SAAS,aAAa;AAEjD,MAAI,CAAC,aAAa,GAChB,OAAM,IAAI,MAAM,iBAAiB,aAAa,SAAS;UAElD,GAAG;AACV,MACE,KACA,OAAO,MAAM,YACb,aAAa,KACb,OAAO,EAAE,YAAY,YACrB,EAAE,QAAQ,SAAS,gBAAgB,CAEnC,OAAM;AAER,QAAM;;CAGR,MAAM,sBAAsB;EAC1B;EACA;EACA;EACA;EACA;EACD;CAGD,MAAM,oBADc,aAAa,QAAQ,IAAI,eAAe,IAAI,IAC3B,MAAM,IAAI;AAC/C,KACE,iBAAiB,MACjB,CAAC,oBAAoB,SAAS,iBAAiB,GAAG,CAElD,OAAM,IAAI,MAAM,6BAA6B;AAE/C,QAAO,aAAa,MAAM;;AAG5B,MAAM,kBAAkB;CACtB,QACE;CACF,mBAAmB;CACnB,mBAAmB;CACnB,YAAY;CACZ,YAAY;CACZ,MAAM;CACN,SAAS;CACT,kBAAkB;CAClB,kBAAkB;CAClB,kBAAkB;CAClB,6BAA6B;CAC7B,cACE;CACH;;;;;;;;;;;;;;;AA+CD,IAAa,aAAb,cAAgCA,sBAAAA,KAAK;CACnC,OAAO,UAAU;AACf,SAAO;;CAGT,IAAI,eAAe;AACjB,SAAO,CAAC,GAAG,MAAM,cAAc,aAAa;;CAG9C;CAEA;CAEA;CAEA;CAEA;CAEA,YAAY,EACV,OACA,SACA,YACA,eACA,gBACiB;AACjB,QAAM,GAAG,UAAU;AAEnB,OAAK,QAAQ;AACb,OAAK,aAAa;AAClB,OAAK,UAAU,WAAW;AAC1B,OAAK,gBAAgB;GACnB,iBAAiB;GACjB,GAAG;GACJ;AACD,OAAK,eACH,gBACA,IAAIC,yBAAAA,+BAA+B;GACjC,WAAW;GACX,cAAc;GACf,CAAC;;;CAIN,MAAM,MAAM,QAAgB,YAAwC;EAClE,MAAM,CAAC,SAAS,QAAQ,YAAY,OAAO;EAC3C,MAAM,YAAY,CAAC;EAEnB,IAAI;AACJ,MAAI;AAEF,UAAO,QADM,MAAM,QAAQ,SAAS,KAAK,SAAS,KAAK,cAAc,EAChD,SAAS,UAAU;WACjC,GAAG;AACV,OAAI,EACF,QAAO,EAAE,UAAU;AAErB,UAAO;;EAGT,MAAM,QAAQ,MAAM,KAAK,aAAa,UAAU,KAAK;EAErD,IAAI;AAEJ,MAAI,UACF,WAAU,MAAM,MAAM,GAAG,EAAE,CAAC,KAAK,KAAK;OAGnC;GACH,MAAM,OAAO,MAAM,KAChB,gBACC,IAAIC,0BAAAA,SAAS;IACX;IACA,UAAU,EAAE;IACb,CAAC,CACL;AAYD,aAAUC,sBAAAA,wBANM,OAJI,MAAMC,4BAAAA,kBAAkB,cAC1C,MACA,KAAK,WACN,EACiC,iBAChC,MACA,GACA,KAAA,GACA,YAAY,SAAS,cAAc,CACpC,CACyC;;EAG5C,MAAM,QAAQ,QAAQ,QAAQ,aAC5B,YAAY,cAAc,KAC3B;AAGD,SADcC,0BAAAA,iBAAiB,KAAK,CAAC,KAAK,OAAO,IAAIC,+BAAAA,oBAAoB,CAAC,CAAC,CAC9D,OAAO,OAAO,YAAY,UAAU,CAAC;;CAGpD,OAAO;CAEP,cAAc"}