import * as p_throttle from 'p-throttle';
import { AIFunctionsProvider } from '@agentic/core';
import { KyInstance } from 'ky';
import { z } from 'zod';

/**
 * Constants, option/response shapes and schemas used by `DiffbotClient`.
 */
declare namespace diffbot {
  /** Base URL of the Diffbot extraction API. */
  const API_BASE_URL = "https://api.diffbot.com";

  /** Base URL of the Diffbot knowledge graph API. */
  const KNOWLEDGE_GRAPH_API_BASE_URL = "https://kg.diffbot.com";

  /**
   * p-throttle wrapper: takes a function and returns the throttled variant of
   * that same function type.
   */
  const throttle: <F extends p_throttle.AnyFunction>(
    function_: F
  ) => p_throttle.ThrottledFunction<F>;

  /** Options shared by the Extract family of endpoints. */
  interface ExtractOptions {
    /** Specify optional fields to be returned from any fully-extracted pages, e.g.: &fields=querystring,links. See available fields within each API's individual documentation pages.
     * @see https://docs.diffbot.com/reference/extract-optional-fields */
    fields?: string[];
    /** (*Undocumented*) Pass paging=false to disable automatic concatenation of multiple-page articles. (By default, Diffbot will concatenate up to 20 pages of a single article.) */
    paging?: boolean;
    /** Pass discussion=false to disable automatic extraction of comments or reviews from pages identified as articles or products. This will not affect pages identified as discussions. */
    discussion?: boolean;
    /** Sets a value in milliseconds to wait for the retrieval/fetch of content from the requested URL. The default timeout for the third-party response is 30 seconds (30000). */
    timeout?: number;
    /** Used to specify the IP address of a custom proxy that will be used to fetch the target page, instead of Diffbot's default IPs/proxies. (Ex: &proxy=168.212.226.204) */
    proxy?: string;
    /** Used to specify the authentication parameters that will be used with the proxy specified in the &proxy parameter. (Ex: &proxyAuth=username:password) */
    proxyAuth?: string;
    /** `none` will instruct Extract to not use proxies, even if proxies have been enabled for this particular URL globally. */
    useProxy?: string;
    /** @see https://docs.diffbot.com/reference/extract-custom-javascript */
    customJs?: string;
    /** @see https://docs.diffbot.com/reference/extract-custom-headers */
    customHeaders?: Record<string, string>;
  }

  /** Options accepted by `DiffbotClient.analyzeUrl`. */
  interface ExtractAnalyzeOptions extends ExtractOptions {
    /** URL of the web page to process */
    url: string;
    /** By default the Analyze API will fully extract all pages that match an existing Automatic API -- articles, products or image pages. Set mode to a specific page-type (e.g., mode=article) to extract content only from that specific page-type. All other pages will simply return the default Analyze fields. */
    mode?: string;
    /** Force any non-extracted pages (those with a type of "other") through a specific API. For example, to route all "other" pages through the Article API, pass &fallback=article. Pages that utilize this functionality will return a fallbackType field at the top-level of the response and a originalType field within each extracted object, both of which will indicate the fallback API used. */
    fallback?: string;
  }

  /** Options accepted by `DiffbotClient.extractArticleFromUrl`. */
  interface ExtractArticleOptions extends ExtractOptions {
    /** URL of the web page to process */
    url: string;
    /** Set the maximum number of automatically-generated tags to return. By default a maximum of ten tags will be returned. */
    maxTags?: number;
    /** Set the minimum relevance score of tags to return, between 0.0 and 1.0. By default only tags with a score equal to or above 0.5 will be returned. */
    tagConfidence?: number;
    /** Used to request the output of the Diffbot Natural Language API in the field naturalLanguage. Example: &naturalLanguage=entities,facts,categories,sentiment. */
    naturalLanguage?: string[];
  }

  /** Common extract response shape: the request descriptor plus the extracted objects. */
  interface ExtractResponse {
    request: DiffbotRequest;
    objects: DiffbotObject[];
  }

  /** The article response adds nothing to the common extract response. */
  type ExtractArticleResponse = ExtractResponse;

  /** Analyze response: the common extract response plus page-level fields. */
  interface ExtractAnalyzeResponse extends ExtractResponse {
    type: string;
    title: string;
    humanLanguage: string;
  }

  /** A single extracted object inside an `ExtractResponse`. */
  interface DiffbotObject {
    type: string;
    title: string;
    pageUrl: string;
    diffbotUri: string;
    description?: string;
    date?: string;
    sentiment?: number;
    author?: string;
    estimatedDate?: string;
    publisherRegion?: string;
    icon?: string;
    siteName?: string;
    publisherCountry?: string;
    humanLanguage?: string;
    authorUrl?: string;
    html?: string;
    text?: string;
    images?: Image[];
    tags?: Tag[];
    categories?: ObjectCategory[];
    authors?: Author[];
    breadcrumb?: Breadcrumb[];
    items?: ListItem[];
    meta?: any;
  }

  /** Entry of `DiffbotObject.items`. */
  interface ListItem {
    title: string;
    link: string;
    summary: string;
    image?: string;
  }

  /** Entry of `DiffbotObject.authors`. */
  interface Author {
    name: string;
    link: string;
  }

  /** Entry of `DiffbotObject.categories`. */
  interface ObjectCategory {
    score: number;
    name: string;
    id: string;
  }

  /** Entry of `DiffbotObject.breadcrumb`. */
  interface Breadcrumb {
    link: string;
    name: string;
  }

  /**
   * Image descriptor.
   *
   * NOTE(review): a second `interface Image` is declared further down in this
   * namespace; TypeScript merges both declarations into one type.
   */
  interface Image {
    url: string;
    diffbotUri: string;
    naturalWidth: number;
    naturalHeight: number;
    width: number;
    height: number;
    isCached?: boolean;
    primary?: boolean;
  }

  /** Entry of `DiffbotObject.tags`. */
  interface Tag {
    score: number;
    sentiment: number;
    count: number;
    label: string;
    uri: string;
    rdfTypes: string[];
  }

  /** Request descriptor carried by every `ExtractResponse`. */
  interface DiffbotRequest {
    pageUrl: string;
    api: string;
    version: number;
  }

  /** Options accepted by `DiffbotClient.searchKnowledgeGraph`. */
  interface KnowledgeGraphSearchOptions {
    type?: 'query' | 'text' | 'queryTextFallback' | 'crawl';
    query: string;
    col?: string;
    from?: number;
    size?: number;
    filter?: string;
    jsonmode?: 'extended' | 'id';
    nonCanonicalFacts?: boolean;
    noDedupArticles?: boolean;
    cluster?: 'all' | 'best' | 'dedupe';
    report?: boolean;
  }

  /** Options accepted by `DiffbotClient.enhanceKnowledgeGraph`. */
  interface KnowledgeGraphEnhanceOptions {
    type: EntityType;
    id?: string;
    name?: string;
    url?: string;
    phone?: string;
    email?: string;
    employer?: string;
    title?: string;
    school?: string;
    location?: string;
    ip?: string;
    customId?: string;
    size?: number;
    threshold?: number;
    refresh?: boolean;
    search?: boolean;
    useCache?: boolean;
    filter?: string;
    jsonmode?: 'extended' | 'id';
    nonCanonicalFacts?: boolean;
  }

  /** Response shape shared by the knowledge graph search and enhance calls. */
  interface KnowledgeGraphResponse {
    data: KnowledgeGraphNode[];
    version: number;
    hits: number;
    results: number;
    kgversion: string;
    diffbot_type: string;
    facet?: boolean;
    errors?: any[];
  }

  /** Entry of `KnowledgeGraphResponse.data`. */
  interface KnowledgeGraphNode {
    score: number;
    esscore?: number;
    entity: KnowledgeGraphEntity;
    entity_ctx: any;
    errors: string[];
    callbackQuery: string;
    upperBound: number;
    lowerBound: number;
    count: number;
    value: string;
    uri: string;
  }

  /** Entity payload of a `KnowledgeGraphNode`. */
  interface KnowledgeGraphEntity {
    id: string;
    diffbotUri: string;
    type?: string;
    name: string;
    images: Image[];
    origins: string[];
    nbOrigins?: number;
    gender?: Gender;
    githubUri?: string;
    importance?: number;
    description?: string;
    homepageUri?: string;
    allNames?: string[];
    skills?: Partial<BasicEntity>[];
    crawlTimestamp?: number;
    summary?: string;
    image?: string;
    types?: string[];
    nbIncomingEdges?: number;
    allUris?: string[];
    employments?: Employment[];
    locations?: Location[];
    location?: Location;
    allOriginHashes?: string[];
    nameDetail?: NameDetail;
  }

  /**
   * Entity kind used by `KnowledgeGraphEnhanceOptions.type` and `Entity.type`.
   *
   * NOTE(review): `EnhanceEntityOptionsSchema` below accepts
   * `"Person" | "Organization"`, which does not match this union — confirm
   * which set of values is intended.
   */
  type EntityType = 'Organization' | 'Place';

  /**
   * Zod schema describing the options accepted by `DiffbotClient.enhanceEntity`.
   * Unknown keys are stripped (`"strip"` mode).
   */
  const EnhanceEntityOptionsSchema: z.ZodObject<
    {
      type: z.ZodEnum<["Person", "Organization"]>;
      id: z.ZodOptional<z.ZodString>;
      name: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodArray<z.ZodString, "many">]>>;
      url: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodArray<z.ZodString, "many">]>>;
      phone: z.ZodOptional<z.ZodString>;
      email: z.ZodOptional<z.ZodString>;
      employer: z.ZodOptional<z.ZodString>;
      title: z.ZodOptional<z.ZodString>;
      school: z.ZodOptional<z.ZodString>;
      location: z.ZodOptional<z.ZodString>;
      ip: z.ZodOptional<z.ZodString>;
      customId: z.ZodOptional<z.ZodString>;
      threshold: z.ZodOptional<z.ZodNumber>;
      refresh: z.ZodOptional<z.ZodBoolean>;
      search: z.ZodOptional<z.ZodBoolean>;
      size: z.ZodOptional<z.ZodNumber>;
    },
    "strip",
    z.ZodTypeAny,
    {
      type: "Organization" | "Person";
      url?: string | string[] | undefined;
      id?: string | undefined;
      name?: string | string[] | undefined;
      phone?: string | undefined;
      email?: string | undefined;
      employer?: string | undefined;
      title?: string | undefined;
      school?: string | undefined;
      location?: string | undefined;
      ip?: string | undefined;
      customId?: string | undefined;
      threshold?: number | undefined;
      refresh?: boolean | undefined;
      search?: boolean | undefined;
      size?: number | undefined;
    },
    {
      type: "Organization" | "Person";
      url?: string | string[] | undefined;
      id?: string | undefined;
      name?: string | string[] | undefined;
      phone?: string | undefined;
      email?: string | undefined;
      employer?: string | undefined;
      title?: string | undefined;
      school?: string | undefined;
      location?: string | undefined;
      ip?: string | undefined;
      customId?: string | undefined;
      threshold?: number | undefined;
      refresh?: boolean | undefined;
      search?: boolean | undefined;
      size?: number | undefined;
    }
  >;

  /** Options type inferred from `EnhanceEntityOptionsSchema`. */
  type EnhanceEntityOptions = z.infer<typeof EnhanceEntityOptionsSchema>;

  /** Response shape of `DiffbotClient.enhanceEntity`. */
  interface EnhanceEntityResponse {
    version: number;
    hits: number;
    kgversion: string;
    request_ctx: RequestCtx;
    data: EnhanceEntityResult[];
    errors: any[];
  }

  /** Value of `EnhanceEntityResponse.request_ctx`. */
  interface RequestCtx {
    query: Query;
    query_ctx: QueryCtx;
  }

  /** Value of `RequestCtx.query`. */
  interface Query {
    type: string;
    name: string[];
  }

  /** Value of `RequestCtx.query_ctx`. */
  interface QueryCtx {
    search: string;
  }

  /** Entry of `EnhanceEntityResponse.data`. */
  interface EnhanceEntityResult {
    score: number;
    esscore: number;
    entity: Entity;
    errors: any[];
  }

  /** Entity payload of an `EnhanceEntityResult`; only `name`, `type` and `id` are required. */
  interface Entity {
    name: string;
    type: EntityType;
    id: string;
    summary?: string;
    description?: string;
    homepageUri?: string;
    twitterUri?: string;
    linkedInUri?: string;
    githubUri?: string;
    crunchbaseUri?: string;
    googlePlusUri?: string;
    facebookUri?: string;
    angellistUri?: string;
    wikipediaUri?: string;
    diffbotUri?: string;
    origin?: string;
    origins?: string[];
    allUris?: string[];
    nbOrigins?: number;
    nbIncomingEdges?: number;
    nbFollowers?: number;
    nbLocations?: number;
    nbEmployees?: number;
    nbEmployeesMin?: number;
    nbEmployeesMax?: number;
    nbActiveEmployeeEdges?: number;
    nbUniqueInvestors?: number;
    educations?: Education[];
    nationalities?: Nationality[];
    fullName?: string;
    allNames?: string[];
    skills?: Partial<BasicEntity>[];
    children?: BasicEntity[];
    height?: number;
    image?: string;
    images?: Image[];
    allOriginHashes?: string[];
    nameDetail?: NameDetail;
    parents?: BasicEntity[];
    gender?: Gender;
    importance?: number;
    monthlyTraffic?: number;
    monthlyTrafficGrowth?: number;
    wikipediaPageviews?: number;
    wikipediaPageviewsLastQuarterGrowth?: number;
    wikipediaPageviewsLastYear?: number;
    wikipediaPageviewsLastYearGrowth?: number;
    wikipediaPageviewsLastQuarter?: number;
    wikipediaPageviewsGrowth?: number;
    birthPlace?: Location;
    types?: string[];
    unions?: Union[];
    languages?: Language[];
    employments?: Employment[];
    birthDate?: DateTime;
    religion?: Partial<BasicEntity>;
    awards?: Award[];
    netWorth?: Amount;
    allDescriptions?: string[];
    locations?: Location[];
    location?: Location;
    interests?: Interest[];
    suppliers?: BasicEntity[];
    subsidiaries?: BasicEntity[];
    ipo?: {
      date: DateTime;
      stockExchange: string;
    };
    motto?: string;
    logo?: string;
    foundingDate?: DateTime;
    totalInvestment?: Amount;
    naicsClassification2017?: any[];
    naicsClassification?: any[];
    sicClassification?: any[];
    naceClassification?: any[];
    iSicClassification?: any[];
    employeeCategories?: any[];
    emailAddresses?: EmailAddress[];
    age?: number;
    isPublic?: boolean;
    isAcquired?: boolean;
    isDissolved?: boolean;
    isNonProfit?: boolean;
    crawlTimestamp?: number;
    founders?: BasicEntity[];
    boardMembers?: BasicEntity[];
    ceo?: BasicEntity;
    investments?: Investment[];
    acquiredBy?: BasicEntity[];
    diffbotClassification?: any[];
    blogUri?: string;
    descriptors?: string[];
    industries?: string[];
    partnerships?: BasicEntity[];
    categories?: Category[];
    customers?: BasicEntity[];
    technographics?: Technographic[];
    stock?: Stock;
    companiesHouseIds?: string[];
    yearlyRevenues?: AnnualRevenue[];
    revenue?: Amount;
    parentCompany?: BasicEntity;
    legalEntities?: BasicEntity[];
  }

  /** Entry of `Entity.yearlyRevenues`. */
  interface AnnualRevenue {
    revenue: Amount;
    isCurrent: boolean;
    year: number;
    filingDate: DateTime;
    revenueDate: DateTime;
  }

  /** Entry of `Entity.technographics`. */
  interface Technographic {
    technology: Partial<BasicEntity>;
    categories: string[];
  }

  /** Entry of `Entity.categories`. */
  interface Category {
    level?: number;
    isPrimary?: boolean;
    name: string;
    diffbotUri?: string;
    targetDiffbotId?: string;
    type?: string;
  }

  /** Entry of `Entity.investments`. */
  interface Investment {
    date: DateTime;
    amount?: Amount;
    isCurrent: boolean;
    series: string;
    investors: BasicEntity[];
  }

  /** A numeric value paired with its currency. */
  interface Amount {
    currency: string;
    value: number;
  }

  /** Entry of `Entity.emailAddresses`. */
  interface EmailAddress {
    contactString: string;
    type: string;
  }

  /** Entry of `Entity.educations`. */
  interface Education {
    institution: BasicEntity;
    isCurrent?: boolean;
    major?: BasicEntity;
    degree?: BasicEntity;
    from?: DateTime;
    to?: DateTime;
  }

  /** Date value carrying a string form, a precision and a timestamp. */
  interface DateTime {
    str: string;
    precision: number;
    timestamp: number;
  }

  /** Entry of `Entity.nationalities`. */
  interface Nationality {
    name: string;
    type: string;
  }

  /**
   * Second declaration of `Image`; it merges with the earlier `interface Image`
   * in this namespace and declares no members the earlier one lacks.
   */
  interface Image {
    url: string;
    primary?: boolean;
  }

  /** Structured name parts of an entity. */
  interface NameDetail {
    firstName: string;
    lastName: string;
    middleName?: string[];
  }

  /** Gender value of an entity. */
  interface Gender {
    normalizedValue: string;
  }

  /** Value of `Entity.stock`. */
  interface Stock {
    symbol: string;
    isCurrent: boolean;
    exchange: string;
  }

  /** Entry of `Entity.unions`. */
  interface Union {
    person: BasicEntity;
    from?: DateTime;
    to?: DateTime;
    type?: string;
  }

  /** Compact reference to another entity. */
  interface BasicEntity {
    name: string;
    summary: string;
    type: string;
    image?: string;
    types?: string[];
    diffbotUri: string;
    targetDiffbotId: string;
  }

  /** Entry of `Entity.languages`. */
  interface Language {
    str: string;
    normalizedValue: string;
  }

  /** Entry of the `employments` lists. */
  interface Employment {
    isCurrent?: boolean;
    title?: string;
    description?: string;
    employer?: BasicEntity;
    location?: Location;
    categories?: Partial<BasicEntity>[];
    from?: DateTime;
    to?: DateTime;
  }

  /** Location value; only `isCurrent` is required. */
  interface Location {
    isCurrent: boolean;
    country?: BasicEntity;
    address?: string;
    city?: BasicEntity;
    street?: string;
    metroArea?: BasicEntity;
    subregion?: BasicEntity;
    surfaceForm?: string;
    latitude?: number;
    longitude?: number;
    postalCode?: string;
    region?: BasicEntity;
    precision?: number;
  }

  /** Entry of `Entity.awards`. */
  interface Award {
    title: string;
    date?: DateTime;
  }

  /** Entry of `Entity.interests`. */
  interface Interest {
    name: string;
    type: string;
  }
}

/**
 * Diffbot provides web page classification and scraping. It also provides
 * access to a knowledge graph with the ability to perform person and company
 * data enrichment.
 *
 * @see https://docs.diffbot.com
 */
declare class DiffbotClient extends AIFunctionsProvider {
  protected readonly ky: KyInstance;
  protected readonly kyKnowledgeGraph: KyInstance;
  protected readonly apiKey: string;
  protected readonly apiBaseUrl: string;
  protected readonly apiKnowledgeGraphBaseUrl: string;

  /**
   * All settings are optional, as is the settings object itself.
   */
  constructor({ apiKey, apiBaseUrl, apiKnowledgeGraphBaseUrl, timeoutMs, throttle, ky }?: {
    apiKey?: string;
    apiBaseUrl?: string;
    apiKnowledgeGraphBaseUrl?: string;
    timeoutMs?: number;
    throttle?: boolean;
    ky?: KyInstance;
  });

  /**
   * Scrapes and extracts structured data from a web page. Also classifies the web page as one of several types (article, product, discussion, job, image, video, list, event, or other).
   */
  analyzeUrl(options: diffbot.ExtractAnalyzeOptions): Promise<diffbot.ExtractAnalyzeResponse>;

  /**
   * Scrapes and extracts clean article text from news articles, blog posts, and other text-heavy web pages.
   */
  extractArticleFromUrl(options: diffbot.ExtractArticleOptions): Promise<diffbot.ExtractArticleResponse>;

  /**
   * Resolves and enriches a partial person or organization entity.
   */
  enhanceEntity(opts: diffbot.EnhanceEntityOptions): Promise<diffbot.EnhanceEntityResponse>;

  /**
   * Knowledge graph search; takes `KnowledgeGraphSearchOptions` and resolves
   * to a `KnowledgeGraphResponse`.
   */
  searchKnowledgeGraph(options: diffbot.KnowledgeGraphSearchOptions): Promise<diffbot.KnowledgeGraphResponse>;

  /**
   * Knowledge graph enhance; takes `KnowledgeGraphEnhanceOptions` and resolves
   * to a `KnowledgeGraphResponse`.
   */
  enhanceKnowledgeGraph(options: diffbot.KnowledgeGraphEnhanceOptions): Promise<diffbot.KnowledgeGraphResponse>;

  /**
   * Shared helper taking an endpoint and the common extract options; `T`
   * selects the concrete response shape and defaults to `ExtractResponse`.
   */
  protected _extract<T extends diffbot.ExtractResponse = diffbot.ExtractResponse>(
    endpoint: string,
    options: diffbot.ExtractOptions
  ): Promise<T>;
}

export { DiffbotClient, diffbot };