import { RequestQueue } from 'crawlee'; import { ParsedUrl } from '@autogram/url-tools'; import { UrlFilterInput } from '../../tools/urls/filter-url.js'; import { InternalSpiderContext } from '../../index.js'; import { PageRegion } from '../../tools/html/index.js'; export type PageLinkRegion = PageRegion & { /** * One or more CSS selectors used to find links in a given region. * * These selectors should find elements with either a `src` or `href` tag; they're used after * a "bucket" of HMTL has been identified and specific URL-bearing elements must be found * inside of it. */ linkSelectors?: string | string[]; /** * Region-specific override for the filter that determines whether a URL should be saved * to the crawl database. */ save?: UrlFilterInput; /** * Region-specific override for the filter that determines whether a URL should be enqueued * for crawling. */ crawl?: UrlFilterInput; /** * Region-specific override for the label for each saved URL and Link. If none is specified, * this defaults to the name of the region being processed. */ label?: string; /** * Region-specific override for the default Request handler that should be used when processing * the URL during a crawl. */ handler?: string; }; /** * Configuration options for Spidergram's URL enqueing options. */ export interface EnqueueUrlOptions { /** * Limits the number of links that will be enqueued by this call; useful for * selecting a subset of links on the page for testing. * * @defaultValue Infinity */ limit?: number; /** * One or more CSS selectors used to locate links on the page. * * @defaultValue 'a' */ selectors?: string | string[]; /** * A list of page regions in which different link-discovery rules should be applied. * By default, the region's name is saved as the {@link LinksTo.label} property on the * connection between a {@link Resource} and a {@link UniqueUrl}. * * @example Find and label links inside the following tags * `regions: ['heading', 'main', 'footer'],` * * @example Find and label links with specific selectors * ``` * regions: { * heading: 'heading.site-hero', * main: 'main > div.content', * footer: 'div#global__footer-v2.DONTDELETE', * }, * ``` * * @example Find links in regions, using custom settings for one region * ``` * regions: { * heading: 'heading.site-hero', * main: 'main > div.content', * footer: { * selector: 'div#global__footer-v2.DONTDELETE', * linkSelector: 'a:not(.endlessRedirect)', * enqueue: UrlMatchStrategy.SameHostname * }, * }, * ``` */ regions?: string[] | Record; /** * A filter condition to determine which links will be saved as * {@apilink UniqueUrl} objects in the current project's graph. * * @type {boolean | string | RegExp | UrlMatchStrategy | UrlFilterWithContext} * @default UrlMatchStrategy.All */ save?: UrlFilterInput; /** * A filter condition to determine which links will be enqueued for crawling. * * @type {boolean | string | RegExp | UrlMatchStrategy | UrlFilterWithContext} * @default UrlMatchStrategy.SameDomain */ crawl?: UrlFilterInput; /** * Don't save or enqueue the link if it already exists in the graph. * * *NOTE:* Only affects the saving and enqueuing of {@apilink UniqueUrl} * objects themselves; LinksTo records connection crawled Resources to * the UniqueUrl may still be created. * * @type {boolean} * @default true */ discardExisting?: boolean; /** * Ignore links that can't be parsed by the {@apilink URL} constructor. * * *NOTE:* Only affects the saving of {@apilink UniqueUrl} objects; unparsable * links are never enqueued for crawling. * * @type {boolean} * @default false */ discardUnparsable?: boolean; /** * Ignore HTML tags that are found by the selector, but have no `href` * property. * * *NOTE:* Only affects the saving of {@apilink UniqueUrl} objects; unparsable * links are never enqueued for crawling. * * @type {boolean} * @default true */ discardEmpty?: boolean; /** * Ignore links that only contain an anchor, e.g. `Scroll to top` * * @type {boolean} * @default true */ discardlocalAnchors?: boolean; /** * Ignore HTML tags with protocols other than `http` and `https`. * * *NOTE:* Only affects the saving of {@apilink UniqueUrl} objects; unparsable * links are never enqueued for crawling. * * @type {boolean} * @default true */ discardNonWeb?: boolean; /** * The number of repeated path elements that cause Spidergram to consider a * path 'recursive' and stop crawling it. * * For example, a threshold of 3 would flag `http://example.com/~/~/~` as recursive, * and it would not be crawled. * * @type {number} * @default 3 */ recursivePathThreshold?: number; /** * The base URL that should be used when parsing relative URLs. If none * is specified, this defaults to the URL of the page being parsed for URLs. * * @default {boolean} * @type {string} */ baseUrl?: string; /** * The RequestQueue used to enqueue the links. * * By default this is the current crawl's internal queue, but a custom * queue can be passed in for special handling (ie, creating a separate * list of URLs for page screenshots after the main crawl completes) * * @type {string} */ requestQueue?: RequestQueue; /** * A label applied to any {@apilink Request} objects created when the * URLs are enqueued for crawling. This can be used to control which * handlers process the resulting HTTP responses. * * @default {undefined} * @type {string} */ handler?: string; /** * If URLs found in this operation are enqueued, move them to the front * ofr the request queue. * * @default {false} * @type {boolean} */ prioritize?: boolean; /** * An optional normalizer function to override the crawl- and project-wide * normalizer defaults. * * @default {undefined} * @type {UrlMutatorWithContext} */ normalizer?: UrlMutatorWithContext; } export type UrlMutatorWithContext = (found: ParsedUrl, context?: T) => ParsedUrl; //# sourceMappingURL=enqueue-options.d.ts.map