/**
 * Copyright 2023 Continue
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import { Octokit } from "@octokit/rest";
import cheerio from "cheerio";
// import fetch from "node-fetch";
const fetch = require("node-fetch");
import { URL } from "url";

/**
 * Path suffixes that the crawler does not follow.
 *
 * `getLinksFromUrl` drops every discovered link whose pathname ends with one
 * of these strings. The comparison is a plain `endsWith`, not a match on a
 * whole path segment, so an entry such as `"search"` also excludes a path
 * like `/research`.
 */
const IGNORE_PATHS_ENDING_IN = [
	"favicon.ico",
	"robots.txt",
	".rst.txt",
	"genindex",
	"py-modindex",
	"search.html",
	"search",
	"genindex.html",
	"changelog",
	"changelog.html",
];

// GitHub URL path markers for file ("/blob/") and directory ("/tree/") views.
// NOTE(review): nothing in the code visible here references this constant;
// confirm whether it is still needed.
const GITHUB_PATHS_TO_TRAVERSE = ["/blob/", "/tree/"];

/**
 * Lists the Markdown files of a GitHub repository as site-relative paths.
 *
 * The repository's git tree is fetched recursively through the GitHub REST
 * API (unauthenticated), and every blob whose path ends in `.md` is turned
 * into a `/{owner}/{repo}/tree/main/{file}` path.
 *
 * @param baseUrl - A `github.com` URL whose first two path segments are the
 *   repository owner and name.
 * @returns Paths, relative to the GitHub origin, of every Markdown file.
 * @throws Error when `baseUrl` does not contain both an owner and a repository
 *   segment; request failures from Octokit propagate unchanged.
 */
async function crawlGithubRepo(baseUrl: URL): Promise<string[]> {
	const octokit = new Octokit({
		auth: undefined,
	});

	// Empty segments are discarded so a trailing or doubled slash in the
	// pathname cannot shift owner/repo or leak into the generated paths.
	const [owner, repo] = baseUrl.pathname
		.split("/")
		.filter((segment) => segment !== "");

	if (!owner || !repo) {
		throw new Error(
			`Expected a GitHub repository URL of the form /{owner}/{repo}, got: ${baseUrl.toString()}`,
		);
	}

	// NOTE(review): the branch is fixed to "main"; a repository without a
	// branch of that name will presumably make the request below fail.
	const branch = "main";

	const tree = await octokit.request(
		"GET /repos/{owner}/{repo}/git/trees/{tree_sha}",
		{
			owner,
			repo,
			tree_sha: branch,
			headers: {
				"X-GitHub-Api-Version": "2022-11-28",
			},
			recursive: "true",
		},
	);

	// Paths are built from the parsed owner/repo rather than the raw pathname,
	// so `/owner/repo/` and `/owner/repo/tree/main` produce the same output as
	// `/owner/repo`.
	const repoRoot = `/${owner}/${repo}`;

	return tree.data.tree
		.filter((file) => file.type === "blob" && file.path?.endsWith(".md"))
		.map((file) => `${repoRoot}/tree/${branch}/${file.path}`);
}

/**
 * Fetches a single page and collects the same-host links found on it.
 *
 * @param url - Base URL of the site being crawled (for example
 *   `https://example.com`).
 * @param path - Path of the page to fetch; it is resolved against `url`.
 * @returns The page's HTML together with the de-duplicated pathnames of every
 *   anchor that points at the same hostname as `url`, excluding pathnames that
 *   end with an entry of `IGNORE_PATHS_ENDING_IN`. For `github.com` no links
 *   are extracted and `links` is empty. When the page cannot be fetched or its
 *   body cannot be read, the error is logged and both fields come back empty.
 */
async function getLinksFromUrl(
	url: string,
	path: string,
): Promise<{ html: string; links: string[] }> {
	const baseUrl = new URL(url);
	const location = new URL(path, url);

	let html: string;
	// URL the HTML was actually served from; relative hrefs resolve against it.
	let pageUrl = location.toString();

	try {
		const response = await fetch(location.toString());
		// After a redirect (e.g. `/docs` -> `/docs/`) the final URL is the
		// correct base for relative links.
		if (typeof response.url === "string" && response.url !== "") {
			pageUrl = response.url;
		}
		// Reading the body can fail too, so it stays inside the try block.
		html = await response.text();
	} catch (error: unknown) {
		if (error instanceof Error && error.message.includes("maximum redirect")) {
			console.error("Maximum redirect reached for: ", location.toString());
		} else {
			console.error(error);
		}
		return {
			html: "",
			links: [],
		};
	}

	// Exact hostname match, mirroring the check `crawlPage` uses before it
	// enumerates a repository through the API.
	if (baseUrl.hostname === "github.com") {
		return {
			html,
			links: [],
		};
	}

	const $ = cheerio.load(html);
	// A Set de-duplicates while keeping first-seen order.
	const found = new Set<string>();

	$("a").each((_, element) => {
		const href = $(element).attr("href");
		if (!href) {
			return;
		}

		let target: URL;
		try {
			target = new URL(href, pageUrl);
		} catch {
			// A malformed href must not abort the whole page.
			return;
		}

		if (target.hostname === baseUrl.hostname) {
			found.add(target.pathname);
		}
	});

	// `pathname` never contains a fragment, so only the suffix filter is needed.
	const links = [...found].filter(
		(link) => !IGNORE_PATHS_ENDING_IN.some((ending) => link.endsWith(ending)),
	);

	return {
		html,
		links,
	};
}

/**
 * Splits a URL into the site base (scheme, host and port) and the path.
 *
 * @param url - The URL to split.
 * @returns `baseUrl` as `protocol//host` and `basePath` as the URL's pathname.
 *   The query string and fragment are not included in either part.
 */
function splitUrl(url: URL): { baseUrl: string; basePath: string } {
	// `host` (unlike `hostname`) keeps a non-default port, so sites such as
	// `http://localhost:3000` are not silently redirected to port 80.
	const baseUrl = `${url.protocol}//${url.host}`;
	const basePath = url.pathname;
	return {
		baseUrl,
		basePath,
	};
}

/** One fetched page, as yielded by `crawlPage`. */
export type PageData = {
	/** String form of the URL that was passed to `crawlPage`, not of this individual page. */
	url: string;
	/** Path of this page, as taken from the crawl's list of paths. */
	path: string;
	/** Response body text of the page; pages with an empty body are not yielded. */
	html: string;
};

/**
 * Crawls a site starting at `url` and yields every page that could be fetched.
 *
 * The crawl is breadth-first: paths are requested in concurrent batches, and
 * every link reported by `getLinksFromUrl` is queued exactly once. For
 * `github.com` URLs the queue is additionally seeded with the paths returned
 * by `crawlGithubRepo`.
 *
 * @param url - Page at which the crawl starts.
 * @returns An async generator of `PageData`, one entry per page whose body was
 *   non-empty. `PageData.url` is always the string form of the start `url`;
 *   `PageData.path` identifies the individual page.
 */
export async function* crawlPage(url: URL): AsyncGenerator<PageData> {
	const { baseUrl, basePath } = splitUrl(url);
	const rootUrl = url.toString();

	// Number of pages requested concurrently per round.
	const batchSize = 50;

	// `queue` is append-only, so a position in it always refers to the same
	// path; `seen` gives constant-time de-duplication and prevents a page from
	// being queued, fetched or yielded more than once.
	const queue: string[] = [basePath];
	const seen = new Set<string>(queue);

	const enqueue = (path: string): void => {
		if (!seen.has(path)) {
			seen.add(path);
			queue.push(path);
		}
	};

	if (url.hostname === "github.com") {
		const githubLinks = await crawlGithubRepo(url);
		for (const path of githubLinks) {
			enqueue(path);
		}
	}

	for (let start = 0; start < queue.length; start += batchSize) {
		const batch = queue.slice(start, start + batchSize);
		const results = await Promise.all(
			batch.map((path) => getLinksFromUrl(baseUrl, path)),
		);

		// `results` is in the same order as `batch`, so a running offset pairs
		// each result with the path it was fetched from.
		let offset = 0;
		for (const { html, links } of results) {
			const path = batch[offset];
			offset++;

			if (html !== "") {
				yield {
					url: rootUrl,
					path,
					html,
				};
			}

			for (const link of links) {
				enqueue(link);
			}
		}
	}
}