/*
First we download the pricing html page as text, using the fetch api, from
https://cloud.google.com/compute/all-pricing, and much more.
*/

import cheerio from "cheerio";
import { fetchGpuData, parseGpuData } from "./gpu-pricing";
import { getDisks } from "./disk-pricing";
import { getZones, ZoneData } from "./zones";

const PRICING_URL = "https://cloud.google.com/compute/all-pricing";

/**
 * Downloads the raw HTML of the Google Cloud "all pricing" page.
 *
 * @returns the response body as text
 * @throws Error if the server answers with a non-2xx status, so that an
 *   error page is never silently parsed as if it were pricing data
 */
async function fetchPricingData(): Promise<string> {
  const response = await fetch(PRICING_URL, {
    headers: {
      "Accept-Language": "en-US",
    },
  });
  if (!response.ok) {
    throw new Error(
      `failed to fetch ${PRICING_URL}: ${response.status} ${response.statusText}`,
    );
  }
  return await response.text();
}

/**
 * Parses the `layout` attribute of a `cloudx-pricing-table` element.
 *
 * The attribute is not valid JSON as served: it uses `True`/`False` and
 * single quotes, so those are rewritten before calling JSON.parse.
 *
 * @throws SyntaxError if the rewritten text is still not valid JSON
 */
function parseTableLayout(tableLayout: string): any {
  const json = tableLayout
    .replace(/True/g, "true")
    .replace(/False/g, "false")
    .replace(/'/g, '"');
  return JSON.parse(json);
}

/**
 * Downloads the pricing page and extracts everything this module knows how
 * to scrape.
 *
 * @returns an object with
 *   - `tables`: the `rows` of every usable `cloudx-pricing-table` layout, in
 *     document order,
 *   - `gpus`: the result of `parseGpuData` on the page's iframe (or `{}` when
 *     the page has no iframe with a `src`),
 *   - `disks`: the result of `getDisks()`,
 *   - `zones`: the result of `getZones()`.
 */
export async function parsePricingData() {
  const body = await fetchPricingData();
  // Use cheerio to load the HTML
  const $ = cheerio.load(body);

  const tables: any[] = [];
  // Ids of the elements preceding the tables collected so far; used only to
  // warn when the same id shows up more than once.
  const seenTypes = new Set<string>();
  for (const x of $("cloudx-pricing-table")) {
    const data = $(x);
    const tableLayout = data.attr("layout");
    if (!tableLayout) {
      continue;
    }
    const layout = parseTableLayout(tableLayout);
    if (!layout?.rows) {
      continue;
    }
    // The id of the previous sibling element is what identifies the table.
    const type = data.prev().attr("id") ?? "";
    if (
      type.includes("image") ||
      type.includes("disk") ||
      type.includes("localssd")
    ) {
      // These are templated in some mysterious way so the data isn't available
      // to parse out of the raw html, so we just skip them.
      continue;
    }
    if (type !== "") {
      if (seenTypes.has(type)) {
        console.log("warning -- duplicate table id", type);
      }
      seenTypes.add(type);
    }
    tables.push(layout.rows);
  }

  // The gpu data is stored in a separate iframe in a different format.
  // This gets fixed later.
  const gpuUrl = $("iframe").attr("src");
  let gpus;
  if (!gpuUrl) {
    console.warn("GPU data is missing");
    gpus = {};
  } else {
    gpus = await parseGpuData(await fetchGpuData(gpuUrl));
  }

  const disks = await getDisks();
  const zones = await getZones();

  return {
    tables,
    gpus,
    disks,
    zones,
  };
}

// Region-name prefixes that may appear without the separating dash
// in the scraped data.
const PREFIX = [
  "us",
  "europe",
  "asia",
  "northamerica",
  "southamerica",
  "australia",
  "me",
  "africa",
];

/**
 * Normalizes a scraped region key to dashed form.
 *
 * A key that already contains "-" is returned unchanged. Otherwise the first
 * entry of PREFIX that the key starts with is split off and joined to the
 * remainder with a dash.
 *
 * @throws Error if the key has no dash and starts with no known prefix
 */
function toRegion(key: string): string {
  if (key.includes("-")) {
    return key;
  }
  const prefix = PREFIX.find((p) => key.startsWith(p));
  if (prefix == null) {
    throw new Error(`unknown region: "${key}"`);
  }
  return `${prefix}-${key.slice(prefix.length)}`;
}

/**
 * Converts a scraped `{ region: priceString }` map into
 * `{ normalizedRegion: priceNumber }`, using toRegion for the keys and
 * parseFloat for the values.
 *
 * @returns undefined when no map is given
 * @throws Error (from toRegion) if a key is not a recognizable region
 */
function formatCostMap(costMap?: { [region: string]: string }):
  | {
      [region: string]: number;
    }
  | undefined {
  if (costMap == null) {
    return undefined;
  }
  const result: { [region: string]: number } = {};
  for (const [key, value] of Object.entries(costMap)) {
    result[toRegion(key)] = parseFloat(value);
  }
  return result;
}

export interface PriceData {
  prices?: { [region: string]: number };
  spot?: { [region: string]: number };
  vcpu?: number;
  memory?: number;
  count?: number; // for gpu's only
  max?: number; // for gpu's only
}

/**
 * Parses the integer at the start of the first space-separated word of `s`.
 *
 * @returns undefined when `s` is missing; NaN when the first word does not
 *   start with a base-10 integer
 */
function toInteger(s?: string): number | undefined {
  if (s == null) {
    return undefined;
  }
  return parseInt(s.split(" ")[0], 10);
}

// There's a special case with pricing for the shared cpu cases, which are
// e2-micro, e2-small, e2-medium, where the vcpu that is the input for
// computing the spot price is 0.25, 0.5, 1, respectively.
const SHARED_CORE_VCPU = new Map<string, number>([
  ["e2-micro", 0.25],
  ["e2-small", 0.5],
  ["e2-medium", 1],
]);

/**
 * Turns the heading cells of a pricing table into short lowercase keys.
 *
 * For some reason there are a dozen choices for the column headings for
 * non-spot instances. It's always the first one that contains "price", I
 * think, so that one becomes "on-demand". Every other heading is reduced to
 * its first word, lowercased and cut at the first "(".
 */
function toHeadings(cells: string[]): string[] {
  let foundOnDemand = false;
  return cells.map((heading) => {
    if (!foundOnDemand && heading.toLowerCase().includes("price")) {
      foundOnDemand = true;
      return "on-demand";
    }
    return heading.split(" ")[0].toLowerCase().split("(")[0];
  });
}

/**
 * Reshapes the output of parsePricingData into per-machine-type,
 * per-accelerator and per-disk price data.
 *
 * The input object is not modified.
 *
 * @param tables - list of tables, each a list of `{ cells }` rows whose first
 *   row holds the headings; tables whose first heading is not "machine" are
 *   ignored
 * @param gpus - map from scraped GPU name to its data; `prices` and `spot`
 *   are converted with formatCostMap, all other fields are copied
 * @param disks - map from disk type to its price map; each value is wrapped
 *   as `{ prices }` (presumably keyed by pd-standard / pd-ssd / pd-balanced,
 *   as the return type declares -- verify against getDisks)
 * @param zones - passed through unchanged
 * @throws Error if a region key or accelerator name cannot be converted
 */
export function machineTypeToPriceData({ tables, gpus, disks, zones }): {
  machineTypes: { [machineType: string]: PriceData };
  disks: {
    "pd-standard": { prices: { [zone: string]: number } };
    "pd-ssd": { prices: { [zone: string]: number } };
    "pd-balanced": { prices: { [zone: string]: number } };
  };
  accelerators: { [acceleratorType: string]: PriceData };
  zones: { [zone: string]: ZoneData };
} {
  const machineTypes: { [name: string]: PriceData } = {};
  for (const rows of tables) {
    if (rows[0] == null) {
      // a table without even a heading row has nothing to offer
      continue;
    }
    const headings = toHeadings(rows[0].cells);
    if (headings[0] !== "machine") {
      // this is part of a table explaining how to make a custom machine type
      continue;
    }
    for (let i = 1; i < rows.length; i++) {
      const { cells } = rows[i];
      if (cells[0].includes("custom-machine-type")) {
        continue;
      }
      // Key each cell by the heading of its column.
      const row: any = {};
      headings.forEach((heading, j) => {
        row[heading] = cells[j];
      });
      const machineType = row.machine.split(" ")[0];
      // Shared-core types take their fractional vcpu from the lookup table;
      // everything else is read from whichever cpu column the table has.
      const vcpu =
        SHARED_CORE_VCPU.get(machineType) ??
        toInteger(row.virtual ?? row.vcpu ?? row.vcpus ?? row.cores);
      // VERY important to get the actual amount of memory, and not just round to an integer
      // since we will use this later when recomputing all the prices based on the SKU data!
      const memory = parseFloat(row["memory"]);
      const prices = formatCostMap(
        (row.price ?? row["on-demand"])?.priceByRegion,
      );
      machineTypes[machineType] = {
        prices,
        spot: {}, // gets filled in based on data/pricing.csv; assume initially no discount
        vcpu,
        memory,
      };
    }
  }

  const accelerators: { [acceleratorType: string]: PriceData } = {};
  for (const name in gpus) {
    const d = gpus[name];
    accelerators[toApiAcceleratorType(name)] = {
      ...d,
      prices: formatCostMap(d.prices),
      spot: formatCostMap(d.spot),
    };
  }

  // Wrapping each price map makes the format consistent with the PriceData
  // interface, and also makes it easy to add more data about each disk
  // later if we need to (e.g., about speed?). A new object is built so the
  // caller's `disks` is left untouched and calling this function twice on the
  // same input does not wrap the data twice.
  const wrappedDisks: {
    [diskType: string]: { prices: { [zone: string]: number } };
  } = {};
  for (const name in disks) {
    wrappedDisks[name] = { prices: disks[name] };
  }

  return {
    machineTypes,
    accelerators,
    // The keys are whatever `disks` contained; the compiler cannot see that,
    // hence the assertion to the declared shape.
    disks: wrappedDisks as ReturnType<typeof machineTypeToPriceData>["disks"],
    zones,
  };
}

/**
 * Converts a GPU name from the scraped data into an accelerator type name.
 *
 * In scraping data we use the names in the data sources. However, we want to
 * instead use exactly the same names as in the GCP API. The possibilities for
 * name here are
 *    'NVIDIA T4', 'NVIDIA P4', 'NVIDIA V100', 'NVIDIA P100', 'NVIDIA K80',
 * and the second word, lowercased, is appended to "nvidia-tesla-".
 *
 * NOTE(review): other GPU models may not follow the "nvidia-tesla-" naming
 * in the API -- confirm before relying on this for names not listed above.
 *
 * @throws Error if the name has no second space-separated word
 */
function toApiAcceleratorType(name: string): string {
  const family = name.split(" ")[1];
  if (!family) {
    throw new Error(`unknown accelerator name: "${name}"`);
  }
  return `nvidia-tesla-${family.toLowerCase()}`;
}