/**
 * Web Sources Extractor
 *
 * Scrapes DOJ Press Releases and White House briefings
 * for AI regulation and preemption signals
 *
 * @author Jason Pellerin AI Solutionist
 */

import { createHash } from 'node:crypto';

import { log } from 'apify';
import * as cheerio from 'cheerio';
import type {
  Input,
  ExtractionResult,
  PreemptionEvent,
} from '../utils/types.js';
import { calculatePreemptionRisk, generateRagChunks } from '../analysis/risk-scorer.js';

const DELAY_MS = 1000; // Respectful scraping delay (not referenced by the extractors in this file)

/** Landing page scraped by {@link extractDojActions}; also the base for resolving relative links. */
const DOJ_NEWS_URL = 'https://www.justice.gov/news';

/** Listing page scraped by {@link extractWhiteHouseActions}. */
const WHITE_HOUSE_ACTIONS_URL = 'https://www.whitehouse.gov/briefing-room/presidential-actions/';

/** Lower-case keywords; a DOJ item containing any of them is treated as relevant. */
const DOJ_KEYWORDS = [
  'artificial intelligence',
  'technology',
  'discrimination',
  'algorithm',
  'preemption',
  'state law',
];

/** Candidate row selectors for the DOJ page, tried in order until one yields titled items. */
const DOJ_ROW_SELECTORS = [
  '.views-row',
  '.news-item',
  'article',
  '.node--type-press-release',
  '.view-content .item-list li',
];

/** Patterns used by {@link isRelevantToAIPreemption}; a single match is enough. */
const RELEVANCE_PATTERNS: readonly RegExp[] = [
  // AI/Tech terms
  /artificial intelligence/i,
  /\bAI\b/, // deliberately case-sensitive so the word fragment "ai" does not match
  /algorithm/i,
  /automated/i,
  /machine learning/i,
  /deep learning/i,
  /neural network/i,
  /data.*(privacy|protection|governance)/i,
  /generative/i,
  /chatbot/i,
  /facial recognition/i,
  /predictive/i,
  // Preemption/Regulatory terms
  /preempt/i,
  /federal.*state/i,
  /state.*regulation/i,
  /supremacy/i,
  /regulatory.*conflict/i,
  // Discrimination/Bias terms
  /disparate impact/i,
  /discrimination/i,
  /bias/i,
  /civil rights/i,
  /equal.*opportunity/i,
  // Policy terms
  /technology policy/i,
  /innovation/i,
  /digital/i,
  /consumer protection/i,
  /antitrust/i,
];

/**
 * Per-state matchers, keyed by two-letter code.
 *
 * Full names are matched case-insensitively. Abbreviations are matched
 * case-sensitively so that ordinary words and suffixes ("Acme Co.",
 * "fell ill.") are not mistaken for a state.
 */
const STATE_PATTERNS: Record<string, { name: RegExp; abbreviation: RegExp }> = {
  CO: { name: /\bcolorado/i, abbreviation: /\b(?:Colo\.|CO\b)/ },
  CA: { name: /\bcalifornia/i, abbreviation: /\b(?:Calif\.|CA\b)/ },
  CT: { name: /\bconnecticut/i, abbreviation: /\b(?:Conn\.|CT\b)/ },
  IL: { name: /\billinois/i, abbreviation: /\b(?:Ill\.|IL\b)/ },
  TX: { name: /\btexas/i, abbreviation: /\b(?:Tex\.|TX\b)/ },
  NY: { name: /\bnew york/i, abbreviation: /\b(?:N\.Y\.|NY\b)/ },
  // "West Virginia" is a different state and must not count as Virginia.
  VA: { name: /(?<!\bwest\s+)\bvirginia/i, abbreviation: /\bVA\b/ },
  WA: { name: /\bwashington state/i, abbreviation: /\b(?:Wash\.|WA\b)/ },
};

// ═══════════════════════════════════════════════════════════════════════════
// DOJ Press Releases
// ═══════════════════════════════════════════════════════════════════════════

/**
 * Fetches the DOJ news page and converts relevant items into preemption events.
 *
 * Row selectors are tried in order; the first one that yields at least one
 * titled item wins. An item is kept when it matches a DOJ keyword or a general
 * relevance pattern and its risk score reaches `input.riskScoreThreshold`.
 * Collection stops once `input.maxResults` events have been gathered.
 *
 * Network and parsing failures are caught, logged, and reported through the
 * `errors` array of the result rather than thrown.
 *
 * @param input - Actor input; `maxResults`, `riskScoreThreshold` and `targetStates` are read here or downstream.
 * @returns The collected events together with error and timing metrics.
 */
export async function extractDojActions(input: Input): Promise<ExtractionResult<PreemptionEvent>> {
  log.info('⚖️ Extracting DOJ press releases and guidance...');

  const startTime = Date.now();
  const results: PreemptionEvent[] = [];
  const errors: string[] = [];
  let requestCount = 0;

  try {
    log.info(`Fetching DOJ news: ${DOJ_NEWS_URL}`);

    const response = await fetch(DOJ_NEWS_URL, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      },
    });
    requestCount++;

    if (!response.ok) {
      log.warning(`DOJ website returned ${response.status}`);
      throw new Error(`DOJ website returned ${response.status}`);
    }

    const html = await response.text();
    const $ = cheerio.load(html);

    log.info(`DOJ page loaded, parsing content...`);

    let itemsFound = 0;

    for (const selector of DOJ_ROW_SELECTORS) {
      $(selector).each((_i, el) => {
        // Returning false stops cheerio's iteration early.
        if (results.length >= input.maxResults) return false;

        const $row = $(el);

        // Try multiple ways to find title/link
        const titleEl = $row.find('a[href*="/news"], a[href*="/pr/"], .title a, h2 a, h3 a').first();
        const title = titleEl.text().trim() || $row.find('h2, h3, .title').first().text().trim();
        const href = titleEl.attr('href');
        const date = $row.find('time, .date, .field-date, [datetime]').first().text().trim()
          || $row.find('[datetime]').attr('datetime')
          || '';
        const snippet = $row.find('p, .body, .summary, .teaser').first().text().trim();

        if (!title) return;
        itemsFound++;

        // Check relevance with broader criteria
        const combined = title + ' ' + snippet;
        const lowered = combined.toLowerCase();
        const isRelevant = DOJ_KEYWORDS.some(term => lowered.includes(term))
          || isRelevantToAIPreemption(combined);

        if (!isRelevant) return;

        const event = transformToDojEvent({
          title,
          // A missing href resolves to '' instead of a bogus "...govundefined" link.
          url: resolveUrl(href, DOJ_NEWS_URL),
          date,
          snippet,
        }, input);

        if (event && event.preemptionRisk.score >= input.riskScoreThreshold) {
          results.push(event);
        }
      });

      if (itemsFound > 0) break; // Found items with this selector
    }

    log.info(`Found ${itemsFound} DOJ items, ${results.length} relevant to AI/preemption`);
  } catch (error) {
    const msg = `DOJ extraction failed: ${error}`;
    log.warning(msg);
    errors.push(msg);
  }

  return buildResult(results, errors, startTime, requestCount);
}

// ═══════════════════════════════════════════════════════════════════════════
// White House Briefings
// ═══════════════════════════════════════════════════════════════════════════

/**
 * Fetches the White House presidential-actions listing and converts relevant
 * entries into preemption events.
 *
 * An entry is kept when its title plus excerpt match a relevance pattern and
 * its risk score reaches `input.riskScoreThreshold`. Collection stops once
 * `input.maxResults` events have been gathered.
 *
 * Network and parsing failures are caught, logged, and reported through the
 * `errors` array of the result rather than thrown.
 *
 * @param input - Actor input; `maxResults`, `riskScoreThreshold` and `targetStates` are read here or downstream.
 * @returns The collected events together with error and timing metrics.
 */
export async function extractWhiteHouseActions(input: Input): Promise<ExtractionResult<PreemptionEvent>> {
  log.info('🏛️ Extracting White House briefings and actions...');

  const startTime = Date.now();
  const results: PreemptionEvent[] = [];
  const errors: string[] = [];
  let requestCount = 0;

  try {
    log.debug(`Fetching White House actions: ${WHITE_HOUSE_ACTIONS_URL}`);

    const response = await fetch(WHITE_HOUSE_ACTIONS_URL, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (compatible; HyperCognate/1.0; +https://jasonpellerin.com)',
        'Accept': 'text/html',
      },
    });
    requestCount++;

    if (!response.ok) {
      throw new Error(`White House website returned ${response.status}`);
    }

    const html = await response.text();
    const $ = cheerio.load(html);

    // Parse presidential actions
    $('article, .wp-block-post').each((_i, el) => {
      // Returning false stops cheerio's iteration early.
      if (results.length >= input.maxResults) return false;

      const $article = $(el);
      // Use the first heading link for both title and href so they always
      // describe the same anchor.
      const $link = $article.find('h2 a, .entry-title a').first();
      const title = $link.text().trim();
      const href = $link.attr('href');
      const date = $article.find('time').attr('datetime') || $article.find('.date').text().trim();
      const excerpt = $article.find('.excerpt, .entry-summary').text().trim();

      if (!title || !isRelevantToAIPreemption(title + ' ' + excerpt)) return;

      const event = transformToWhiteHouseEvent({
        title,
        url: resolveUrl(href, WHITE_HOUSE_ACTIONS_URL),
        date,
        excerpt,
      }, input);

      if (event && event.preemptionRisk.score >= input.riskScoreThreshold) {
        results.push(event);
      }
    });

    log.info(`Found ${results.length} relevant White House actions`);
  } catch (error) {
    const msg = `White House extraction failed: ${error}`;
    log.warning(msg);
    errors.push(msg);
  }

  return buildResult(results, errors, startTime, requestCount);
}

// ═══════════════════════════════════════════════════════════════════════════
// Transform Functions
// ═══════════════════════════════════════════════════════════════════════════

/**
 * Builds a preemption event from a scraped DOJ item.
 *
 * The event type is `doj_guidance` when the title contains "guidance"
 * (case-insensitive) and `doj_action` otherwise.
 *
 * @param item - Scraped title, absolute URL (may be empty), raw date text and snippet.
 * @param input - Actor input; `targetStates` filters the detected states.
 * @returns The assembled event. The `null` in the signature is kept for callers; this implementation never returns it.
 */
function transformToDojEvent(
  item: { title: string; url: string; date: string; snippet: string },
  input: Input
): PreemptionEvent | null {
  const id = buildEventId('doj', item.url, item.title);

  // Determine event type from title
  const eventType = item.title.toLowerCase().includes('guidance')
    ? 'doj_guidance'
    : 'doj_action';

  const text = item.title + ' ' + item.snippet;
  const affectedStates = detectAffectedStates(text, input.targetStates);

  const baseEvent: Partial<PreemptionEvent> = {
    id,
    eventType,
    title: item.title,
    summary: item.snippet || item.title,
    source: 'doj_press',
    sourceUrl: item.url,
    datePublished: parseDate(item.date),
    dateDiscovered: new Date().toISOString(),
    federalAuthority: 'doj',
    affectedStates,
    affectedProvisions: detectAffectedProvisions(text),
    extractedAt: new Date().toISOString(),
  };

  // Calculate risk
  const risk = calculatePreemptionRisk(baseEvent as PreemptionEvent, input);
  const ragChunks = generateRagChunks(baseEvent as PreemptionEvent);

  return {
    ...baseEvent,
    preemptionRisk: risk.risk,
    complianceImpact: risk.impact,
    ragChunks,
    citations: [],
    relatedEvents: [],
  } as PreemptionEvent;
}

/**
 * Builds a preemption event from a scraped White House entry.
 *
 * The event type is `executive_order` when the title mentions an executive
 * order (or an "E.O. <number>" style reference) and `agency_guidance` otherwise.
 *
 * @param item - Scraped title, absolute URL (may be empty), raw date text and excerpt.
 * @param input - Actor input; `targetStates` filters the detected states.
 * @returns The assembled event. The `null` in the signature is kept for callers; this implementation never returns it.
 */
function transformToWhiteHouseEvent(
  item: { title: string; url: string; date: string; excerpt: string },
  input: Input
): PreemptionEvent | null {
  const id = buildEventId('wh', item.url, item.title);

  // Check if it's an executive order
  const isEO = /executive order|E\.?O\.?\s*\d/i.test(item.title);
  const eventType = isEO ? 'executive_order' : 'agency_guidance';

  const text = item.title + ' ' + item.excerpt;
  const affectedStates = detectAffectedStates(text, input.targetStates);

  const baseEvent: Partial<PreemptionEvent> = {
    id,
    eventType,
    title: item.title,
    summary: item.excerpt || item.title,
    source: 'whitehouse',
    sourceUrl: item.url,
    datePublished: parseDate(item.date),
    dateDiscovered: new Date().toISOString(),
    federalAuthority: 'executive_branch',
    affectedStates,
    affectedProvisions: detectAffectedProvisions(text),
    extractedAt: new Date().toISOString(),
  };

  // Calculate risk
  const risk = calculatePreemptionRisk(baseEvent as PreemptionEvent, input);
  const ragChunks = generateRagChunks(baseEvent as PreemptionEvent);

  return {
    ...baseEvent,
    preemptionRisk: risk.risk,
    complianceImpact: risk.impact,
    ragChunks,
    citations: [],
    relatedEvents: [],
  } as PreemptionEvent;
}

// ═══════════════════════════════════════════════════════════════════════════
// Helper Functions
// ═══════════════════════════════════════════════════════════════════════════

/**
 * Assembles the extraction result shared by both extractors.
 *
 * `success` is true only when no error was recorded. `totalFound` and
 * `extracted` both report the number of events kept.
 */
function buildResult(
  results: PreemptionEvent[],
  errors: string[],
  startTime: number,
  requestCount: number
): ExtractionResult<PreemptionEvent> {
  return {
    success: errors.length === 0,
    data: results,
    totalFound: results.length,
    extracted: results.length,
    errors,
    warnings: [],
    metrics: {
      durationMs: Date.now() - startTime,
      requestCount,
      rateLimitHits: 0,
      retries: 0,
      deduplicatedCount: 0,
    },
  };
}

/**
 * Derives an event id from the item's URL, falling back to its title.
 *
 * A digest of the whole string is used because a truncated encoding of the
 * URL's leading characters only captures the scheme and host, which are the
 * same for every item from a given site.
 *
 * @returns `<prefix>-` followed by the first 20 hex characters of a SHA-256 digest.
 */
function buildEventId(prefix: string, url: string, title: string): string {
  const digest = createHash('sha256').update(url || title).digest('hex');
  return `${prefix}-${digest.slice(0, 20)}`;
}

/**
 * Resolves a possibly-relative href against `base`.
 *
 * @returns The absolute http(s) URL, or `''` when the href is missing,
 *          unparseable, or uses another scheme (e.g. `javascript:`).
 */
function resolveUrl(href: string | undefined, base: string): string {
  if (!href) return '';
  try {
    const resolved = new URL(href, base);
    return resolved.protocol === 'http:' || resolved.protocol === 'https:'
      ? resolved.href
      : '';
  } catch {
    // The URL constructor throws on input it cannot parse.
    return '';
  }
}

/** Returns true when `text` matches at least one of the relevance patterns. */
function isRelevantToAIPreemption(text: string): boolean {
  return RELEVANCE_PATTERNS.some(pattern => pattern.test(text));
}

/**
 * Determines which target states a piece of text concerns.
 *
 * - If any known state is mentioned, returns the mentioned states that are in
 *   `targetStates` (or all mentioned states when `targetStates` contains `'ALL'`).
 * - Otherwise, if the text talks about state regulation or preemption in
 *   general, returns every concrete target state (`'ALL'` is dropped).
 * - Otherwise returns an empty array.
 *
 * @param text - Free text to scan (title plus summary).
 * @param targetStates - Two-letter codes of interest, optionally including `'ALL'`.
 */
function detectAffectedStates(text: string, targetStates: string[]): string[] {
  const mentioned = Object.entries(STATE_PATTERNS)
    .filter(([, matcher]) => matcher.name.test(text) || matcher.abbreviation.test(text))
    .map(([state]) => state);

  if (mentioned.length > 0) {
    const includeAll = targetStates.includes('ALL');
    return mentioned.filter(state => includeAll || targetStates.includes(state));
  }

  // If no specific states but general AI regulation, assume all targets affected
  if (/state.*regulation|preempt.*state/i.test(text)) {
    return targetStates.filter(state => state !== 'ALL');
  }

  return [];
}

/**
 * Lists the known statutory provisions that `text` refers to.
 *
 * @returns Zero or more human-readable provision labels, in a fixed order.
 */
function detectAffectedProvisions(text: string): string[] {
  const provisions: string[] = [];

  if (/SB.?24.?205|Colorado AI Act|CAIA/i.test(text)) {
    provisions.push('Colorado SB 24-205');
  }
  if (/SB.?1047|frontier AI/i.test(text)) {
    provisions.push('California SB 1047');
  }
  if (/algorithmic discrimination|disparate impact/i.test(text)) {
    provisions.push('Algorithmic Discrimination Provisions');
  }
  if (/impact assessment/i.test(text)) {
    provisions.push('Impact Assessment Requirements');
  }

  return provisions;
}

/**
 * Converts a scraped date string to ISO 8601.
 *
 * @returns The parsed date, or the current time when `dateStr` is empty or
 *          cannot be parsed by `Date`.
 */
function parseDate(dateStr: string): string {
  if (dateStr) {
    // `new Date` never throws; an unparseable string yields an invalid date.
    const parsed = new Date(dateStr);
    if (!Number.isNaN(parsed.getTime())) {
      return parsed.toISOString();
    }
  }
  return new Date().toISOString();
}

/** Resolves after `ms` milliseconds. */
function delay(ms: number): Promise<void> {
  return new Promise(resolve => setTimeout(resolve, ms));
}