/** * CDP Edge — ML Clustering (Fase 1) * Handlers das rotas /api/segmentation/* */ import { tryParseJson } from '../utils.js'; import { Env } from '../../types.js'; // ── Tipos ─────────────────────────────────────────────────────────────────────── interface KmeansResult { assignments: number[]; centroids: number[][]; } interface ClusterStats { c: number; size: number; pct: number; avgLtv: number; avgEng: number; avgDays: number; topSource: string; topState: string; topIntent: string; } interface Cluster { cluster_id: number; name: string; size: number; percentage: number; action_recommendation: string; characteristics: { avg_ltv_class: number; avg_engagement_score: number; avg_intention_level: number; avg_days_since_lead: number; dominant_countries: string[]; dominant_states: string[]; dominant_utm_sources: string[]; top_features: string[]; }; } interface ClusterInfo { cluster_id: number; name: string; action: string; } // ── Helpers K-means vetorial ────────────────────────────────────────────────── function _cosDist(a: number[], b: number[]): number { let dot = 0, na = 0, nb = 0; for (let i = 0; i < a.length; i++) { dot += a[i]*b[i]; na += a[i]*a[i]; nb += b[i]*b[i]; } return 1 - dot / (Math.sqrt(na) * Math.sqrt(nb) + 1e-10); } function _kmeansRun(vectors: number[][], k: number, maxIter = 25): KmeansResult { const n = vectors.length, dim = vectors[0].length; const centroids = [vectors[Math.floor(Math.random() * n)]]; while (centroids.length < k) { const dists = vectors.map(v => Math.min(...centroids.map(c => _cosDist(v, c)))); const sum = dists.reduce((a, b) => a + b, 0); let r = Math.random() * sum, cumul = 0; for (let i = 0; i < n; i++) { cumul += dists[i]; if (cumul >= r) { centroids.push(vectors[i]); break; } } if (centroids.length < k) centroids.push(vectors[Math.floor(Math.random() * n)]); } let assignments = new Array(n).fill(0); for (let iter = 0; iter < maxIter; iter++) { let changed = false; for (let i = 0; i < n; i++) { let best = 0, bestD = Infinity; for (let c = 0; c < k; c++) { const d = _cosDist(vectors[i], centroids[c]); if (d < bestD) { bestD = d; best = c; } } if (assignments[i] !== best) { assignments[i] = best; changed = true; } } if (!changed) break; for (let c = 0; c < k; c++) { const members = vectors.filter((_, i) => assignments[i] === c); if (!members.length) continue; for (let d = 0; d < dim; d++) centroids[c][d] = members.reduce((s, v) => s + v[d], 0) / members.length; } } return { assignments, centroids }; } function _silhouette(vectors: number[][], assignments: number[], k: number): number { const n = vectors.length; let total = 0; for (let i = 0; i < n; i++) { const ci = assignments[i]; const same = vectors.filter((_, j) => j !== i && assignments[j] === ci); const a = same.length ? same.reduce((s, v) => s + _cosDist(vectors[i], v), 0) / same.length : 0; let b = Infinity; for (let c = 0; c < k; c++) { if (c === ci) continue; const other = vectors.filter((_, j) => assignments[j] === c); if (other.length) b = Math.min(b, other.reduce((s, v) => s + _cosDist(vectors[i], v), 0) / other.length); } total += b === Infinity ? 0 : (b - a) / Math.max(a, b); } return Math.round((total / n) * 1000) / 1000; } function _buildLeadProfile(l: any): string { return [ `LTV: ${l.predicted_ltv_class || 'desconhecido'}`, `engajamento: ${Math.round(l.engagement_score || 0)}`, `intenção: ${l.intention_level || 'desconhecida'}`, `origem: ${l.utm_source || 'direto'}`, `canal: ${l.utm_medium || 'desconhecido'}`, `país: ${l.country || 'BR'}`, `estado: ${l.state || ''}`, `hora: ${l.hour_of_day || 12}h`, (l.is_weekend ? 'fim-de-semana' : 'dia-útil'), `recência: ${l.days_since_lead || 0} dias`, ].filter(Boolean).join(', '); } // ── POST /api/segmentation/cluster ──────────────────────────────────────────── // Clustering real: embeddinggemma-300m → K-means vetorial → Granite para nomear export async function handleSegmentationCluster(env: Env, request: Request, headers: Headers): Promise { if (!env.DB) return new Response(JSON.stringify({ error: 'DB não configurado' }), { status: 503, headers }); if (!env.AI) return new Response(JSON.stringify({ error: 'Workers AI não configurado' }), { status: 503, headers }); const url = new URL(request.url); const algorithm = url.searchParams.get('algorithm') || 'kmeans'; const nClusters = Math.min(10, Math.max(2, parseInt(url.searchParams.get('n_clusters') || '5'))); const clientVertical = url.searchParams.get('vertical') || 'general'; const forceRecluster = url.searchParams.get('force') === 'true'; if (!['kmeans', 'dbscan', 'hierarchical'].includes(algorithm)) { return new Response(JSON.stringify({ error: 'algorithm deve ser: kmeans, dbscan ou hierarchical', received: algorithm }), { status: 400, headers }); } try { if (!forceRecluster) { const existing = await env.DB.prepare(` SELECT id, created_at, cluster_name FROM ml_segments WHERE clustering_algorithm = ? AND is_active = 1 AND client_vertical = ? ORDER BY created_at DESC LIMIT 1 `).bind(algorithm, clientVertical).first(); if (existing) { const ageDays = (Date.now() - new Date((existing as any).created_at).getTime()) / 864e5; if (ageDays < 7) { return new Response(JSON.stringify({ success: true, message: 'Cluster existente ainda válido (< 7 dias). Use ?force=true para re-clustering.', cluster_id: (existing as any).id, cluster_name: (existing as any).cluster_name, age_days: Math.round(ageDays * 10) / 10, use_existing: true, }), { status: 200, headers }); } } } const leadsRes = await env.DB.prepare(` SELECT id, predicted_ltv_class, engagement_score, intention_level, country, state, utm_source, utm_medium, bot_score, CAST(strftime('%H', created_at) AS INTEGER) AS hour_of_day, CAST(julianday('now') - julianday(created_at) AS INTEGER) AS days_since_lead, CASE WHEN strftime('%w', created_at) IN ('0','6') THEN 1 ELSE 0 END AS is_weekend FROM leads WHERE created_at >= datetime('now', '-6 months') AND (bot_score IS NULL OR bot_score < 2) ORDER BY RANDOM() LIMIT 2000 `).all(); const leads = leadsRes.results || []; if (leads.length < 50) { return new Response(JSON.stringify({ error: 'Dados insuficientes para clustering. Mínimo: 50 leads.', leads_found: leads.length, required: 50 }), { status: 400, headers }); } const startTime = Date.now(); const sample = leads.slice(0, 100); const profiles = sample.map(_buildLeadProfile); // Embeddings reais via embeddinggemma-300m const embRes = await env.AI.run('@cf/baai/bge-m3', { text: profiles }); const vectors = (embRes as any).data as number[][]; if (!vectors || vectors.length < nClusters) throw new Error(`embeddinggemma retornou ${vectors?.length ?? 0} vetores`); // K-means vetorial real const { assignments } = _kmeansRun(vectors, nClusters); const silhouetteScore = _silhouette(vectors, assignments, nClusters); // Agregação por cluster para nomear com Granite const clusterStats: (ClusterStats | null)[] = Array.from({ length: nClusters }, (_, c) => { const members = sample.filter((_, i) => assignments[i] === c); if (!members.length) return null; const ltvMap: Record = { High: 1, Medium: 0.5, Low: 0 }; const avgLtv = members.reduce((s: number, l: any) => s + (ltvMap[l.predicted_ltv_class] ?? 0), 0) / members.length; const avgEng = members.reduce((s: number, l: any) => s + (l.engagement_score || 0), 0) / members.length; const avgDays = members.reduce((s: number, l: any) => s + (l.days_since_lead || 0), 0) / members.length; const freq = (arr: string[]) => arr.length ? [...arr.reduce((m,s) => m.set(s,(m.get(s)||0)+1), new Map())].sort((a,b)=>b[1]-a[1])[0]?.[0] : null; return { c, size: members.length, pct: Math.round(members.length / sample.length * 100), avgLtv, avgEng, avgDays, topSource: freq(members.map((l: any) => l.utm_source).filter(Boolean)) || 'direto', topState: freq(members.map((l: any) => l.state).filter(Boolean)) || 'BR', topIntent: freq(members.map((l: any) => l.intention_level).filter(Boolean)) || 'desconhecida', }; }).filter(Boolean) as ClusterStats[]; // Type guard function to filter null values function isNotNull(value: T | null): value is T { return value !== null; } const validClusterStats = clusterStats.filter(isNotNull); // Granite apenas para nomear segmentos const namingPrompt = `Você é especialista em segmentação de clientes. Dê um nome descritivo em português e uma recomendação de campanha para cada segmento. Retorne SOMENTE JSON válido: {"segments":[{"cluster_id":0,"name":"...","action":"..."},...]} ${validClusterStats.map(s => `Cluster ${s.c}: LTV=${s.avgLtv.toFixed(2)}, engajamento=${s.avgEng.toFixed(0)}, intenção="${s.topIntent}", origem="${s.topSource}", estado="${s.topState}", recência=${s.avgDays.toFixed(0)} dias, tamanho=${s.size}`).join('\n')}`; const nameRes = await env.AI.run('@cf/ibm-granite/granite-4.0-h-micro', { messages: [{ role: 'user', content: namingPrompt }], max_tokens: 800 }); let clusterNames: Record = {}; try { const m = ((nameRes as any)?.response || '').match(/\{[\s\S]*\}/); if (m) { const parsed = JSON.parse(m[0]); (parsed.segments || []).forEach((s: any) => { if (typeof s.cluster_id === 'number') { clusterNames[s.cluster_id] = { cluster_id: s.cluster_id, name: s.name || `Segmento ${s.cluster_id + 1}`, action: s.action || '', }; } }); } } catch { /* usa nomes fallback */ } const duration = Date.now() - startTime; const clusters: Cluster[] = validClusterStats.map(s => ({ cluster_id: s.c, name: clusterNames[s.c]?.name || `Segmento ${s.c + 1}`, size: s.size, percentage: s.pct, action_recommendation: clusterNames[s.c]?.action || '', characteristics: { avg_ltv_class: s.avgLtv, avg_engagement_score: s.avgEng, avg_intention_level: s.avgLtv, avg_days_since_lead: s.avgDays, dominant_countries: ['BR'], dominant_states: [s.topState || 'BR'], dominant_utm_sources: [s.topSource || 'direto'], top_features: ['ltv', 'engagement', 'intention'], }, })); await env.DB.prepare(`UPDATE ml_segments SET is_active = 0 WHERE clustering_algorithm = ? AND client_vertical = ? AND is_active = 1`).bind(algorithm, clientVertical).run(); const now = new Date().toISOString(); for (const cluster of clusters) { const ch = cluster.characteristics; await env.DB.prepare(` INSERT INTO ml_segments ( cluster_id, cluster_name, clustering_algorithm, client_vertical, size, percentage, avg_ltv_class, avg_behavior_score, avg_engagement_score, avg_intention_level, avg_days_since_lead, dominant_countries, dominant_states, dominant_utm_sources, dominant_features, silhouette_score, action_recommendations, bid_recommendations, campaign_recommendations, is_active, created_at, updated_at ) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,1,?,?) `).bind( cluster.cluster_id, cluster.name, algorithm, clientVertical, cluster.size, cluster.percentage, ch.avg_ltv_class, ch.avg_engagement_score, ch.avg_engagement_score, ch.avg_intention_level, ch.avg_days_since_lead, JSON.stringify(ch.dominant_countries), JSON.stringify(ch.dominant_states), JSON.stringify(ch.dominant_utm_sources), JSON.stringify(ch.top_features), silhouetteScore, JSON.stringify([cluster.action_recommendation]), JSON.stringify([]), JSON.stringify([]), now, now, ).run(); } try { await env.DB.prepare(` INSERT INTO ml_clustering_history (clustering_id, started_at, completed_at, algorithm, n_leads_processed, n_clusters_created, total_duration_ms, workers_ai_neurons_used, status, parameters, results_summary) VALUES (0, ?, datetime('now'), ?, ?, ?, ?, ?, 'completed', ?, ?) `).bind(new Date(startTime).toISOString(), algorithm, leads.length, clusters.length, duration, Math.ceil(duration * 0.01), JSON.stringify({ algorithm, n_clusters: nClusters, vertical: clientVertical, engine: 'embeddinggemma-300m+kmeans' }), JSON.stringify({ clusters: clusters.length, silhouette: silhouetteScore }), ).run(); } catch (e: any) { console.error('[Segmentation] history log error:', e?.message || String(e)); } return new Response(JSON.stringify({ success: true, algorithm, engine: 'embeddinggemma-300m + kmeans vetorial', n_clusters: clusters.length, client_vertical: clientVertical, leads_analyzed: leads.length, sample_embedded: sample.length, duration_ms: duration, silhouette_score: silhouetteScore, clusters, generated_at: now, }), { status: 200, headers }); } catch (err: any) { console.error('[Segmentation] cluster error:', err?.message || String(err)); try { if (env.DB) await env.DB.prepare(` INSERT INTO ml_clustering_history (clustering_id, started_at, algorithm, n_leads_processed, n_clusters_created, total_duration_ms, workers_ai_neurons_used, status, error_message, parameters, results_summary) VALUES (0, datetime('now'), ?, 0, 0, 0, 0, 'failed', ?, ?, '{}') `).bind(algorithm, err?.message || String(err), JSON.stringify({ algorithm, n_clusters: nClusters })).run(); } catch { /* não bloquear */ } return new Response(JSON.stringify({ error: 'Erro ao executar clustering', message: err?.message || String(err) }), { status: 500, headers }); } } // ── GET /api/segmentation/list ──────────────────────────────────────────────── export async function handleSegmentationList(env: Env, request: Request, headers: Headers): Promise { if (!env.DB) return new Response(JSON.stringify({ error: 'DB não configurado' }), { status: 503, headers }); const url = new URL(request.url); const algorithm = url.searchParams.get('algorithm') || null; const vertical = url.searchParams.get('vertical') || null; try { const conditions: string[] = ['is_active = 1']; const bindings: (string | number)[] = []; if (algorithm) { conditions.push('clustering_algorithm = ?'); bindings.push(algorithm); } if (vertical) { conditions.push('client_vertical = ?'); bindings.push(vertical); } const result = await env.DB.prepare(` SELECT id, cluster_id, cluster_name, clustering_algorithm, client_vertical, size, percentage, avg_ltv_class, avg_behavior_score, avg_engagement_score, avg_intention_level, avg_days_since_lead, silhouette_score, dominant_countries, dominant_states, dominant_utm_sources, dominant_features, action_recommendations, bid_recommendations, campaign_recommendations, is_active, created_at, updated_at FROM ml_segments WHERE ${conditions.join(' AND ')} ORDER BY created_at DESC LIMIT 50 `).bind(...bindings).all(); const segments = (result.results || []).map((s: any) => ({ ...s, dominant_countries: tryParseJson(s.dominant_countries, []), dominant_states: tryParseJson(s.dominant_states, []), dominant_utm_sources: tryParseJson(s.dominant_utm_sources, []), dominant_features: tryParseJson(s.dominant_features, []), action_recommendations: tryParseJson(s.action_recommendations, []), bid_recommendations: tryParseJson(s.bid_recommendations, []), campaign_recommendations: tryParseJson(s.campaign_recommendations, []), })); return new Response(JSON.stringify({ success: true, total: segments.length, segments }), { status: 200, headers }); } catch (err: any) { console.error('[Segmentation] list error:', err?.message || String(err)); return new Response(JSON.stringify({ error: err?.message || String(err) }), { status: 500, headers }); } } // ── GET /api/segmentation/outliers ─────────────────────────────────────────── export async function handleSegmentationOutliers(env: Env, request: Request, headers: Headers): Promise { if (!env.DB) return new Response(JSON.stringify({ error: 'DB não configurado' }), { status: 503, headers }); const url = new URL(request.url); const limit = Math.min(parseInt(url.searchParams.get('limit') || '50'), 200); const days = parseInt(url.searchParams.get('days') || '30'); try { const result = await env.DB.prepare(` SELECT msm.lead_id, msm.cluster_id, msm.confidence, msm.is_outlier, msm.outlier_reason, msm.assigned_at, l.email, l.phone, l.country, l.state, l.city, l.utm_source, l.bot_score, l.engagement_score, l.intention_level, l.created_at AS lead_created_at FROM ml_segment_members msm LEFT JOIN leads l ON CAST(msm.lead_id AS INTEGER) = l.id WHERE msm.is_outlier = 1 AND msm.assigned_at >= datetime('now', '-' || ? || ' days') ORDER BY msm.assigned_at DESC LIMIT ? `).bind(days, limit).all(); return new Response(JSON.stringify({ success: true, total: (result.results || []).length, period_days: days, outliers: result.results || [] }), { status: 200, headers }); } catch (err: any) { console.error('[Segmentation] outliers error:', err?.message || String(err)); return new Response(JSON.stringify({ error: err?.message || String(err) }), { status: 500, headers }); } } // ── PUT /api/segmentation/update ───────────────────────────────────────────── export async function handleSegmentationUpdate(env: Env, request: Request, headers: Headers): Promise { if (!env.DB) return new Response(JSON.stringify({ error: 'DB não configurado' }), { status: 503, headers }); let body: any; try { body = await request.json(); } catch { return new Response(JSON.stringify({ error: 'JSON inválido no body da requisição' }), { status: 400, headers }); } const { cluster_id, action_recommendations, bid_recommendations, campaign_recommendations } = body; if (cluster_id === undefined || cluster_id === null) { return new Response(JSON.stringify({ error: 'cluster_id é obrigatório' }), { status: 400, headers }); } try { const sets: string[] = []; const bindings: (string | number)[] = []; if (action_recommendations !== undefined) { sets.push('action_recommendations = ?'); bindings.push(JSON.stringify(action_recommendations)); } if (bid_recommendations !== undefined) { sets.push('bid_recommendations = ?'); bindings.push(JSON.stringify(bid_recommendations)); } if (campaign_recommendations !== undefined) { sets.push('campaign_recommendations = ?'); bindings.push(JSON.stringify(campaign_recommendations)); } if (sets.length === 0) { return new Response(JSON.stringify({ error: 'Nenhum campo válido para atualizar (action_recommendations, bid_recommendations, campaign_recommendations)' }), { status: 400, headers }); } sets.push("updated_at = datetime('now')"); bindings.push(cluster_id); await env.DB.prepare(`UPDATE ml_segments SET ${sets.join(', ')} WHERE id = ?`).bind(...bindings).run(); return new Response(JSON.stringify({ success: true, cluster_id, fields_updated: sets.length - 1 }), { status: 200, headers }); } catch (err: any) { console.error('[Segmentation] update error:', err?.message || String(err)); return new Response(JSON.stringify({ error: err?.message || String(err) }), { status: 500, headers }); } }