import { initContract } from '@ts-rest/core';
import { z } from 'zod';
import { errorResponseSchema, insufficientTokensErrorSchema } from '../schemas/common.schemas';

const c = initContract();

/**
 * Intent shape matches mobile TransactionIntent
 * (record_sale, stock_query, restock, credit_summary, credit_payment).
 *
 * Only `type` is required. `productId` may be omitted or explicitly null;
 * the remaining fields are optional.
 */
export const voiceIntentSchema = z.object({
  type: z.enum(['record_sale', 'stock_query', 'restock', 'credit_summary', 'credit_payment']),
  productId: z.string().optional().nullable(),
  quantity: z.number().optional(),
  unitPrice: z.number().optional(),
  customerId: z.string().optional(),
  amount: z.number().optional(),
});
export type VoiceIntent = z.infer<typeof voiceIntentSchema>;

/**
 * One entry of a batch result. Always a `record_sale` with a required
 * `productId` and `quantity`; `unitPrice` is optional.
 */
const voiceBatchItemSchema = z.object({
  type: z.literal('record_sale'),
  productId: z.string(),
  quantity: z.number(),
  unitPrice: z.number().optional(),
});
export type VoiceBatchItem = z.infer<typeof voiceBatchItemSchema>;

/** Response body of `parseTranscript`: a single intent and/or a batch, each nullable. */
export const voiceParseResponseSchema = z.object({
  intent: voiceIntentSchema.nullable(),
  batchIntents: z.array(voiceBatchItemSchema).nullable(),
});
export type VoiceParseResponse = z.infer<typeof voiceParseResponseSchema>;

/** Response body of `transcribeAudio`: the transcript plus the same intent fields as the parse response. */
export const voiceTranscribeResponseSchema = z.object({
  transcript: z.string(),
  intent: voiceIntentSchema.nullable(),
  batchIntents: z.array(voiceBatchItemSchema).nullable(),
});
export type VoiceTranscribeResponse = z.infer<typeof voiceTranscribeResponseSchema>;

/** One flat intent per row for ActionPreview-style review (ambient / multi-turn transcript). */
export const voiceAmbientExtractResponseSchema = z.object({
  intents: z.array(voiceIntentSchema),
});
export type VoiceAmbientExtractResponse = z.infer<typeof voiceAmbientExtractResponseSchema>;

/** ts-rest contract for the shopkeeper voice endpoints (all POST, under /shopkeeper/voice). */
export const shopkeeperVoiceContract = c.router({
  /**
   * Parse transcript to intent. Rule-based first; LLM fallback when null.
   * Used by mobile when local parse fails.
   */
  parseTranscript: {
    method: 'POST',
    path: '/shopkeeper/voice/parse',
    body: z.object({
      transcript: z.string(),
      // Comma-separated shop product names for LLM entity resolution.
      productHints: z.string().optional(),
    }),
    responses: {
      200: voiceParseResponseSchema,
      401: errorResponseSchema,
      402: insufficientTokensErrorSchema,
    },
    summary: 'Parse voice transcript to transaction intent (keyword + optional LLM fallback)',
  },

  /**
   * Transcribe audio and parse intent in one round trip.
   * Mobile sends base64-encoded WAV (16kHz mono); API calls Whisper then VoiceParseService.
   * Used as fallback when local regex parse fails and Whisper accuracy is needed.
   * All AI calls stay server-side — no OpenAI key on device.
   */
  transcribeAudio: {
    method: 'POST',
    path: '/shopkeeper/voice/transcribe',
    body: z.object({
      audioBase64: z.string(),
      // 'en' | 'ha' | 'yo' | 'ig' (documented values only; the schema accepts any string).
      language: z.string().optional(),
      // Comma-separated shop product names for Whisper prompt.
      productHints: z.string().optional(),
    }),
    responses: {
      200: voiceTranscribeResponseSchema,
      401: errorResponseSchema,
      402: insufficientTokensErrorSchema,
      422: errorResponseSchema,
    },
    summary: 'Transcribe audio via Whisper and parse intent — single round trip, AI stays server-side',
  },

  /**
   * After ambient listening ends, send the full rolling transcript. LLM extracts every concrete
   * shop instruction (sales, stock, credit, restock), ignoring small talk.
   * Same intent shapes as /voice/parse.
   */
  extractAmbientIntents: {
    method: 'POST',
    path: '/shopkeeper/voice/extract-ambient',
    body: z.object({
      transcript: z.string(),
      productHints: z.string().optional(),
    }),
    responses: {
      200: voiceAmbientExtractResponseSchema,
      401: errorResponseSchema,
      402: insufficientTokensErrorSchema,
    },
    summary:
      'Extract structured transaction intents from a long conversation transcript (ambient listening hand-off)',
  },
});