/**
 * Unit tests for the evaluator module: the `createEvaluator` factory and
 * `OpenAIEvaluator`. The `openai` package is replaced with a jest mock, so
 * every `chat.completions.create` call lands on `mockCreate`.
 */
import { OpenAIEvaluator, createEvaluator } from "../src/evaluator";
import { AgentRailsConfig } from "../src/types";

// Shared stub standing in for `chat.completions.create`.
// The `mock` name prefix matters: jest hoists `jest.mock` above the imports and
// only lets the factory reference outer variables whose names start with `mock`.
const mockCreate = jest.fn();

// Replace the OpenAI client's default export with a constructor whose
// instances expose only the `chat.completions.create` method the tests stub.
jest.mock("openai", () => ({
  __esModule: true,
  default: jest.fn().mockImplementation(() => ({
    chat: {
      completions: {
        create: mockCreate,
      },
    },
  })),
}));

/**
 * Stubs the mocked completion call to resolve with a single choice whose
 * message content is the JSON-encoded verdict `{ passed, reasoning }`.
 */
function mockVerdict(passed: boolean, reasoning: string): void {
  mockCreate.mockResolvedValue({
    choices: [
      {
        message: {
          content: JSON.stringify({ passed, reasoning }),
        },
      },
    ],
  });
}

/** Returns the request object passed to the first `create` call. */
function firstCallArgs() {
  return mockCreate.mock.calls[0][0];
}

/**
 * Returns the content of the second message of the first `create` call.
 * NOTE(review): presumably this is the user prompt that follows a system
 * message; confirm against the evaluator implementation.
 */
function firstCallPrompt(): string {
  return firstCallArgs().messages[1].content;
}

describe("Evaluator", () => {
  const mockConfig: AgentRailsConfig = {
    llm: {
      provider: "openai",
      apiKey: "test-api-key",
      model: "gpt-4-turbo-preview",
      temperature: 0.3,
    },
    // The agent is never invoked by these tests; a constant stub is enough.
    agent: async () => "response",
  };

  describe("createEvaluator", () => {
    it("should create an OpenAI evaluator", () => {
      const evaluator = createEvaluator(mockConfig);

      expect(evaluator).toBeInstanceOf(OpenAIEvaluator);
    });

    it("should throw error for unsupported provider", () => {
      // Deliberately typed `any`: the provider value is invalid on purpose so
      // the factory's runtime check can be exercised.
      const invalidConfig: any = {
        ...mockConfig,
        llm: { ...mockConfig.llm, provider: "unsupported" },
      };

      expect(() => createEvaluator(invalidConfig)).toThrow(
        "Unsupported LLM provider"
      );
    });
  });

  describe("OpenAIEvaluator", () => {
    let evaluator: OpenAIEvaluator;

    beforeEach(() => {
      evaluator = new OpenAIEvaluator(mockConfig);
      // mockReset (not mockClear) also drops the previous test's stubbed
      // resolve/reject value, so a test that forgets to stub cannot silently
      // inherit another test's LLM reply.
      mockCreate.mockReset();
    });

    it("should evaluate response as passed", async () => {
      mockVerdict(true, "Response is appropriate");

      const result = await evaluator.evaluate(
        "Hello",
        "Hi there!",
        "Should greet",
        ["Hello!", "Hi!"]
      );

      expect(result.passed).toBe(true);
      expect(result.reasoning).toBe("Response is appropriate");
      expect(mockCreate).toHaveBeenCalledTimes(1);
    });

    it("should evaluate response as failed", async () => {
      mockVerdict(false, "Response is not relevant");

      const result = await evaluator.evaluate(
        "What is 2+2?",
        "Hello",
        "Should answer math question"
      );

      expect(result.passed).toBe(false);
      expect(result.reasoning).toBe("Response is not relevant");
    });

    it("should handle structured input and output", async () => {
      mockVerdict(true, "Correctly processed structured data");

      const result = await evaluator.evaluate(
        { type: "query", data: "test" },
        { status: "success", result: "processed" }
      );

      expect(result.passed).toBe(true);
      expect(mockCreate).toHaveBeenCalledTimes(1);

      // Check that structured data was stringified in prompt
      const prompt = firstCallPrompt();
      expect(prompt).toContain("type");
      expect(prompt).toContain("query");
    });

    it("should throw error if LLM returns no content", async () => {
      // A choice whose message carries no `content` field at all.
      mockCreate.mockResolvedValue({
        choices: [{ message: {} }],
      });

      await expect(evaluator.evaluate("input", "output")).rejects.toThrow(
        "No response from LLM evaluator"
      );
    });

    it("should throw error if LLM call fails", async () => {
      mockCreate.mockRejectedValue(new Error("API Error"));

      await expect(evaluator.evaluate("input", "output")).rejects.toThrow(
        "LLM evaluation failed"
      );
    });

    it("should include expected behavior in prompt", async () => {
      mockVerdict(true, "Good");

      await evaluator.evaluate(
        "input",
        "output",
        "Expected behavior description"
      );

      const prompt = firstCallPrompt();
      expect(prompt).toContain("Expected Behavior");
      expect(prompt).toContain("Expected behavior description");
    });

    it("should include example responses in prompt", async () => {
      mockVerdict(true, "Good");

      await evaluator.evaluate("input", "output", undefined, [
        "Example 1",
        "Example 2",
      ]);

      const prompt = firstCallPrompt();
      expect(prompt).toContain("Example Appropriate Responses");
      expect(prompt).toContain("Example 1");
      expect(prompt).toContain("Example 2");
    });

    it("should use configured model and temperature", async () => {
      mockVerdict(true, "Good");

      await evaluator.evaluate("input", "output");

      const callArgs = firstCallArgs();
      expect(callArgs.model).toBe("gpt-4-turbo-preview");
      expect(callArgs.temperature).toBe(0.3);
    });
  });
});