flowchart-workshop test-case-validator.ts

65.2% Statements 356/546
48.64% Branches 18/37
58.33% Functions 7/12
65.2% Lines 356/546
Press n or j to go to the next uncovered block, b, p or k for the previous block.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547 1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
9x
9x
9x
9x
9x
63x
9x
9x
9x
9x
9x
9x
5x
9x
4x
4x
9x
9x
9x
 
9x
9x
9x
63x
54x
54x
63x
9x
9x
1x
1x
1x
1x
1x
7x
7x
7x
7x
7x
7x
7x
7x
7x
7x
7x
7x
7x
140x
140x
140x
 
 
 
 
140x
7x
7x
7x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
1x
5x
5x
5x
5x
5x
5x
5x
5x
5x
 
 
5x
5x
5x
5x
5x
5x
5x
5x
5x
5x
5x
5x
5x
5x
5x
5x
 
 
 
 
 
5x
1x
1x
1x
1x
1x
1x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1x
1x
1x
1x
1x
1x
2x
2x
2x
2x
 
 
 
 
 
 
 
 
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
 
 
 
 
 
 
 
 
2x
1x
1x
1x
1x
1x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1x
1x
1x
1x
1x
1x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
 
 
 
 
2x
1x
1x
1x
1x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
 
 
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
1x
1x
1x
1x
1x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
2x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2x
2x
2x
2x
2x
 
 
 
1x
1x
1x
1x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1x
1x
1x
1x
1x
1x
1x
1x
1x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1x
1x
1x
1x
1x
 
 
 
 
 
  /**
 * Test Case Validator for Flowchart Workshop
 *
 * Validates that display.answer produces the expected output for each test case.
 * This catches bugs like "4 1/1" instead of "5" before users encounter them.
 *
 * @module flowchart-workshop/test-case-validator
 */
 
import type {
  FlowchartDefinition,
  ProblemExample,
  ProblemValue,
  VariableDefinition,
  StateSnapshot,
} from '../flowcharts/schema'
import { evaluate, type EvalContext } from '../flowcharts/evaluator'
import { analyzeFlowchart, type FlowchartPath } from '../flowcharts/path-analysis'
import type { ExecutableFlowchart } from '../flowcharts/schema'
import { loadFlowchart, simulateWalk, extractAnswer } from '../flowcharts/loader'
 
// =============================================================================
// Types
// =============================================================================
 
/**
 * Outcome of running one example (test case) against a flowchart.
 *
 * Produced by `runTestCase` and `runTestCaseWithFlowchart`.
 */
export interface TestResult {
  /** The example that was tested (returned as passed in, not copied) */
  example: ProblemExample
  /** The answer the flowchart produced, or null when none could be computed */
  actualAnswer: string | null
  /** The expected answer from the test case ('' when the example defines none) */
  expectedAnswer: string
  /** Whether the test passed (actual === expected after trimming whitespace on both sides) */
  passed: boolean
  /** Error message when the example has no expectedAnswer or evaluation threw */
  error?: string
  /** Computation trace snapshots (only set when run with an ExecutableFlowchart) */
  snapshots?: StateSnapshot[]
}
 
/**
 * Report of how many enumerated flowchart paths are exercised by test cases.
 *
 * `validateTestCases` returns this with every count set to zero because it has
 * no executable flowchart to enumerate paths from; `checkCoverage` fills it in.
 */
export interface CoverageReport {
  /** Total number of unique paths enumerated through the flowchart */
  totalPaths: number
  /** Number of paths matched by at least one test case */
  coveredPaths: number
  /** Human-readable descriptions of paths not matched by any test case */
  uncoveredPaths: string[]
  /** Percentage of paths covered, rounded to the nearest integer */
  coveragePercent: number
  /** Whether path enumeration hit the safety limit (flowchart has more paths than counted) */
  pathsLimitReached?: boolean
}
 
/**
 * Complete validation report for a flowchart's test cases.
 */
export interface ValidationReport {
  /** Whether every executed test passed (true when no tests were executed) */
  passed: boolean
  /** One result per example that defines an expectedAnswer */
  results: TestResult[]
  /** Path coverage information */
  coverage: CoverageReport
  /** Summary counts derived from `results` */
  summary: {
    /** Number of executed test cases */
    total: number
    /** Number of results with `passed` set */
    passed: number
    /** Number of results that did not pass and carry no error (wrong answer) */
    failed: number
    /** Number of results that carry an error message */
    errors: number
  }
}
 
// =============================================================================
// Core Validation Functions
// =============================================================================
 
/**
 * Normalize example values to fix common LLM output issues:
 * 1. Strip one pair of matching wrapper quotes from strings ("'+'" -> "+")
 * 2. Convert numeric strings to actual numbers ("1" -> 1, " 2.5 " -> 2.5)
 *
 * Non-string values are copied through unchanged. The input object is never
 * mutated; a new record is returned.
 *
 * @param values - Raw example values keyed by problem field name
 * @returns A new record with string values cleaned up as described above
 */
function normalizeExampleValues(
  values: Record<string, ProblemValue>
): Record<string, ProblemValue> {
  const normalized: Record<string, ProblemValue> = {}

  for (const [key, value] of Object.entries(values)) {
    if (typeof value !== 'string') {
      normalized[key] = value
      continue
    }

    let processed = value.trim()

    // Strip wrapper quotes if present (LLM sometimes outputs "'+'" instead of "+").
    // Requiring at least two characters keeps a lone quote character intact:
    // it would otherwise match as both the opening and the closing quote and
    // be reduced to an empty string.
    const quote = processed.charAt(0)
    if (
      processed.length >= 2 &&
      (quote === "'" || quote === '"') &&
      processed.endsWith(quote)
    ) {
      processed = processed.slice(1, -1)
    }

    // Number() coerces blank strings to 0, so a quoted blank such as "' '"
    // must be excluded explicitly or it would silently become the number 0.
    const isNumeric = processed.trim() !== '' && !Number.isNaN(Number(processed))
    normalized[key] = isNumeric ? Number(processed) : processed
  }

  return normalized
}
 
/**
 * Build the initial `computed` map for a set of problem values by evaluating
 * each variable's `init` expression in declaration order.
 *
 * The same `computed` object is shared with the evaluation context, so a
 * variable's expression can read variables initialized before it. A variable
 * whose expression throws is logged and stored as null so that the remaining
 * variables still get initialized.
 */
function initializeComputed(
  variables: Record<string, VariableDefinition>,
  problemValues: Record<string, ProblemValue>
): Record<string, ProblemValue> {
  const computed: Record<string, ProblemValue> = {}
  const context: EvalContext = { problem: problemValues, computed, userState: {} }

  for (const name of Object.keys(variables)) {
    const definition = variables[name]
    let initialValue: ProblemValue
    try {
      initialValue = evaluate(definition.init, context)
    } catch (err) {
      // Best effort: report the failure and keep going with a null placeholder
      console.warn(`Failed to initialize variable ${name}:`, err)
      initialValue = null as unknown as ProblemValue
    }
    computed[name] = initialValue
  }

  return computed
}
 
/**
 * Compute the answer string for a problem by evaluating the flowchart's
 * `display.answer` expression.
 * This is THE canonical function for computing answers from flowcharts.
 * Used by both worksheet generation and test validation.
 *
 * @param definition - The flowchart definition
 * @param exampleValues - Problem input values (normalized before use)
 * @returns `answer` as a string on success; otherwise `answer: null` plus an
 *   `error` message (missing `display.answer`, or the evaluation threw)
 */
export function evaluateDisplayAnswer(
  definition: FlowchartDefinition,
  exampleValues: Record<string, ProblemValue>
): { answer: string | null; error?: string } {
  // Clean up LLM quirks first: numeric strings become numbers and wrapper
  // quotes are stripped ("'+'" -> "+")
  const problem = normalizeExampleValues(exampleValues)

  // Every flowchart must define display.answer
  const answerExpression = definition.display?.answer
  if (!answerExpression) {
    return { answer: null, error: 'No display.answer defined' }
  }

  try {
    // A flowchart without variables simply gets an empty computed map
    const computed = initializeComputed(definition.variables || {}, problem)
    const context: EvalContext = { problem, computed, userState: {} }
    return { answer: String(evaluate(answerExpression, context)) }
  } catch (err) {
    const error = err instanceof Error ? err.message : 'Evaluation failed'
    return { answer: null, error }
  }
}
 
/**
 * Run one test case against a bare FlowchartDefinition.
 * Intended for quick validation when no executable flowchart is available.
 *
 * @param definition - The flowchart definition whose display.answer is evaluated
 * @param example - The example to test; it must define an expectedAnswer
 * @returns The test result; `error` is set when the example has no
 *   expectedAnswer or when answer evaluation reports an error
 */
export function runTestCase(definition: FlowchartDefinition, example: ProblemExample): TestResult {
  const expected = example.expectedAnswer

  // Nothing to compare against
  if (!expected) {
    return {
      example,
      actualAnswer: null,
      expectedAnswer: '',
      passed: false,
      error: 'No expectedAnswer defined for this test case',
    }
  }

  const evaluation = evaluateDisplayAnswer(definition, example.values)

  // Evaluation problems are reported as errors, not as wrong answers
  if (evaluation.error) {
    return {
      example,
      actualAnswer: null,
      expectedAnswer: expected,
      passed: false,
      error: evaluation.error,
    }
  }

  // Surrounding whitespace is ignored on both sides of the comparison
  const actual = evaluation.answer
  const matches = (actual?.trim() ?? '') === expected.trim()

  return {
    example,
    actualAnswer: actual,
    expectedAnswer: expected,
    passed: matches,
  }
}
 
/**
 * Run one test case against an ExecutableFlowchart.
 * The answer comes from simulateWalk + extractAnswer, so it is computed the
 * same way regardless of caller.
 *
 * @param flowchart - The executable flowchart to walk
 * @param example - The example to test; it must define an expectedAnswer
 * @returns The test result including the walk's snapshots on success; `error`
 *   is set when the example has no expectedAnswer or anything in the walk threw
 */
export function runTestCaseWithFlowchart(
  flowchart: ExecutableFlowchart,
  example: ProblemExample
): TestResult {
  const expected = example.expectedAnswer

  // Nothing to compare against
  if (!expected) {
    return {
      example,
      actualAnswer: null,
      expectedAnswer: '',
      passed: false,
      error: 'No expectedAnswer defined for this test case',
    }
  }

  try {
    const terminalState = simulateWalk(flowchart, normalizeExampleValues(example.values))
    const extracted = extractAnswer(flowchart, terminalState)

    // An empty answer text is reported as null
    const actual = extracted.display.text || null

    // Surrounding whitespace is ignored on both sides of the comparison
    const matches = (actual?.trim() ?? '') === expected.trim()

    return {
      example,
      actualAnswer: actual,
      expectedAnswer: expected,
      passed: matches,
      snapshots: terminalState.snapshots,
    }
  } catch (err) {
    const message = err instanceof Error ? err.message : 'Evaluation failed'
    return {
      example,
      actualAnswer: null,
      expectedAnswer: expected,
      passed: false,
      error: message,
    }
  }
}
 
/**
 * Run every example that defines an expectedAnswer against a flowchart
 * definition and summarize the outcome.
 *
 * Path coverage cannot be computed without an executable flowchart, so the
 * returned coverage section is zeroed out.
 */
export function validateTestCases(definition: FlowchartDefinition): ValidationReport {
  const results: TestResult[] = []
  let passedCount = 0
  let failedCount = 0
  let errorCount = 0

  for (const example of definition.problemInput.examples || []) {
    // Examples without an expectedAnswer are not test cases
    if (!example.expectedAnswer) continue

    const result = runTestCase(definition, example)
    results.push(result)

    if (result.passed) passedCount += 1
    if (result.error) errorCount += 1
    // "failed" means a wrong answer: not passed, but no error either
    if (!result.passed && !result.error) failedCount += 1
  }

  return {
    passed: passedCount === results.length,
    results,
    // Placeholder coverage - real numbers need an executable flowchart
    coverage: {
      totalPaths: 0,
      coveredPaths: 0,
      uncoveredPaths: [],
      coveragePercent: 0,
    },
    summary: {
      total: results.length,
      passed: passedCount,
      failed: failedCount,
      errors: errorCount,
    },
  }
}
 
/**
 * Validate test cases and compute path coverage.
 * Answers are computed through simulateWalk + extractAnswer (via
 * runTestCaseWithFlowchart) on a flowchart built from the definition and
 * its mermaid content.
 *
 * If anything in that process throws - building the flowchart, running the
 * tests, or computing coverage - a warning is logged and the result of the
 * definition-only `validateTestCases` is returned instead.
 */
export async function validateTestCasesWithCoverage(
  definition: FlowchartDefinition,
  mermaidContent: string
): Promise<ValidationReport> {
  try {
    const flowchart = await loadFlowchart(definition, mermaidContent)
    const examples = definition.problemInput.examples || []

    const results: TestResult[] = []
    let passedCount = 0
    let failedCount = 0
    let errorCount = 0

    for (const example of examples) {
      // Examples without an expectedAnswer are not test cases
      if (!example.expectedAnswer) continue

      const result = runTestCaseWithFlowchart(flowchart, example)
      results.push(result)

      if (result.passed) passedCount += 1
      if (result.error) errorCount += 1
      // "failed" means a wrong answer: not passed, but no error either
      if (!result.passed && !result.error) failedCount += 1
    }

    // Coverage looks at the full example list and filters it itself
    const coverage = await checkCoverage(flowchart, examples)

    return {
      passed: passedCount === results.length,
      results,
      coverage,
      summary: {
        total: results.length,
        passed: passedCount,
        failed: failedCount,
        errors: errorCount,
      },
    }
  } catch (err) {
    // Fall back to basic validation when the accurate path is unavailable
    console.warn('Could not build flowchart for accurate validation:', err)
    return validateTestCases(definition)
  }
}
 
/**
 * Measure how many enumerated flowchart paths are hit by the given examples.
 *
 * Only examples that define an expectedAnswer count. Each one is matched to
 * at most one path; paths matched by no example are listed by description.
 * A flowchart with no enumerated paths is reported as 100% covered.
 */
export async function checkCoverage(
  flowchart: ExecutableFlowchart,
  examples: ProblemExample[]
): Promise<CoverageReport> {
  // Enumerate every path through the flowchart
  const analysis = analyzeFlowchart(flowchart)
  const paths = analysis.paths

  // Indices (into `paths`) that at least one example lands on
  const hitIndices = new Set<number>()
  for (const example of examples) {
    if (example.expectedAnswer) {
      const matchedIndex = findMatchingPath(flowchart, example, paths)
      if (matchedIndex !== -1) {
        hitIndices.add(matchedIndex)
      }
    }
  }

  // Describe each path nobody landed on, in enumeration order
  const uncoveredPaths: string[] = []
  let position = 0
  for (const path of paths) {
    if (!hitIndices.has(position)) {
      uncoveredPaths.push(describeFlowchartPath(path, flowchart))
    }
    position += 1
  }

  const totalPaths = paths.length
  const coveredPaths = hitIndices.size

  let coveragePercent = 100
  if (totalPaths > 0) {
    coveragePercent = Math.round((coveredPaths / totalPaths) * 100)
  }

  return {
    totalPaths,
    coveredPaths,
    uncoveredPaths,
    coveragePercent,
    pathsLimitReached: analysis.pathsLimitReached,
  }
}
 
/**
 * Determine which enumerated path an example satisfies.
 *
 * The example's values are normalized and the flowchart's variables are
 * initialized once; every path's constraints are then evaluated against that
 * context. The first path whose constraints all hold wins.
 *
 * @returns The index of the first matching path, or -1 when none matches
 */
function findMatchingPath(
  flowchart: ExecutableFlowchart,
  example: ProblemExample,
  paths: FlowchartPath[]
): number {
  // Numeric strings become numbers, wrapper quotes are stripped
  const problem = normalizeExampleValues(example.values)

  // A flowchart without variables simply gets an empty computed map
  const context: EvalContext = {
    problem,
    computed: initializeComputed(flowchart.definition.variables || {}, problem),
    userState: {},
  }

  // True when every constraint on the path holds for this example
  const satisfiesAllConstraints = (path: FlowchartPath): boolean => {
    for (const constraint of path.constraints) {
      try {
        const value = evaluate(constraint.expression, context)
        // Multi-option decisions compare as strings; binary decisions as booleans
        const holds =
          constraint.requiredValue !== undefined
            ? String(value) === constraint.requiredValue
            : Boolean(value) === constraint.requiredOutcome
        if (!holds) {
          return false
        }
      } catch {
        // A constraint that cannot be evaluated counts as not satisfied
        return false
      }
    }
    return true
  }

  return paths.findIndex((path) => satisfiesAllConstraints(path))
}
 
/**
 * Build a short human-readable label for a flowchart path.
 *
 * Each constraint that belongs to a decision node contributes the chosen
 * option's pathLabel, or the first 15 characters of its label when there is
 * no pathLabel. Constraints carrying the '__skip__' / '__not_skipped__'
 * marker values contribute nothing. Returns 'Default path' when no
 * constraint contributed a label.
 */
function describeFlowchartPath(path: FlowchartPath, flowchart: ExecutableFlowchart): string {
  const labels: string[] = []

  for (const { nodeId, optionValue } of path.constraints) {
    const isSkipMarker = optionValue === '__skip__' || optionValue === '__not_skipped__'
    if (isSkipMarker) {
      continue
    }

    const node = flowchart.nodes[nodeId]
    if (node?.definition.type === 'decision') {
      const chosen = node.definition.options.find((o) => o.value === optionValue)
      // Prefer the dedicated path label; otherwise fall back to a truncated label
      const text = chosen?.pathLabel || chosen?.label?.slice(0, 15)
      if (text) {
        labels.push(text)
      }
    }
  }

  return labels.length === 0 ? 'Default path' : labels.join(' → ')
}
 
// =============================================================================
// LLM Feedback Formatting
// =============================================================================
 
/**
 * Render a validation report as markdown feedback an LLM can act on.
 *
 * A passing report yields a single confirmation sentence. Otherwise each
 * non-passing result gets its own section (inputs, expected answer, and
 * either the error or the actual answer plus a fix request), followed by a
 * list of uncovered paths when there are any.
 */
export function formatFailuresForLLM(report: ValidationReport): string {
  if (report.passed) {
    return 'All test cases passed!'
  }

  const output: string[] = ['## Test Case Failures\n']

  for (const result of report.results) {
    if (result.passed) continue

    output.push(
      `### "${result.example.name}"`,
      `- **Input values**: ${JSON.stringify(result.example.values)}`,
      `- **Expected answer**: "${result.expectedAnswer}"`
    )

    if (result.error) {
      output.push(`- **Error**: ${result.error}`)
    } else {
      // Wrong answer: show what came out and ask for a fix
      output.push(
        `- **Actual answer**: "${result.actualAnswer}"`,
        '',
        'Please fix the `display.answer` expression so it produces the expected output for these inputs.'
      )
    }

    // Blank line separating this section from the next
    output.push('')
  }

  const { uncoveredPaths } = report.coverage
  if (uncoveredPaths.length > 0) {
    output.push(
      '## Uncovered Paths',
      'The following flowchart paths are not covered by any test case. Consider adding examples:'
    )
    for (const description of uncoveredPaths) {
      output.push(`- ${description}`)
    }
  }

  return output.join('\n')
}
 
/**
 * Produce a one-line description of a test result for display in the UI.
 * An error message takes precedence over the expected/actual comparison.
 */
export function formatTestFailure(result: TestResult): string {
  const { error, expectedAnswer, actualAnswer } = result
  return error ? `Error: ${error}` : `Expected "${expectedAnswer}" but got "${actualAnswer}"`
}