All files / web/src/app/api/realtime/euclid/markup validation.ts

100% Statements 78/78
81.25% Branches 13/16
100% Functions 3/3
100% Lines 78/78

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 791x 1x 1x 1x 6x 6x 1x 1x 1x 1x 1x 1x 3x 3x 3x 3x 3x 3x 8x 8x 3x 3x 3x 8x 8x 4x 4x 4x 8x 3x 3x 3x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 12x 12x 12x 12x 12x 12x 12x 12x 12x 12x 12x 12x 12x 12x 12x 12x 12x 12x 12x 12x 12x 12x 12x 12x 12x 12x 12x 362x 362x 358x 358x 8x 8x  
/**
 * Matches a single inline entity marker of the form `{kind:id}` or
 * `{kind:id|text}`.
 *
 * - `kind` (the only capture group) is one of: seg, tri, ang, pt, def, post,
 *   cn, prop.
 * - `id` is one or more ASCII letters or digits.
 * - The optional `|text` part may contain any characters except `}`.
 *
 * NOTE: the `g` flag makes this RegExp stateful (it tracks `lastIndex`) when
 * used with `test()` or `exec()`. Reset `lastIndex` or build a fresh RegExp
 * from `MARKER_RE.source` if you need repeated independent tests.
 */
export const MARKER_RE = /\{(seg|tri|ang|pt|def|post|cn|prop):[A-Za-z0-9]+(?:\|[^}]*)?\}/g
 
/**
 * Split text into lowercase word tokens.
 *
 * The input is lowercased first; a token is then a maximal run of the
 * characters a–z, digits, or apostrophes. Everything else acts as a separator.
 * Returns an empty array when the text contains no token.
 */
export function words(text: string): string[] {
  const tokens = text.toLowerCase().match(/[a-z\d']+/g)
  if (tokens === null) return []
  return tokens
}
 
/**
 * Measure how much of the original's wording survives in the stripped text.
 *
 * The result is (matched original words) / (total original words), so it lies
 * in the range 0–1; a high value means most of the original prose was kept.
 * Matching is multiset-based: each word occurrence in `stripped` can satisfy
 * at most one occurrence in `original`. An original with no words yields 1.
 */
export function wordOverlapRatio(original: string, stripped: string): number {
  const sourceTokens = words(original)
  const total = sourceTokens.length
  if (total === 0) return 1

  // Remaining occurrence count for every word found in the stripped text.
  const remaining = new Map<string, number>()
  words(stripped).forEach((token) => {
    remaining.set(token, (remaining.get(token) ?? 0) + 1)
  })

  // Consume one occurrence from the bag for each original word that matches.
  const hits = sourceTokens.reduce((acc, token) => {
    const left = remaining.get(token) ?? 0
    if (left <= 0) return acc
    remaining.set(token, left - 1)
    return acc + 1
  }, 0)

  return hits / total
}
 
/**
 * Strict validation: the marked-up text, once its markers are expanded to
 * display text, must be a (case-insensitive) subsequence of the original.
 * This catches cases where a marker replaces non-matching text
 * (e.g., {pt:A} eating the word "point").
 *
 * Use for user-written text where we must preserve every word exactly.
 *
 * @param original - The text as originally written.
 * @param marked - The same text after markers were inserted.
 * @param expandMarkers - Expands markers in a string to their display text
 *   (the marker system's strip/expand function is expected here).
 * @returns `true` when the expansion is at least half as long as the original
 *   and is a subsequence of it; `false` otherwise.
 */
export function validateMarkupStrict(
  original: string,
  marked: string,
  expandMarkers: (text: string) => string
): boolean {
  let expanded = expandMarkers(marked)

  // Expanded text should be a significant portion of the original
  if (expanded.length < original.length * 0.5) return false

  // Strip trailing punctuation the model may have added (models love adding
  // periods). Trim trailing whitespace FIRST: otherwise "text.\n" or "text. "
  // keeps its period because `$` only matches at the very end of the string.
  expanded = expanded.trimEnd().replace(/[.!?]+$/, '')

  // Collapse runs of spaces so marker expansion doesn't break the subsequence
  // check against the original's single spaces.
  expanded = expanded.replace(/ {2,}/g, ' ').trim()
  const normalizedOriginal = original.replace(/ {2,}/g, ' ').trim()

  // The expanded text should be a subsequence of the original.
  // This allows the original to have extra chars (like △ before ABD) that the
  // marker expansion doesn't reproduce, while rejecting expansions containing
  // characters that cannot be found, in order, in the original.
  // Case-insensitive: users may type "db" but markers canonicalize to "DB".
  //
  // Both sides are compared code point by code point. Indexing the original by
  // UTF-16 code unit while iterating the expansion by code point would make
  // any astral character (emoji, mathematical letters) impossible to match.
  const originalChars = Array.from(normalizedOriginal.toLowerCase())
  let oi = 0
  for (const ch of expanded.toLowerCase()) {
    while (oi < originalChars.length && originalChars[oi] !== ch) oi++
    if (oi >= originalChars.length) return false
    oi++
  }
  return true
}