All files / web/src/lib/seed embedding-search.ts

0% Statements 0/159
0% Branches 0/1
0% Functions 0/1
0% Lines 0/159

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160                                                                                                                                                                                                                                                                                                                               
import { createHash } from 'crypto'
import { generateEmbedding, generateEmbeddings } from '@/lib/flowcharts/embedding'
import { cosineSimilarity } from '@/lib/flowcharts/embedding-search'
import { getProfileInfoList } from './profiles'
import type { ProfileInfo } from './types'

/**
 * Cached profile embeddings with content hash for staleness detection.
 *
 * All four variables below are module-level state shared by every function
 * in this file; they are filled together when embeddings are generated and
 * cleared together by `regenerateEmbeddings`.
 */
// Profile name -> embedding vector. `null` until a generation run has completed.
let profileEmbeddings: Map<string, Float32Array> | null = null
// Promise for the current generation run, shared so concurrent callers reuse it.
let embeddingPromise: Promise<Map<string, Float32Array>> | null = null
// Value of `computeContentHash()` recorded when the cache was filled.
let cachedContentHash: string | null = null
// Time at which the cache was filled.
let cachedAt: Date | null = null

/**
 * Render a profile as the text that gets embedded.
 *
 * Every field is written on its own line behind a human-readable label so
 * the embedding model can tell which role each value plays (the same
 * approach the flowchart embedding system uses).
 *
 * The session-mode line is emitted only when `expectedSessionMode` is
 * truthy, and the tags line only when the profile has at least one tag.
 *
 * @param profile - Profile to describe.
 * @returns The labelled lines joined with `\n`.
 */
function buildProfileContent(profile: ProfileInfo): string {
  return [
    `Profile: ${profile.name}`,
    `Description: ${profile.description}`,
    `Category: ${formatCategory(profile.category)}`,
    // Optional lines: spreading an empty array contributes nothing.
    ...(profile.expectedSessionMode
      ? [`Expected Session Mode: ${profile.expectedSessionMode}`]
      : []),
    ...(profile.tags.length > 0 ? [`Tags: ${profile.tags.join(', ')}`] : []),
    `Practicing Skills: ${profile.practicingSkillCount}`,
    `Testing Notes: ${profile.intentionNotes}`,
  ].join('\n')
}

/**
 * Translate a profile category key into the human-readable label used in
 * the embedded text.
 *
 * The switch deliberately has no `default` branch so the compiler keeps
 * checking that every member of the category union is handled.
 *
 * @param category - Category key of a profile.
 * @returns The display label for that category.
 */
function formatCategory(category: ProfileInfo['category']): string {
  switch (category) {
    case 'edge': {
      return 'Edge Cases & Data Robustness'
    }
    case 'session': {
      return 'Session Mode Triggers'
    }
    case 'bkt': {
      return 'Skill Mastery Levels'
    }
  }
}

/**
 * Fingerprint the embeddable text of every profile so changes can be detected.
 *
 * The text of each profile (see `buildProfileContent`) is joined with a
 * `---` separator line and hashed with SHA-256; only the first 16 hex
 * characters of the digest are kept.
 *
 * @returns A 16-character lowercase hex string.
 */
function computeContentHash(): string {
  const sections: string[] = []
  for (const profile of getProfileInfoList()) {
    sections.push(buildProfileContent(profile))
  }

  const hasher = createHash('sha256')
  hasher.update(sections.join('\n---\n'))
  const fullDigest = hasher.digest('hex')
  return fullDigest.slice(0, 16)
}

/**
 * Lazily compute and cache embeddings for all seed profiles.
 *
 * Uses a single batched `generateEmbeddings` call for efficiency. The
 * in-flight promise is shared so concurrent calls don't duplicate work.
 * If generation fails, the shared promise is discarded so the next call
 * retries instead of replaying the same rejection forever.
 *
 * @returns Map from profile name to its embedding vector.
 * @throws Whatever `generateEmbeddings` rejects with, or an `Error` when the
 *   number of embeddings returned differs from the number of profiles.
 */
async function getProfileEmbeddings(): Promise<Map<string, Float32Array>> {
  if (profileEmbeddings) return profileEmbeddings

  if (!embeddingPromise) {
    const pending = (async () => {
      const profiles = getProfileInfoList()
      const contents = profiles.map(buildProfileContent)
      // Snapshot the hash before awaiting, in the same tick the content was
      // built, so the recorded hash describes exactly what was embedded.
      const contentHash = computeContentHash()
      const embeddings = await generateEmbeddings(contents)

      // A short or long result would otherwise store `undefined` vectors (or
      // drop some) without any error until a later search misbehaves.
      if (embeddings.length !== profiles.length) {
        throw new Error(
          `Expected ${profiles.length} profile embeddings but received ${embeddings.length}`
        )
      }

      const cache = new Map<string, Float32Array>()
      for (let i = 0; i < profiles.length; i++) {
        cache.set(profiles[i].name, embeddings[i])
      }

      profileEmbeddings = cache
      cachedContentHash = contentHash
      cachedAt = new Date()
      return cache
    })()

    embeddingPromise = pending

    // Callers awaiting `pending` still observe the rejection; this handler
    // only clears the shared slot. The identity check avoids clobbering a
    // newer run started after `regenerateEmbeddings` reset the state.
    void pending.catch(() => {
      if (embeddingPromise === pending) {
        embeddingPromise = null
      }
    })
  }

  return embeddingPromise
}

/**
 * One match returned by `searchProfiles`.
 */
export interface ProfileSearchResult {
  /** Name of the matching seed profile. */
  name: string
  /** Cosine similarity between the query embedding and the profile embedding. */
  similarity: number
}

/**
 * Snapshot of the embedding cache as reported by `getEmbeddingStatus`.
 */
export interface EmbeddingStatus {
  /** Whether profile embeddings are currently held in memory. */
  cached: boolean
  /** True when a cached hash exists and differs from `currentHash`. */
  stale: boolean
  /** Number of seed profiles currently defined. */
  profileCount: number
  /** Content hash recorded when the cache was filled, or `null` if none. */
  cachedHash: string | null
  /** Content hash of the profiles as they are right now. */
  currentHash: string
  /** ISO 8601 timestamp of when the cache was filled, or `null` if none. */
  cachedAt: string | null
}

/**
 * Report the state of the embedding cache.
 *
 * The cache counts as stale when a hash was recorded at generation time and
 * it no longer matches the hash of the current profile content. An empty
 * cache is never reported as stale.
 *
 * @returns A snapshot of the cache flags, hashes, profile count and fill time.
 */
export function getEmbeddingStatus(): EmbeddingStatus {
  const currentHash = computeContentHash()
  const profileCount = getProfileInfoList().length

  const hasEmbeddings = profileEmbeddings !== null
  const hashChanged = cachedContentHash !== null && cachedContentHash !== currentHash
  const filledAt = cachedAt === null ? null : cachedAt.toISOString()

  return {
    cached: hasEmbeddings,
    stale: hashChanged,
    profileCount,
    cachedHash: cachedContentHash,
    currentHash,
    cachedAt: filledAt,
  }
}

/**
 * Throw away all cached embedding state and build it again from scratch.
 *
 * @returns The cache status as observed right after regeneration finished.
 */
export async function regenerateEmbeddings(): Promise<EmbeddingStatus> {
  // Forget when and from which content the previous cache was built.
  cachedAt = null
  cachedContentHash = null
  // With both the map and the shared promise cleared, the next
  // getProfileEmbeddings() call starts a new generation run.
  embeddingPromise = null
  profileEmbeddings = null

  await getProfileEmbeddings()

  const refreshedStatus = getEmbeddingStatus()
  return refreshedStatus
}

/**
 * Find the seed profiles that best match a natural-language query.
 *
 * The query is embedded while the profile embeddings are loaded, then every
 * profile is scored by cosine similarity against the query.
 *
 * @param query - Free-text search query.
 * @param options - `limit` caps the number of results (default 21);
 *   `minSimilarity` is the lowest score a profile may have and still be
 *   returned (default 0.3).
 * @returns Matches ordered from most to least similar.
 */
export async function searchProfiles(
  query: string,
  options: { limit?: number; minSimilarity?: number } = {}
): Promise<ProfileSearchResult[]> {
  const { limit: maxResults = 21, minSimilarity: threshold = 0.3 } = options

  const [queryVector, embeddingsByName] = await Promise.all([
    generateEmbedding(query),
    getProfileEmbeddings(),
  ])

  // Score every profile, drop those under the threshold, best matches first.
  return Array.from(
    embeddingsByName,
    ([name, vector]): ProfileSearchResult => ({
      name,
      similarity: cosineSimilarity(queryVector, vector),
    })
  )
    .filter((match) => match.similarity >= threshold)
    .sort((left, right) => right.similarity - left.similarity)
    .slice(0, maxResults)
}