From 6e1bcbf0f6d2af65eda2296ce72558e4523bd2ab Mon Sep 17 00:00:00 2001 From: babblebey Date: Tue, 3 Mar 2026 22:04:41 +0100 Subject: [PATCH 1/9] refactor: streamline dictionary loading and document creation process adding `slug` to metadata --- dev/seed-vector-store.js | 38 +++++++++++++++----------------------- 1 file changed, 15 insertions(+), 23 deletions(-) diff --git a/dev/seed-vector-store.js b/dev/seed-vector-store.js index 9beaa3ed..62d8ec23 100644 --- a/dev/seed-vector-store.js +++ b/dev/seed-vector-store.js @@ -1,26 +1,22 @@ import fetch from "node-fetch"; -import fs from "node:fs/promises"; -import { fileURLToPath } from "node:url"; -import { dirname, join } from "node:path"; -import { vectorStore } from "../apps/jai/index.js"; -import { JSONLoader } from "langchain/document_loaders/fs/json"; +import { Document } from "langchain/document"; import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"; +import { vectorStore } from "../apps/jai/index.js"; // Load the dictionary from the jargons.dev API const response = await fetch("https://jargons.dev/api/v1/browse"); const dictionary = await response.json(); - -// Save the dictionary to the file system -const path = join(dirname(fileURLToPath(import.meta.url)), "dictionary.json"); -await fs.writeFile(path, JSON.stringify(dictionary, null, 2)); -console.log(`Saved the dictionary file to ${path}`); - -// Load the dictionary from the file system -const loader = new JSONLoader("dev/dictionary.json", ["/title", "/content"]); - -// Load the documents -const docs = await loader.load(); -console.log(`Loaded ${docs.length} documents`); +console.log(`Fetched ${dictionary.length} words from the API`); + +// Create LangChain Documents with slug metadata +const docs = dictionary.map( + (word) => + new Document({ + pageContent: `${word.title}\n\n${word.content}`, + metadata: { slug: word.slug }, + }), +); +console.log(`Created ${docs.length} documents`); // Initialize the splitter const splitter = new 
RecursiveCharacterTextSplitter({ @@ -30,7 +26,7 @@ const splitter = new RecursiveCharacterTextSplitter({ // Split the documents const allSplits = await splitter.splitDocuments(docs); -console.log(`Split ${allSplits.length} documents`); +console.log(`Split into ${allSplits.length} chunks`); // Add the splits to the vector store in batches const batchSize = 100; @@ -43,8 +39,4 @@ for (let i = 0; i < allSplits.length; i += batchSize) { `Added batch ${batchNum} of ${totalBatches} (${batch.length} documents) to the vector store`, ); } -console.log(`Added ${allSplits.length} splits to the vector store`); - -// Clean up -await fs.rm(path); -console.log(`Cleaned up the dictionary file at ${path}`); \ No newline at end of file +console.log(`Added ${allSplits.length} splits to the vector store`); \ No newline at end of file From 13ca893cbd9208e17f0e9a77fdd21a92c18e4329 Mon Sep 17 00:00:00 2001 From: babblebey Date: Tue, 3 Mar 2026 22:06:23 +0100 Subject: [PATCH 2/9] feat: add incremental vector store update script for Qdrant --- dev/update-vector-store.js | 193 +++++++++++++++++++++++++++++++++++++ 1 file changed, 193 insertions(+) create mode 100644 dev/update-vector-store.js diff --git a/dev/update-vector-store.js b/dev/update-vector-store.js new file mode 100644 index 00000000..66b93a81 --- /dev/null +++ b/dev/update-vector-store.js @@ -0,0 +1,193 @@ +/** + * Incremental Vector Store (Qdrant) Update Script + * + * Updates the Qdrant vector store with only the changed dictionary words, + * rather than re-seeding the entire collection. + * + * Usage: + * node dev/update-vector-store.js --upsert slug1,slug2 --delete slug3,slug4 + * + * Flags: + * --upsert Comma-separated slugs to add or update in the vector store. + * Fetches content from the live API, deletes old chunks for each + * slug, then adds new chunks. + * --delete Comma-separated slugs to remove from the vector store. 
+ * + * Required env vars: + * OPENAI_API_KEY, OPENAI_EMBEDDINGS_MODEL, QDRANT_URL, QDRANT_API_KEY + */ + +import fetch from "node-fetch"; +import { Document } from "langchain/document"; +import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"; +import { vectorStore } from "../apps/jai/index.js"; + +const PRODUCTION_API_BASE = "https://jargons.dev/api/v1/browse"; + +// --------------------------------------------------------------------------- +// CLI argument parsing +// --------------------------------------------------------------------------- + +function parseArgs(argv) { + const args = argv.slice(2); + const upsertSlugs = []; + const deleteSlugs = []; + + for (let i = 0; i < args.length; i++) { + if (args[i] === "--upsert" && args[i + 1]) { + upsertSlugs.push( + ...args[i + 1] + .split(",") + .map((s) => s.trim()) + .filter(Boolean), + ); + i++; + } else if (args[i] === "--delete" && args[i + 1]) { + deleteSlugs.push( + ...args[i + 1] + .split(",") + .map((s) => s.trim()) + .filter(Boolean), + ); + i++; + } + } + + return { upsertSlugs, deleteSlugs }; +} + +// --------------------------------------------------------------------------- +// Qdrant helpers +// --------------------------------------------------------------------------- + +/** + * Delete all existing vector points for a given word slug. + * Uses a Qdrant payload filter on `metadata.slug`. + */ +async function deletePointsBySlug(slug) { + await vectorStore.delete({ + filter: { + must: [ + { + key: "metadata.slug", + match: { value: slug }, + }, + ], + }, + }); +} + +/** + * Fetch a single word's data from the production API. + * Returns `{ slug, title, content }` or `null` if not found. 
+ */ +async function fetchWord(slug) { + const response = await fetch(`${PRODUCTION_API_BASE}/${slug}`); + if (!response.ok) { + if (response.status === 404) return null; + throw new Error( + `Failed to fetch word "${slug}": ${response.status} ${response.statusText}`, + ); + } + return response.json(); +} + +// --------------------------------------------------------------------------- +// Main +// --------------------------------------------------------------------------- + +const { upsertSlugs, deleteSlugs } = parseArgs(process.argv); + +if (upsertSlugs.length === 0 && deleteSlugs.length === 0) { + console.log("No slugs provided. Nothing to do."); + console.log( + "Usage: node dev/update-vector-store.js --upsert slug1,slug2 --delete slug3,slug4", + ); + process.exit(0); +} + +console.log("πŸš€ Starting incremental vector store update..."); +console.log("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"); + +if (upsertSlugs.length > 0) { + console.log(`πŸ“ Words to upsert: ${upsertSlugs.join(", ")}`); +} +if (deleteSlugs.length > 0) { + console.log(`πŸ—‘οΈ Words to delete: ${deleteSlugs.join(", ")}`); +} + +const splitter = new RecursiveCharacterTextSplitter({ + chunkSize: 1000, + chunkOverlap: 200, +}); + +let upsertedCount = 0; +let deletedCount = 0; +let failedCount = 0; + +// ---- Handle upserts (add / update) ---------------------------------------- + +for (const slug of upsertSlugs) { + try { + console.log(`\nπŸ”„ Processing upsert for "${slug}"...`); + + // 1. Remove any existing chunks for this word + console.log(` Deleting old chunks for "${slug}"...`); + await deletePointsBySlug(slug); + + // 2. Fetch the latest content from the deployed site + const word = await fetchWord(slug); + if (!word) { + console.warn(` ⚠️ Word "${slug}" not found in production API, skipping.`); + failedCount++; + continue; + } + + // 3. 
Create a LangChain Document with slug metadata + const doc = new Document({ + pageContent: `${word.title}\n\n${word.content}`, + metadata: { slug: word.slug }, + }); + + // 4. Split into chunks (preserving metadata on each chunk) + const chunks = await splitter.splitDocuments([doc]); + console.log(` Split into ${chunks.length} chunk(s).`); + + // 5. Add to vector store + await vectorStore.addDocuments(chunks); + console.log(` βœ… Upserted "${slug}" (${chunks.length} chunks)`); + upsertedCount++; + } catch (error) { + console.error(` ❌ Failed to upsert "${slug}":`, error.message); + failedCount++; + } +} + +// ---- Handle deletes -------------------------------------------------------- + +for (const slug of deleteSlugs) { + try { + console.log(`\nπŸ—‘οΈ Deleting "${slug}" from vector store...`); + await deletePointsBySlug(slug); + console.log(` βœ… Deleted "${slug}"`); + deletedCount++; + } catch (error) { + console.error(` ❌ Failed to delete "${slug}":`, error.message); + failedCount++; + } +} + +// ---- Summary --------------------------------------------------------------- + +console.log("\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"); +console.log( + `✨ Done! Upserted: ${upsertedCount}, Deleted: ${deletedCount}, Failed: ${failedCount}`, +); + +if (failedCount > 0) { + console.error("πŸ’₯ Some operations failed. 
Check the logs above."); + process.exit(1); +} + +console.log("πŸŽ‰ Vector store update completed successfully!"); +process.exit(0); From 5c074343a01d1b9d52c84c95c2d3ac5dd2e67b8e Mon Sep 17 00:00:00 2001 From: babblebey Date: Tue, 3 Mar 2026 22:11:43 +0100 Subject: [PATCH 3/9] feat: add update scripts for vector store management --- package.json | 2 ++ 1 file changed, 2 insertions(+) diff --git a/package.json b/package.json index 51f50179..0f92921d 100644 --- a/package.json +++ b/package.json @@ -18,6 +18,8 @@ "test:coverage": "vitest run --coverage", "setup": "node dev/setup.js", "seed:jai": "node --env-file=.env dev/seed-vector-store.js", + "update:jai": "node --env-file=.env dev/update-vector-store.js", + "update:jai:ci": "node dev/update-vector-store.js", "ping:qdrant": "node --env-file=.env dev/ping-qdrant-cluster.js", "ping:qdrant:ci": "node dev/ping-qdrant-cluster.js" }, From 47516541a55cde703eaa637bb6e9f054b46e0010 Mon Sep 17 00:00:00 2001 From: babblebey Date: Tue, 3 Mar 2026 22:12:05 +0100 Subject: [PATCH 4/9] feat: add workflow for updating vector store with manual and automatic triggers --- .github/workflows/update-vector-store.yml | 137 ++++++++++++++++++++++ 1 file changed, 137 insertions(+) create mode 100644 .github/workflows/update-vector-store.yml diff --git a/.github/workflows/update-vector-store.yml b/.github/workflows/update-vector-store.yml new file mode 100644 index 00000000..c554c549 --- /dev/null +++ b/.github/workflows/update-vector-store.yml @@ -0,0 +1,137 @@ +name: Update Vector Store (Qdrant) + +on: + deployment_status: + + # Allow manual triggering with custom slugs + workflow_dispatch: + inputs: + upsert_slugs: + description: "Comma-separated slugs to upsert (e.g. api,closure)" + required: false + delete_slugs: + description: "Comma-separated slugs to delete (e.g. 
old-term)" + required: false + +jobs: + update-vector-store: + runs-on: ubuntu-latest + + # Only run on successful production deployments (or manual trigger) + if: > + github.event_name == 'workflow_dispatch' || + ( + github.event.deployment_status.state == 'success' && + github.event.deployment.environment == 'Production' + ) + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + # Fetch enough history to diff against the previous commit + fetch-depth: 2 + + - name: Detect changed dictionary files + id: detect-changes + if: github.event_name != 'workflow_dispatch' + run: | + echo "Detecting dictionary file changes..." + + UPSERT_SLUGS="" + DELETE_SLUGS="" + + # Compare HEAD with its parent to find changed dictionary files + while IFS=$'\t' read -r status file; do + # Only process files in src/content/dictionary/ + if [[ "$file" == src/content/dictionary/*.mdx ]]; then + # Extract slug from filename (e.g. src/content/dictionary/api.mdx -> api) + slug=$(basename "$file" .mdx) + + if [[ "$status" == "D" ]]; then + # File was deleted + if [ -n "$DELETE_SLUGS" ]; then + DELETE_SLUGS="$DELETE_SLUGS,$slug" + else + DELETE_SLUGS="$slug" + fi + else + # File was added (A), modified (M), or renamed (R*) + if [ -n "$UPSERT_SLUGS" ]; then + UPSERT_SLUGS="$UPSERT_SLUGS,$slug" + else + UPSERT_SLUGS="$slug" + fi + fi + fi + done < <(git diff --name-status HEAD~1 -- src/content/dictionary/) + + echo "upsert_slugs=$UPSERT_SLUGS" >> "$GITHUB_OUTPUT" + echo "delete_slugs=$DELETE_SLUGS" >> "$GITHUB_OUTPUT" + + if [ -z "$UPSERT_SLUGS" ] && [ -z "$DELETE_SLUGS" ]; then + echo "has_changes=false" >> "$GITHUB_OUTPUT" + echo "No dictionary changes detected. Skipping update." 
+ else + echo "has_changes=true" >> "$GITHUB_OUTPUT" + echo "Upsert slugs: $UPSERT_SLUGS" + echo "Delete slugs: $DELETE_SLUGS" + fi + + - name: Resolve slugs + id: resolve-slugs + run: | + if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then + echo "upsert=${{ github.event.inputs.upsert_slugs }}" >> "$GITHUB_OUTPUT" + echo "delete=${{ github.event.inputs.delete_slugs }}" >> "$GITHUB_OUTPUT" + + if [ -z "${{ github.event.inputs.upsert_slugs }}" ] && [ -z "${{ github.event.inputs.delete_slugs }}" ]; then + echo "skip=true" >> "$GITHUB_OUTPUT" + else + echo "skip=false" >> "$GITHUB_OUTPUT" + fi + else + echo "upsert=${{ steps.detect-changes.outputs.upsert_slugs }}" >> "$GITHUB_OUTPUT" + echo "delete=${{ steps.detect-changes.outputs.delete_slugs }}" >> "$GITHUB_OUTPUT" + echo "skip=${{ steps.detect-changes.outputs.has_changes != 'true' }}" >> "$GITHUB_OUTPUT" + fi + + - name: Skip if no changes + if: steps.resolve-slugs.outputs.skip == 'true' + run: echo "βœ… No dictionary changes to process. Skipping." 
+ + - name: Setup Node.js + if: steps.resolve-slugs.outputs.skip != 'true' + uses: actions/setup-node@v4 + with: + node-version: "20" + cache: "npm" + + - name: Install dependencies + if: steps.resolve-slugs.outputs.skip != 'true' + run: npm ci + + - name: Update vector store + if: steps.resolve-slugs.outputs.skip != 'true' + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + OPENAI_EMBEDDINGS_MODEL: ${{ secrets.OPENAI_EMBEDDINGS_MODEL }} + QDRANT_URL: ${{ secrets.QDRANT_URL }} + QDRANT_API_KEY: ${{ secrets.QDRANT_API_KEY }} + run: | + ARGS="" + + if [ -n "${{ steps.resolve-slugs.outputs.upsert }}" ]; then + ARGS="$ARGS --upsert ${{ steps.resolve-slugs.outputs.upsert }}" + fi + + if [ -n "${{ steps.resolve-slugs.outputs.delete }}" ]; then + ARGS="$ARGS --delete ${{ steps.resolve-slugs.outputs.delete }}" + fi + + echo "Running: npm run update:jai:ci -- $ARGS" + npm run update:jai:ci -- $ARGS + + - name: Update successful + if: steps.resolve-slugs.outputs.skip != 'true' + run: echo "βœ… Vector store update completed successfully" From c9b8c04b8b4a9e8c7b65e1eb5cf2f334fd897078 Mon Sep 17 00:00:00 2001 From: babblebey Date: Tue, 3 Mar 2026 22:12:23 +0100 Subject: [PATCH 5/9] feat: enhance README with detailed update vector store script and usage instructions --- dev/README.md | 137 +++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 125 insertions(+), 12 deletions(-) diff --git a/dev/README.md b/dev/README.md index 470f133a..fe4652e8 100644 --- a/dev/README.md +++ b/dev/README.md @@ -40,7 +40,6 @@ Before running this script, ensure you have: - All dependencies installed (`npm ci`) - `OPENAI_API_KEY`, `QDRANT_URL` and `QDRANT_API_KEY` environment variables properly configured in your `.env` file - Network access to fetch from jargons.dev API -- Sufficient disk space for temporary dictionary file ### Usage @@ -53,19 +52,17 @@ npm run seed:jai The script performs these steps to prepare ✨jAI's knowledge base: 1. 
**Data Fetching**: Downloads the complete dictionary from `https://jargons.dev/api/v1/browse` -2. **File Processing**: Saves data locally and loads it using LangChain's JSONLoader -3. **Document Splitting**: Breaks content into optimally-sized chunks (1000 chars with 200 overlap) +2. **Document Creation**: Creates LangChain `Document` objects directly from the API response, attaching `slug` metadata to each word for future incremental updates +3. **Document Splitting**: Breaks content into optimally-sized chunks (1000 chars with 200 overlap), preserving the slug metadata on every chunk 4. **Vector Store Population**: Adds processed documents to ✨jAI's vector store in batches of 100 -5. **Cleanup**: Removes temporary files and provides completion summary ### Technical Implementation The script leverages several key technologies: -- **LangChain JSONLoader**: Extracts title and content fields from dictionary entries +- **LangChain Document**: Creates documents directly from API data with `metadata.slug` for traceability - **RecursiveCharacterTextSplitter**: Intelligently splits text while preserving context - **Batch Processing**: Prevents memory issues and provides progress feedback -- **File System Operations**: Handles temporary file creation and cleanup ### Configuration Options @@ -85,25 +82,141 @@ Key parameters that can be adjusted: The script includes robust error handling for: - Network connectivity issues during API calls -- File system errors during temporary file operations - Vector store connection problems - Memory management during large batch processing ### Example Output ``` -Saved the dictionary file to /path/to/dev/dictionary.json -Loaded 500 documents -Split 1250 documents +Fetched 500 words from the API +Created 500 documents +Split into 1250 chunks Added batch 1 of 13 (100 documents) to the vector store Added batch 2 of 13 (100 documents) to the vector store ... 
Added 1250 splits to the vector store -Cleaned up the dictionary file at /path/to/dev/dictionary.json ``` Once completed, ✨jAI will have access to the processed dictionary content and can provide intelligent responses about software engineering terms. +> **Note:** After running a full seed, all vector points will include `metadata.slug`, which is required for incremental updates via the [Update Vector Store Script](#update-vector-store-script) to work correctly. + +## Update Vector Store Script + +This script performs **incremental updates** to ✨jAI's vector store when dictionary words are added, modified, or removed. Instead of re-seeding the entire collection, it targets only the changed words β€” making it fast and efficient for CI/CD use after new words are merged. + +### When to Use + +This script is primarily run automatically via the **Update Vector Store** GitHub Actions workflow when a new word PR is merged and the Vercel production deployment succeeds. You can also run it manually when you need to: +- Add or update specific words in the vector store +- Remove deleted words from the vector store +- Fix vector store entries for particular terms + +### Prerequisites + +Before running this script, ensure you have: +- All dependencies installed (`npm ci`) +- `OPENAI_API_KEY`, `OPENAI_EMBEDDINGS_MODEL`, `QDRANT_URL` and `QDRANT_API_KEY` environment variables properly configured in your `.env` file +- Network access to fetch from the jargons.dev production API +- The vector store has been initially seeded with `metadata.slug` on all points (via `npm run seed:jai`) + +### Usage + +**Local Development:** +```bash +npm run update:jai -- --upsert slug1,slug2 --delete slug3,slug4 +``` + +**CI/CD (without .env file):** +```bash +npm run update:jai:ci -- --upsert slug1,slug2 --delete slug3 +``` + +### Flags + +- `--upsert ` β€” Comma-separated slugs of words to add or update. 
For each slug, the script deletes any existing chunks in Qdrant (by `metadata.slug` filter), fetches the latest content from the production API, splits it into chunks, and adds them to the vector store. +- `--delete ` β€” Comma-separated slugs of words to remove. Deletes all chunks matching the slug from Qdrant. + +Both flags are optional, but at least one must be provided for the script to do anything. + +### How It Works + +The script performs these steps for each word: + +**For upserts (add/update):** +1. **Delete Old Chunks**: Removes existing vector points matching `metadata.slug` via a Qdrant filter +2. **Fetch Latest Content**: Downloads the word from `https://jargons.dev/api/v1/browse/{slug}` +3. **Create Document**: Builds a LangChain `Document` with `metadata.slug` for traceability +4. **Split into Chunks**: Breaks content into optimally-sized chunks (1000 chars with 200 overlap) +5. **Add to Vector Store**: Upserts the new chunks into Qdrant + +**For deletes:** +1. **Delete Chunks**: Removes all vector points matching `metadata.slug` via a Qdrant filter + +### Technical Implementation + +The script leverages several key technologies: + +- **LangChain Document**: Creates documents with `metadata.slug` for targeted updates +- **Qdrant Filter-based Deletion**: Uses `vectorStore.delete({ filter })` with a `metadata.slug` match condition to precisely target existing chunks for a word +- **RecursiveCharacterTextSplitter**: Same chunking config as the seed script (1000/200) for consistency +- **Production API**: Fetches from the deployed site to ensure the vector store matches the live content + +### Configuration Options + +Required environment variables: + +- **QDRANT_URL**: Your Qdrant cluster endpoint (e.g., `https://your-cluster.gcp.cloud.qdrant.io`) +- **QDRANT_API_KEY**: Your Qdrant cluster API key for authentication +- **OPENAI_API_KEY**: Your OpenAI API Key for generating embeddings +- **OPENAI_EMBEDDINGS_MODEL**: The embeddings model to use (e.g., 
`text-embedding-3-small`) + +### Automated via GitHub Actions + +The **Update Vector Store** workflow (`.github/workflows/update-vector-store.yml`) runs this script automatically: + +- **Trigger**: Fires on `deployment_status` events β€” specifically when Vercel reports a successful **Production** deployment +- **Change Detection**: Diffs `HEAD~1` to identify added, modified, or deleted `.mdx` files in `src/content/dictionary/` +- **Skip Logic**: Exits early if no dictionary files were changed in the commit +- **Manual Trigger**: Can also be run manually from the GitHub Actions tab with custom `upsert_slugs` and `delete_slugs` inputs +- **Required Secrets**: `OPENAI_API_KEY`, `OPENAI_EMBEDDINGS_MODEL`, `QDRANT_URL`, `QDRANT_API_KEY` + +### Error Handling + +The script includes robust error handling for: +- Missing or invalid CLI arguments (prints usage and exits gracefully) +- Words not found on the production API (404 β€” warns and continues with remaining slugs) +- Network connectivity issues +- Vector store connection and deletion failures +- Per-word error isolation (one failing slug doesn't block the others) +- Non-zero exit code if any operation fails + +### Example Output + +``` +πŸš€ Starting incremental vector store update... +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ +πŸ“ Words to upsert: api, closure +πŸ—‘οΈ Words to delete: old-term + +πŸ”„ Processing upsert for "api"... + Deleting old chunks for "api"... + Split into 3 chunk(s). + βœ… Upserted "api" (3 chunks) + +πŸ”„ Processing upsert for "closure"... + Deleting old chunks for "closure"... + Split into 2 chunk(s). + βœ… Upserted "closure" (2 chunks) + +πŸ—‘οΈ Deleting "old-term" from vector store... + βœ… Deleted "old-term" + +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ +✨ Done! Upserted: 2, Deleted: 1, Failed: 0 +πŸŽ‰ Vector store update completed successfully! 
+``` + ## Vector Store Cluster Ping Script This script performs a lightweight health check on the Vector Store (Qdrant) cluster to keep it active and prevent automatic deletion due to inactivity. It's designed to be run both locally for testing and automatically via GitHub Actions. @@ -164,7 +277,7 @@ Required environment variables: ### Automated Scheduling The script is automatically run via GitHub Actions: -- **Schedule**: Every Sunday at 2 AM UTC +- **Schedule**: Every Sunday and Wednesday at midnight UTC - **Manual Trigger**: Can be run manually from GitHub Actions tab - **Purpose**: Prevents cluster deletion due to inactivity From 47dd3bba9f5c0b817bc0f2b600e4c92d06cac15b Mon Sep 17 00:00:00 2001 From: babblebey Date: Tue, 3 Mar 2026 22:18:32 +0100 Subject: [PATCH 6/9] fix: update README to reflect change from secret to variable for OPENAI_EMBEDDINGS_MODEL --- .github/workflows/update-vector-store.yml | 2 +- dev/README.md | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/update-vector-store.yml b/.github/workflows/update-vector-store.yml index c554c549..24be0c0e 100644 --- a/.github/workflows/update-vector-store.yml +++ b/.github/workflows/update-vector-store.yml @@ -115,7 +115,7 @@ jobs: if: steps.resolve-slugs.outputs.skip != 'true' env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - OPENAI_EMBEDDINGS_MODEL: ${{ secrets.OPENAI_EMBEDDINGS_MODEL }} + OPENAI_EMBEDDINGS_MODEL: ${{ vars.OPENAI_EMBEDDINGS_MODEL }} QDRANT_URL: ${{ secrets.QDRANT_URL }} QDRANT_API_KEY: ${{ secrets.QDRANT_API_KEY }} run: | diff --git a/dev/README.md b/dev/README.md index fe4652e8..c984e622 100644 --- a/dev/README.md +++ b/dev/README.md @@ -179,7 +179,8 @@ The **Update Vector Store** workflow (`.github/workflows/update-vector-store.yml - **Change Detection**: Diffs `HEAD~1` to identify added, modified, or deleted `.mdx` files in `src/content/dictionary/` - **Skip Logic**: Exits early if no dictionary files were changed in the commit - **Manual 
Trigger**: Can also be run manually from the GitHub Actions tab with custom `upsert_slugs` and `delete_slugs` inputs -- **Required Secrets**: `OPENAI_API_KEY`, `OPENAI_EMBEDDINGS_MODEL`, `QDRANT_URL`, `QDRANT_API_KEY` +- **Required Secrets**: `OPENAI_API_KEY`, `QDRANT_URL`, `QDRANT_API_KEY` +- **Required Variables**: `OPENAI_EMBEDDINGS_MODEL` ### Error Handling From 7752d94c493595fa345f849248626dcdc5aaff06 Mon Sep 17 00:00:00 2001 From: babblebey Date: Tue, 3 Mar 2026 23:05:50 +0100 Subject: [PATCH 7/9] feat: add PR label gate to update vector store workflow and enhance README --- .github/workflows/update-vector-store.yml | 137 ++++++++++++++-------- dev/README.md | 5 +- 2 files changed, 88 insertions(+), 54 deletions(-) diff --git a/.github/workflows/update-vector-store.yml b/.github/workflows/update-vector-store.yml index 24be0c0e..e7617b99 100644 --- a/.github/workflows/update-vector-store.yml +++ b/.github/workflows/update-vector-store.yml @@ -26,93 +26,126 @@ jobs: ) steps: + # ── Gate: Check that the merged PR has a dictionary label ────────── + - name: Check PR labels + if: github.event_name != 'workflow_dispatch' + id: pr-check + uses: actions/github-script@v7 + with: + script: | + const sha = context.payload.deployment.sha; + + // Find PRs associated with this deployment commit + const { data: prs } = await github.rest.repos.listPullRequestsAssociatedWithCommit({ + owner: context.repo.owner, + repo: context.repo.repo, + commit_sha: sha, + }); + + // Find the merged PR targeting main + const mergedPR = prs.find(pr => pr.merged_at && pr.base.ref === 'main'); + + if (!mergedPR) { + core.info('No merged PR found for this deployment. 
Skipping.'); + core.setOutput('should_continue', 'false'); + return; + } + + const labels = mergedPR.labels.map(l => l.name); + core.info(`PR #${mergedPR.number}: ${mergedPR.title}`); + core.info(`Labels: ${labels.join(', ')}`); + + const requiredLabels = ['πŸ“–edit-word', 'πŸ“–new-word']; + const hasRequiredLabel = labels.some(l => requiredLabels.includes(l)); + + if (!hasRequiredLabel) { + core.info(`PR does not have required labels (${requiredLabels.join(', ')}). Skipping.`); + core.setOutput('should_continue', 'false'); + return; + } + + core.info('βœ… PR has required label. Proceeding with update.'); + core.setOutput('should_continue', 'true'); + + - name: Skip β€” PR lacks required labels + if: github.event_name != 'workflow_dispatch' && steps.pr-check.outputs.should_continue != 'true' + run: | + echo "⏭️ Skipping: deployment is not from a πŸ“–new-word or πŸ“–edit-word PR." + + # ── Checkout & detect changed dictionary files ───────────────────── - name: Checkout repository + if: github.event_name == 'workflow_dispatch' || steps.pr-check.outputs.should_continue == 'true' uses: actions/checkout@v4 with: - # Fetch enough history to diff against the previous commit + ref: ${{ github.event.deployment.sha || github.sha }} fetch-depth: 2 - name: Detect changed dictionary files - id: detect-changes - if: github.event_name != 'workflow_dispatch' + if: github.event_name == 'workflow_dispatch' || steps.pr-check.outputs.should_continue == 'true' + id: detect run: | - echo "Detecting dictionary file changes..." + # For manual triggers, use the provided inputs directly + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + UPSERT="${{ github.event.inputs.upsert_slugs }}" + DELETE="${{ github.event.inputs.delete_slugs }}" + + if [ -z "$UPSERT" ] && [ -z "$DELETE" ]; then + echo "has_changes=false" >> "$GITHUB_OUTPUT" + echo "No slugs provided for manual trigger." 
+ else + echo "upsert=$UPSERT" >> "$GITHUB_OUTPUT" + echo "delete=$DELETE" >> "$GITHUB_OUTPUT" + echo "has_changes=true" >> "$GITHUB_OUTPUT" + fi + exit 0 + fi + # For deployment triggers, diff against the parent commit + echo "Detecting dictionary file changes..." UPSERT_SLUGS="" DELETE_SLUGS="" - # Compare HEAD with its parent to find changed dictionary files while IFS=$'\t' read -r status file; do - # Only process files in src/content/dictionary/ if [[ "$file" == src/content/dictionary/*.mdx ]]; then - # Extract slug from filename (e.g. src/content/dictionary/api.mdx -> api) slug=$(basename "$file" .mdx) if [[ "$status" == "D" ]]; then - # File was deleted - if [ -n "$DELETE_SLUGS" ]; then - DELETE_SLUGS="$DELETE_SLUGS,$slug" - else - DELETE_SLUGS="$slug" - fi + DELETE_SLUGS="${DELETE_SLUGS:+$DELETE_SLUGS,}$slug" else - # File was added (A), modified (M), or renamed (R*) - if [ -n "$UPSERT_SLUGS" ]; then - UPSERT_SLUGS="$UPSERT_SLUGS,$slug" - else - UPSERT_SLUGS="$slug" - fi + UPSERT_SLUGS="${UPSERT_SLUGS:+$UPSERT_SLUGS,}$slug" fi fi done < <(git diff --name-status HEAD~1 -- src/content/dictionary/) - echo "upsert_slugs=$UPSERT_SLUGS" >> "$GITHUB_OUTPUT" - echo "delete_slugs=$DELETE_SLUGS" >> "$GITHUB_OUTPUT" - if [ -z "$UPSERT_SLUGS" ] && [ -z "$DELETE_SLUGS" ]; then echo "has_changes=false" >> "$GITHUB_OUTPUT" - echo "No dictionary changes detected. Skipping update." + echo "No dictionary file changes detected. Skipping update." 
else + echo "upsert=$UPSERT_SLUGS" >> "$GITHUB_OUTPUT" + echo "delete=$DELETE_SLUGS" >> "$GITHUB_OUTPUT" echo "has_changes=true" >> "$GITHUB_OUTPUT" echo "Upsert slugs: $UPSERT_SLUGS" echo "Delete slugs: $DELETE_SLUGS" fi - - name: Resolve slugs - id: resolve-slugs - run: | - if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then - echo "upsert=${{ github.event.inputs.upsert_slugs }}" >> "$GITHUB_OUTPUT" - echo "delete=${{ github.event.inputs.delete_slugs }}" >> "$GITHUB_OUTPUT" - - if [ -z "${{ github.event.inputs.upsert_slugs }}" ] && [ -z "${{ github.event.inputs.delete_slugs }}" ]; then - echo "skip=true" >> "$GITHUB_OUTPUT" - else - echo "skip=false" >> "$GITHUB_OUTPUT" - fi - else - echo "upsert=${{ steps.detect-changes.outputs.upsert_slugs }}" >> "$GITHUB_OUTPUT" - echo "delete=${{ steps.detect-changes.outputs.delete_slugs }}" >> "$GITHUB_OUTPUT" - echo "skip=${{ steps.detect-changes.outputs.has_changes != 'true' }}" >> "$GITHUB_OUTPUT" - fi - - - name: Skip if no changes - if: steps.resolve-slugs.outputs.skip == 'true' - run: echo "βœ… No dictionary changes to process. Skipping." + - name: Skip β€” no dictionary changes + if: steps.detect.outputs.has_changes != 'true' && (github.event_name == 'workflow_dispatch' || steps.pr-check.outputs.should_continue == 'true') + run: echo "⏭️ No dictionary changes to process. Skipping." 
+ # ── Run the incremental update ───────────────────────────────────── - name: Setup Node.js - if: steps.resolve-slugs.outputs.skip != 'true' + if: steps.detect.outputs.has_changes == 'true' uses: actions/setup-node@v4 with: node-version: "20" cache: "npm" - name: Install dependencies - if: steps.resolve-slugs.outputs.skip != 'true' + if: steps.detect.outputs.has_changes == 'true' run: npm ci - name: Update vector store - if: steps.resolve-slugs.outputs.skip != 'true' + if: steps.detect.outputs.has_changes == 'true' env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} OPENAI_EMBEDDINGS_MODEL: ${{ vars.OPENAI_EMBEDDINGS_MODEL }} @@ -121,17 +154,17 @@ jobs: run: | ARGS="" - if [ -n "${{ steps.resolve-slugs.outputs.upsert }}" ]; then - ARGS="$ARGS --upsert ${{ steps.resolve-slugs.outputs.upsert }}" + if [ -n "${{ steps.detect.outputs.upsert }}" ]; then + ARGS="$ARGS --upsert ${{ steps.detect.outputs.upsert }}" fi - if [ -n "${{ steps.resolve-slugs.outputs.delete }}" ]; then - ARGS="$ARGS --delete ${{ steps.resolve-slugs.outputs.delete }}" + if [ -n "${{ steps.detect.outputs.delete }}" ]; then + ARGS="$ARGS --delete ${{ steps.detect.outputs.delete }}" fi echo "Running: npm run update:jai:ci -- $ARGS" npm run update:jai:ci -- $ARGS - name: Update successful - if: steps.resolve-slugs.outputs.skip != 'true' + if: steps.detect.outputs.has_changes == 'true' run: echo "βœ… Vector store update completed successfully" diff --git a/dev/README.md b/dev/README.md index c984e622..855d9375 100644 --- a/dev/README.md +++ b/dev/README.md @@ -176,11 +176,12 @@ Required environment variables: The **Update Vector Store** workflow (`.github/workflows/update-vector-store.yml`) runs this script automatically: - **Trigger**: Fires on `deployment_status` events β€” specifically when Vercel reports a successful **Production** deployment +- **PR Label Gate**: Uses the GitHub API to find the merged PR associated with the deployment commit and checks for the `πŸ“–new-word` or `πŸ“–edit-word` 
labels. Deployments from PRs without these labels are skipped early (before any Node.js setup or dependency installation) - **Change Detection**: Diffs `HEAD~1` to identify added, modified, or deleted `.mdx` files in `src/content/dictionary/` - **Skip Logic**: Exits early if no dictionary files were changed in the commit -- **Manual Trigger**: Can also be run manually from the GitHub Actions tab with custom `upsert_slugs` and `delete_slugs` inputs +- **Manual Trigger**: Can also be run manually from the GitHub Actions tab with custom `upsert_slugs` and `delete_slugs` inputs (bypasses the label check) - **Required Secrets**: `OPENAI_API_KEY`, `QDRANT_URL`, `QDRANT_API_KEY` -- **Required Variables**: `OPENAI_EMBEDDINGS_MODEL` +- **Required Variables**: `OPENAI_EMBEDDINGS_MODEL` ### Error Handling From fc91102e041e2257eb3d5cbc7f0fe64b458e4ed7 Mon Sep 17 00:00:00 2001 From: Olabode Lawal-Shittabey Date: Wed, 4 Mar 2026 06:33:01 +0100 Subject: [PATCH 8/9] Update dev/update-vector-store.js Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- dev/update-vector-store.js | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/dev/update-vector-store.js b/dev/update-vector-store.js index 66b93a81..f1ed6430 100644 --- a/dev/update-vector-store.js +++ b/dev/update-vector-store.js @@ -131,28 +131,29 @@ for (const slug of upsertSlugs) { try { console.log(`\n🔄 Processing upsert for "${slug}"...`); - // 1. Remove any existing chunks for this word - console.log(` Deleting old chunks for "${slug}"...`); - await deletePointsBySlug(slug); - - // 2. Fetch the latest content from the deployed site + // 1. Fetch the latest content from the deployed site const word = await fetchWord(slug); if (!word) { - console.warn(` ⚠️ Word "${slug}" not found in production API, skipping.`); + console.warn( + ` ⚠️ Word "${slug}" not found in production API, skipping without deleting existing vectors.`, + ); failedCount++; continue; } - // 3.
Create a LangChain Document with slug metadata + // 2. Create a LangChain Document with slug metadata const doc = new Document({ pageContent: `${word.title}\n\n${word.content}`, metadata: { slug: word.slug }, }); - // 4. Split into chunks (preserving metadata on each chunk) + // 3. Split into chunks (preserving metadata on each chunk) const chunks = await splitter.splitDocuments([doc]); console.log(` Split into ${chunks.length} chunk(s).`); + // 4. Remove any existing chunks for this word, now that new chunks are ready + console.log(` Deleting old chunks for "${slug}"...`); + await deletePointsBySlug(slug); // 5. Add to vector store await vectorStore.addDocuments(chunks); console.log(` ✅ Upserted "${slug}" (${chunks.length} chunks)`); From 9845d73a5477db7390a04f527c84f611f9822839 Mon Sep 17 00:00:00 2001 From: babblebey Date: Wed, 4 Mar 2026 06:37:42 +0100 Subject: [PATCH 9/9] fix: enhance error handling for CLI arguments in update-vector-store script --- dev/README.md | 3 ++- dev/update-vector-store.js | 28 ++++++++++++++++++++++++++-- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/dev/README.md b/dev/README.md index 855d9375..33e71be7 100644 --- a/dev/README.md +++ b/dev/README.md @@ -186,7 +186,8 @@ The **Update Vector Store** workflow (`.github/workflows/update-vector-store.yml ### Error Handling The script includes robust error handling for: -- Missing or invalid CLI arguments (prints usage and exits gracefully) +- Unknown flags or flags missing required values (prints an error with usage instructions and exits with code 1) +- No slugs provided (prints usage and exits gracefully with code 0) - Words not found on the production API (404 — warns and continues with remaining slugs) - Network connectivity issues - Vector store connection and deletion failures diff --git a/dev/update-vector-store.js b/dev/update-vector-store.js index f1ed6430..35a9573f 100644 --- a/dev/update-vector-store.js +++ b/dev/update-vector-store.js @@ -34,7 +34,16 @@
function parseArgs(argv) { const deleteSlugs = []; for (let i = 0; i < args.length; i++) { - if (args[i] === "--upsert" && args[i + 1]) { + if (args[i] === "--upsert") { + if (!args[i + 1] || args[i + 1].startsWith("--")) { + console.error( + "❌ Error: --upsert flag requires a comma-separated list of slugs.", + ); + console.error( + "Usage: node dev/update-vector-store.js --upsert slug1,slug2 --delete slug3,slug4", + ); + process.exit(1); + } upsertSlugs.push( ...args[i + 1] .split(",") @@ -42,7 +51,16 @@ function parseArgs(argv) { .filter(Boolean), ); i++; - } else if (args[i] === "--delete" && args[i + 1]) { + } else if (args[i] === "--delete") { + if (!args[i + 1] || args[i + 1].startsWith("--")) { + console.error( + "❌ Error: --delete flag requires a comma-separated list of slugs.", + ); + console.error( + "Usage: node dev/update-vector-store.js --upsert slug1,slug2 --delete slug3,slug4", + ); + process.exit(1); + } deleteSlugs.push( ...args[i + 1] .split(",") @@ -50,6 +68,12 @@ function parseArgs(argv) { .filter(Boolean), ); i++; + } else { + console.error(`❌ Error: Unknown argument "${args[i]}".`); + console.error( + "Usage: node dev/update-vector-store.js --upsert slug1,slug2 --delete slug3,slug4", + ); + process.exit(1); + } }