Improve the efficiency of expiring runs in batch, and make sure that runs expired by the TTL system are properly removed from their queues and that those queues are rebalanced afterwards

ericallam · ericallam · commit 3ee918088e9e · 2026-02-10T16:09:48.000Z
diff --git a/apps/webapp/test/engine/triggerTask.test.ts b/apps/webapp/test/engine/triggerTask.test.ts
@@ -40,7 +40,7 @@ import { RunEngineTriggerTaskService } from "../../app/runEngine/services/trigge
 import { promiseWithResolvers } from "@trigger.dev/core";
 import { setTimeout } from "node:timers/promises";
 
-vi.setConfig({ testTimeout: 30_000 }); // 30 seconds timeout
+vi.setConfig({ testTimeout: 60_000 }); // 60 seconds timeout
 
 class MockPayloadProcessor implements PayloadProcessor {
   async process(request: TriggerTaskRequest): Promise<IOPacket> {
@@ -78,9 +78,9 @@ class MockTraceEventConcern implements TraceEventConcern {
         spanId: "test",
         traceContext: {},
         traceparent: undefined,
-        setAttribute: () => {},
-        failWithError: () => {},
-        stop: () => {},
+        setAttribute: () => { },
+        failWithError: () => { },
+        stop: () => { },
       },
       "test"
     );
@@ -103,9 +103,9 @@ class MockTraceEventConcern implements TraceEventConcern {
         spanId: "test",
         traceContext: {},
         traceparent: undefined,
-        setAttribute: () => {},
-        failWithError: () => {},
-        stop: () => {},
+        setAttribute: () => { },
+        failWithError: () => { },
+        stop: () => { },
       },
       "test"
     );
@@ -128,9 +128,9 @@ class MockTraceEventConcern implements TraceEventConcern {
         spanId: "test",
         traceContext: {},
         traceparent: undefined,
-        setAttribute: () => {},
-        failWithError: () => {},
-        stop: () => {},
+        setAttribute: () => { },
+        failWithError: () => { },
+        stop: () => { },
       },
       "test"
     );
diff --git a/internal-packages/run-engine/src/batch-queue/completionTracker.ts b/internal-packages/run-engine/src/batch-queue/completionTracker.ts
@@ -45,9 +45,9 @@ export class BatchCompletionTracker {
   }) {
     this.redis = createRedisClient(options.redis);
     this.logger = options.logger ?? {
-      debug: () => {},
-      info: () => {},
-      error: () => {},
+      debug: () => { },
+      info: () => { },
+      error: () => { },
     };
 
     this.#registerCommands();
@@ -109,26 +109,6 @@ export class BatchCompletionTracker {
     return JSON.parse(metaJson) as BatchMeta;
   }
 
-  /**
-   * Update the runCount in batch metadata.
-   * Used when items are skipped due to queue limits.
-   */
-  async updateRunCount(batchId: string, newRunCount: number): Promise<void> {
-    const meta = await this.getMeta(batchId);
-    if (!meta) {
-      this.logger.error("Cannot update runCount: batch metadata not found", { batchId });
-      return;
-    }
-
-    const updatedMeta: BatchMeta = {
-      ...meta,
-      runCount: newRunCount,
-    };
-
-    await this.storeMeta(batchId, updatedMeta);
-    this.logger.debug("Updated batch runCount", { batchId, oldRunCount: meta.runCount, newRunCount });
-  }
-
   // ============================================================================
   // Success/Failure Recording (Idempotent)
   // ============================================================================
diff --git a/internal-packages/run-engine/src/batch-queue/index.ts b/internal-packages/run-engine/src/batch-queue/index.ts
@@ -183,17 +183,17 @@ export class BatchQueue {
       // so we don't need the DLQ - we just need the retry scheduling.
       ...(options.retry
         ? {
-            retry: {
-              strategy: new ExponentialBackoffRetry({
-                maxAttempts: options.retry.maxAttempts,
-                minTimeoutInMs: options.retry.minTimeoutInMs ?? 1_000,
-                maxTimeoutInMs: options.retry.maxTimeoutInMs ?? 30_000,
-                factor: options.retry.factor ?? 2,
-                randomize: options.retry.randomize ?? true,
-              }),
-              deadLetterQueue: false,
-            },
-          }
+          retry: {
+            strategy: new ExponentialBackoffRetry({
+              maxAttempts: options.retry.maxAttempts,
+              minTimeoutInMs: options.retry.minTimeoutInMs ?? 1_000,
+              maxTimeoutInMs: options.retry.maxTimeoutInMs ?? 30_000,
+              factor: options.retry.factor ?? 2,
+              randomize: options.retry.randomize ?? true,
+            }),
+            deadLetterQueue: false,
+          },
+        }
         : {}),
       logger: this.logger,
       tracer: options.tracer,
@@ -395,14 +395,6 @@ export class BatchQueue {
     return this.completionTracker.getEnqueuedCount(batchId);
   }
 
-  /**
-   * Update the runCount for a batch.
-   * Used when items are skipped due to queue limits.
-   */
-  async updateRunCount(batchId: string, newRunCount: number): Promise<void> {
-    return this.completionTracker.updateRunCount(batchId, newRunCount);
-  }
-
   // ============================================================================
   // Public API - Query
   // ============================================================================
diff --git a/internal-packages/run-engine/src/engine/index.ts b/internal-packages/run-engine/src/engine/index.ts
@@ -190,11 +190,11 @@ export class RunEngine {
       ttlSystem: options.queue?.ttlSystem?.disabled
         ? undefined
         : {
-            shardCount: options.queue?.ttlSystem?.shardCount,
-            pollIntervalMs: options.queue?.ttlSystem?.pollIntervalMs,
-            batchSize: options.queue?.ttlSystem?.batchSize,
-            callback: this.#ttlExpiredCallback.bind(this),
-          },
+          shardCount: options.queue?.ttlSystem?.shardCount,
+          pollIntervalMs: options.queue?.ttlSystem?.pollIntervalMs,
+          batchSize: options.queue?.ttlSystem?.batchSize,
+          callback: this.#ttlExpiredCallback.bind(this),
+        },
     });
 
     this.worker = new Worker({
@@ -655,11 +655,11 @@ export class RunEngine {
               associatedWaitpoint:
                 resumeParentOnCompletion && parentTaskRunId
                   ? {
-                      create: this.waitpointSystem.buildRunAssociatedWaitpoint({
-                        projectId: environment.project.id,
-                        environmentId: environment.id,
-                      }),
-                    }
+                    create: this.waitpointSystem.buildRunAssociatedWaitpoint({
+                      projectId: environment.project.id,
+                      environmentId: environment.id,
+                    }),
+                  }
                   : undefined,
             },
           });
@@ -832,9 +832,9 @@ export class RunEngine {
         const waitpointData =
           resumeParentOnCompletion && parentTaskRunId
             ? this.waitpointSystem.buildRunAssociatedWaitpoint({
-                projectId: environment.project.id,
-                environmentId: environment.id,
-              })
+              projectId: environment.project.id,
+              environmentId: environment.id,
+            })
             : undefined;
 
         // Create the run in terminal SYSTEM_FAILURE status.
@@ -1340,14 +1340,6 @@ export class RunEngine {
     return this.batchQueue.getEnqueuedCount(batchId);
   }
 
-  /**
-   * Update the runCount for a batch.
-   * Used when items are skipped due to queue limits.
-   */
-  async updateBatchRunCount(batchId: string, newRunCount: number): Promise<void> {
-    return this.batchQueue.updateRunCount(batchId, newRunCount);
-  }
-
   async getWaitpoint({
     waitpointId,
     environmentId,
diff --git a/internal-packages/run-engine/src/engine/systems/ttlSystem.ts b/internal-packages/run-engine/src/engine/systems/ttlSystem.ts
@@ -1,11 +1,12 @@
 import { parseNaturalLanguageDuration } from "@trigger.dev/core/v3/isomorphic";
 import { TaskRunError } from "@trigger.dev/core/v3/schemas";
-import { PrismaClientOrTransaction, TaskRunStatus } from "@trigger.dev/database";
+import { Prisma, PrismaClientOrTransaction, TaskRunStatus } from "@trigger.dev/database";
 import { isExecuting } from "../statuses.js";
 import { getLatestExecutionSnapshot } from "./executionSnapshotSystem.js";
 import { SystemResources } from "./systems.js";
 import { WaitpointSystem } from "./waitpointSystem.js";
 import { startSpan } from "@internal/tracing";
+import pMap from "p-map";
 
 export type TtlSystemOptions = {
   resources: SystemResources;
@@ -169,7 +170,7 @@ export class TtlSystem {
         const expired: string[] = [];
         const skipped: { runId: string; reason: string }[] = [];
 
-        // Fetch all runs with their snapshots in a single query
+        // Fetch all runs in a single query (no snapshot data needed)
         const runs = await this.$.prisma.taskRun.findMany({
           where: { id: { in: runIds } },
           select: {
@@ -188,36 +189,13 @@ export class TtlSystem {
                 projectId: true,
               },
             },
-            executionSnapshots: {
-              orderBy: { createdAt: "desc" },
-              take: 1,
-              select: {
-                executionStatus: true,
-                environmentId: true,
-                environmentType: true,
-                projectId: true,
-                organizationId: true,
-              },
-            },
           },
         });
 
         // Filter runs that can be expired
         const runsToExpire: typeof runs = [];
 
         for (const run of runs) {
-          const latestSnapshot = run.executionSnapshots[0];
-
-          if (!latestSnapshot) {
-            skipped.push({ runId: run.id, reason: "no_snapshot" });
-            continue;
-          }
-
-          if (isExecuting(latestSnapshot.executionStatus)) {
-            skipped.push({ runId: run.id, reason: "executing" });
-            continue;
-          }
-
           if (run.status !== "PENDING") {
             skipped.push({ runId: run.id, reason: `status_${run.status}` });
             continue;
@@ -245,79 +223,70 @@ export class TtlSystem {
           return { expired, skipped };
         }
 
-        // Update all runs in a single batch
+        // Update all runs in a single SQL call (status, dates, and error JSON)
         const now = new Date();
         const runIdsToExpire = runsToExpire.map((r) => r.id);
 
-        await this.$.prisma.taskRun.updateMany({
-          where: { id: { in: runIdsToExpire } },
-          data: {
-            status: "EXPIRED" as TaskRunStatus,
-            completedAt: now,
-            expiredAt: now,
-            // Note: updateMany doesn't support nested writes, so we handle error and snapshots separately
-          },
-        });
+        const error: TaskRunError = {
+          type: "STRING_ERROR",
+          raw: "Run expired because the TTL was reached",
+        };
+
+        await this.$.prisma.$executeRaw`
+          UPDATE "TaskRun"
+          SET "status" = 'EXPIRED'::"TaskRunStatus",
+              "completedAt" = ${now},
+              "expiredAt" = ${now},
+              "updatedAt" = ${now},
+              "error" = ${JSON.stringify(error)}::jsonb
+          WHERE "id" IN (${Prisma.join(runIdsToExpire)})
+        `;
+
+        // Process each run: enqueue waitpoint completion jobs and emit events
+        await pMap(
+          runsToExpire,
+          async (run) => {
+            try {
+              // Enqueue a finishWaitpoint worker job for resilient waitpoint completion
+              if (run.associatedWaitpoint) {
+                await this.$.worker.enqueue({
+                  id: `finishWaitpoint.ttl.${run.associatedWaitpoint.id}`,
+                  job: "finishWaitpoint",
+                  payload: {
+                    waitpointId: run.associatedWaitpoint.id,
+                    error: JSON.stringify(error),
+                  },
+                });
+              }
+
+              // Emit event
+              this.$.eventBus.emit("runExpired", {
+                run: {
+                  id: run.id,
+                  spanId: run.spanId,
+                  ttl: run.ttl,
+                  taskEventStore: run.taskEventStore,
+                  createdAt: run.createdAt,
+                  updatedAt: now,
+                  completedAt: now,
+                  expiredAt: now,
+                  status: "EXPIRED" as TaskRunStatus,
+                },
+                time: now,
+                organization: { id: run.runtimeEnvironment.organizationId },
+                project: { id: run.runtimeEnvironment.projectId },
+                environment: { id: run.runtimeEnvironment.id },
+              });
 
-        // Create snapshots and set errors for each run (these require individual updates)
-        await Promise.all(
-          runsToExpire.map(async (run) => {
-            const latestSnapshot = run.executionSnapshots[0]!;
-            const error: TaskRunError = {
-              type: "STRING_ERROR",
-              raw: `Run expired because the TTL (${run.ttl}) was reached`,
-            };
-
-            // Update the error field (updateMany can't do JSON fields properly)
-            await this.$.prisma.taskRun.update({
-              where: { id: run.id },
-              data: { error },
-            });
-
-            // Create the snapshot
-            await this.$.prisma.taskRunExecutionSnapshot.create({
-              data: {
+              expired.push(run.id);
+            } catch (e) {
+              this.$.logger.error("Failed to process expired run", {
                 runId: run.id,
-                engine: "V2",
-                executionStatus: "FINISHED",
-                description: "Run was expired because the TTL was reached",
-                runStatus: "EXPIRED",
-                environmentId: latestSnapshot.environmentId,
-                environmentType: latestSnapshot.environmentType,
-                projectId: latestSnapshot.projectId,
-                organizationId: latestSnapshot.organizationId,
-              },
-            });
-
-            // Complete the waitpoint
-            if (run.associatedWaitpoint) {
-              await this.waitpointSystem.completeWaitpoint({
-                id: run.associatedWaitpoint.id,
-                output: { value: JSON.stringify(error), isError: true },
+                error: e,
               });
             }
-
-            // Emit event
-            this.$.eventBus.emit("runExpired", {
-              run: {
-                id: run.id,
-                spanId: run.spanId,
-                ttl: run.ttl,
-                taskEventStore: run.taskEventStore,
-                createdAt: run.createdAt,
-                updatedAt: now,
-                completedAt: now,
-                expiredAt: now,
-                status: "EXPIRED" as TaskRunStatus,
-              },
-              time: now,
-              organization: { id: run.runtimeEnvironment.organizationId },
-              project: { id: run.runtimeEnvironment.projectId },
-              environment: { id: run.runtimeEnvironment.id },
-            });
-
-            expired.push(run.id);
-          })
+          },
+          { concurrency: 10, stopOnError: false }
         );
 
         span.setAttribute("expiredCount", expired.length);
diff --git a/internal-packages/run-engine/src/engine/tests/ttl.test.ts b/internal-packages/run-engine/src/engine/tests/ttl.test.ts
diff --git a/internal-packages/run-engine/src/run-queue/index.ts b/internal-packages/run-engine/src/run-queue/index.ts