From a2e5d1617cb7fe8223d4591da9d97f584c3fcf61 Mon Sep 17 00:00:00 2001
From: Kenta Iwasaki
Date: Sun, 15 Feb 2026 02:07:49 +0800
Subject: [PATCH] fix(vercel-ai): prevent tool call span map memory leak

Tool calls were stored in a global map and only cleaned up on tool errors,
causing unbounded retention in tool-heavy apps (and potential OOMs when
inputs/outputs were recorded).

Store only span context in a bounded LRU cache and clean up on successful
tool results; add tests for caching/eviction.
---
NOTE(review): this patch was reconstructed from a whitespace-collapsed copy.
Line breaks, blank context lines and indentation (assumed 2 spaces) were
re-derived so that every hunk matches its @@ header counts and the diffstat.
Angle-bracket text had been stripped from the copy: the generic type
arguments on Map, LRUMap, Record and Array below were restored by inference,
and the author e-mail address could not be recovered. Verify against the
original commit before applying.

NOTE(review): _INTERNAL_getSpanForToolCallId keeps its name but now returns
a ToolCallSpanContext instead of a Span; callers that relied on the Span
return type need to be checked.

NOTE(review): a 'tool-error' part whose `error` is not an Error instance is
rejected by isToolErrorPart and skipped without cleanup, so its map entry is
only released by LRU eviction. Confirm that this is intended.

 .../core/src/tracing/vercel-ai/constants.ts   |  10 +-
 packages/core/src/tracing/vercel-ai/index.ts  |   5 +-
 packages/core/src/tracing/vercel-ai/types.ts  |   5 +
 packages/core/src/tracing/vercel-ai/utils.ts  |   8 +-
 .../vercel-ai-tool-call-span-map.test.ts      | 121 ++++++++++++++++
 .../tracing/vercelai/instrumentation.ts       | 130 ++++++++++--------
 6 files changed, 214 insertions(+), 65 deletions(-)
 create mode 100644 packages/core/test/lib/tracing/vercel-ai-tool-call-span-map.test.ts

diff --git a/packages/core/src/tracing/vercel-ai/constants.ts b/packages/core/src/tracing/vercel-ai/constants.ts
index 82baf0312d7c..d59080828de8 100644
--- a/packages/core/src/tracing/vercel-ai/constants.ts
+++ b/packages/core/src/tracing/vercel-ai/constants.ts
@@ -1,8 +1,12 @@
-import type { Span } from '../../types-hoist/span';
+import { LRUMap } from '../../utils/lru';
+import type { ToolCallSpanContext } from './types';
 
-// Global Map to track tool call IDs to their corresponding spans
+export const TOOL_CALL_SPAN_MAP_MAX_SIZE = 10_000;
+
+// Global LRU map to track tool call IDs to their corresponding span contexts.
 // This allows us to capture tool errors and link them to the correct span
-export const toolCallSpanMap = new Map<string, Span>();
+// without keeping full Span objects (and their potentially large attributes) alive.
+export const toolCallSpanMap = new LRUMap<string, ToolCallSpanContext>(TOOL_CALL_SPAN_MAP_MAX_SIZE);
 
 // Operation sets for efficient mapping to OpenTelemetry semantic convention values
 export const INVOKE_AGENT_OPS = new Set([
diff --git a/packages/core/src/tracing/vercel-ai/index.ts b/packages/core/src/tracing/vercel-ai/index.ts
index d3c4b036e228..233762f4caa2 100644
--- a/packages/core/src/tracing/vercel-ai/index.ts
+++ b/packages/core/src/tracing/vercel-ai/index.ts
@@ -232,12 +232,13 @@ function processToolCallSpan(span: Span, attributes: SpanAttributes): void {
   renameAttributeKey(attributes, AI_TOOL_CALL_NAME_ATTRIBUTE, GEN_AI_TOOL_NAME_ATTRIBUTE);
   renameAttributeKey(attributes, AI_TOOL_CALL_ID_ATTRIBUTE, GEN_AI_TOOL_CALL_ID_ATTRIBUTE);
 
-  // Store the span in our global map using the tool call ID
+  // Store the span context in our global map using the tool call ID.
   // This allows us to capture tool errors and link them to the correct span
+  // without retaining the full Span object in memory.
   const toolCallId = attributes[GEN_AI_TOOL_CALL_ID_ATTRIBUTE];
 
   if (typeof toolCallId === 'string') {
-    toolCallSpanMap.set(toolCallId, span);
+    toolCallSpanMap.set(toolCallId, span.spanContext());
   }
 
   // https://opentelemetry.io/docs/specs/semconv/registry/attributes/gen-ai/#gen-ai-tool-type
diff --git a/packages/core/src/tracing/vercel-ai/types.ts b/packages/core/src/tracing/vercel-ai/types.ts
index 03f22c415001..754ec53551ce 100644
--- a/packages/core/src/tracing/vercel-ai/types.ts
+++ b/packages/core/src/tracing/vercel-ai/types.ts
@@ -2,3 +2,8 @@ export interface TokenSummary {
   inputTokens: number;
   outputTokens: number;
 }
+
+export interface ToolCallSpanContext {
+  traceId: string;
+  spanId: string;
+}
diff --git a/packages/core/src/tracing/vercel-ai/utils.ts b/packages/core/src/tracing/vercel-ai/utils.ts
index b5f1b6c68352..b72bfb90a903 100644
--- a/packages/core/src/tracing/vercel-ai/utils.ts
+++ b/packages/core/src/tracing/vercel-ai/utils.ts
@@ -18,7 +18,7 @@ import {
 } from '../ai/gen-ai-attributes';
 import { extractSystemInstructions, getTruncatedJsonString } from '../ai/utils';
 import { toolCallSpanMap } from './constants';
-import type { TokenSummary } from './types';
+import type { TokenSummary, ToolCallSpanContext } from './types';
 import { AI_PROMPT_ATTRIBUTE, AI_PROMPT_MESSAGES_ATTRIBUTE } from './vercel-ai-attributes';
 
 /**
@@ -75,9 +75,9 @@ export function applyAccumulatedTokens(
 }
 
 /**
- * Get the span associated with a tool call ID
+ * Get the span context associated with a tool call ID.
  */
-export function _INTERNAL_getSpanForToolCallId(toolCallId: string): Span | undefined {
+export function _INTERNAL_getSpanForToolCallId(toolCallId: string): ToolCallSpanContext | undefined {
   return toolCallSpanMap.get(toolCallId);
 }
 
@@ -85,7 +85,7 @@ export function _INTERNAL_getSpanForToolCallId(toolCallId: string): Span | undef
  * Clean up the span mapping for a tool call ID
  */
 export function _INTERNAL_cleanupToolCallSpan(toolCallId: string): void {
-  toolCallSpanMap.delete(toolCallId);
+  toolCallSpanMap.remove(toolCallId);
 }
 
 /**
diff --git a/packages/core/test/lib/tracing/vercel-ai-tool-call-span-map.test.ts b/packages/core/test/lib/tracing/vercel-ai-tool-call-span-map.test.ts
new file mode 100644
index 000000000000..d9d61e8e04e7
--- /dev/null
+++ b/packages/core/test/lib/tracing/vercel-ai-tool-call-span-map.test.ts
@@ -0,0 +1,121 @@
+import { beforeEach, describe, expect, it } from 'vitest';
+import { addVercelAiProcessors } from '../../../src/tracing/vercel-ai';
+import { TOOL_CALL_SPAN_MAP_MAX_SIZE, toolCallSpanMap } from '../../../src/tracing/vercel-ai/constants';
+import { _INTERNAL_cleanupToolCallSpan, _INTERNAL_getSpanForToolCallId } from '../../../src/tracing/vercel-ai/utils';
+import {
+  AI_TOOL_CALL_ID_ATTRIBUTE,
+  AI_TOOL_CALL_NAME_ATTRIBUTE,
+} from '../../../src/tracing/vercel-ai/vercel-ai-attributes';
+import type { SpanAttributes, SpanAttributeValue, SpanTimeInput } from '../../../src/types-hoist/span';
+import type { SpanStatus } from '../../../src/types-hoist/spanStatus';
+import type { OpenTelemetrySdkTraceBaseSpan } from '../../../src/utils/spanUtils';
+import { getDefaultTestClientOptions, TestClient } from '../../mocks/client';
+
+function createToolCallSpan(params: {
+  toolCallId: string;
+  toolName: string;
+  traceId: string;
+  spanId: string;
+}): OpenTelemetrySdkTraceBaseSpan {
+  const attributes: SpanAttributes = {
+    [AI_TOOL_CALL_ID_ATTRIBUTE]: params.toolCallId,
+    [AI_TOOL_CALL_NAME_ATTRIBUTE]: params.toolName,
+  };
+
+  const startTime: SpanTimeInput = [0, 0];
+  const endTime: SpanTimeInput = [0, 0];
+  const status: SpanStatus = { code: 0 };
+
+  const span: OpenTelemetrySdkTraceBaseSpan = {
+    attributes,
+    startTime,
+    endTime,
+    name: 'ai.toolCall',
+    status,
+    spanContext: () => ({
+      traceId: params.traceId,
+      spanId: params.spanId,
+      traceFlags: 1,
+    }),
+    end: () => undefined,
+    setAttribute: (key: string, value: SpanAttributeValue | undefined) => {
+      if (value === undefined) {
+        // eslint-disable-next-line @typescript-eslint/no-dynamic-delete
+        delete attributes[key];
+      } else {
+        attributes[key] = value;
+      }
+      return span;
+    },
+    setAttributes: (nextAttributes: SpanAttributes) => {
+      for (const key of Object.keys(nextAttributes)) {
+        const value = nextAttributes[key];
+        if (value === undefined) {
+          // eslint-disable-next-line @typescript-eslint/no-dynamic-delete
+          delete attributes[key];
+        } else {
+          attributes[key] = value;
+        }
+      }
+      return span;
+    },
+    setStatus: (nextStatus: SpanStatus) => {
+      span.status = nextStatus;
+      return span;
+    },
+    updateName: (name: string) => {
+      span.name = name;
+      return span;
+    },
+    isRecording: () => true,
+    addEvent: () => span,
+    addLink: () => span,
+    addLinks: () => span,
+    recordException: () => undefined,
+  };
+
+  return span;
+}
+
+describe('vercel-ai tool call span context map', () => {
+  beforeEach(() => {
+    toolCallSpanMap.clear();
+  });
+
+  it('stores toolCallId -> span context on spanStart', () => {
+    const options = getDefaultTestClientOptions({ tracesSampleRate: 1.0 });
+    const client = new TestClient(options);
+    client.init();
+    addVercelAiProcessors(client);
+
+    const span = createToolCallSpan({
+      toolCallId: 'tool-call-1',
+      toolName: 'bash',
+      traceId: 'trace-id-1',
+      spanId: 'span-id-1',
+    });
+
+    client.emit('spanStart', span);
+
+    expect(_INTERNAL_getSpanForToolCallId('tool-call-1')).toMatchObject({
+      traceId: 'trace-id-1',
+      spanId: 'span-id-1',
+    });
+
+    _INTERNAL_cleanupToolCallSpan('tool-call-1');
+    expect(_INTERNAL_getSpanForToolCallId('tool-call-1')).toBeUndefined();
+  });
+
+  it('evicts old entries when the map exceeds max size', () => {
+    for (let i = 0; i < TOOL_CALL_SPAN_MAP_MAX_SIZE + 1; i++) {
+      toolCallSpanMap.set(`tool-call-${i}`, { traceId: `trace-${i}`, spanId: `span-${i}` });
+    }
+
+    expect(toolCallSpanMap.size).toBe(TOOL_CALL_SPAN_MAP_MAX_SIZE);
+    expect(toolCallSpanMap.get('tool-call-0')).toBeUndefined();
+    expect(toolCallSpanMap.get(`tool-call-${TOOL_CALL_SPAN_MAP_MAX_SIZE}`)).toEqual({
+      traceId: `trace-${TOOL_CALL_SPAN_MAP_MAX_SIZE}`,
+      spanId: `span-${TOOL_CALL_SPAN_MAP_MAX_SIZE}`,
+    });
+  });
+});
diff --git a/packages/node/src/integrations/tracing/vercelai/instrumentation.ts b/packages/node/src/integrations/tracing/vercelai/instrumentation.ts
index 19e6a2798b01..bd6dd4ac5009 100644
--- a/packages/node/src/integrations/tracing/vercelai/instrumentation.ts
+++ b/packages/node/src/integrations/tracing/vercelai/instrumentation.ts
@@ -1,6 +1,5 @@
 import type { InstrumentationConfig, InstrumentationModuleDefinition } from '@opentelemetry/instrumentation';
 import { InstrumentationBase, InstrumentationNodeModuleDefinition } from '@opentelemetry/instrumentation';
-import type { Span } from '@sentry/core';
 import {
   _INTERNAL_cleanupToolCallSpan,
   _INTERNAL_getSpanForToolCallId,
@@ -43,33 +42,46 @@ interface RecordingOptions {
   recordOutputs?: boolean;
 }
 
-interface ToolError {
-  type: 'tool-error' | 'tool-result' | 'tool-call';
+interface ToolErrorPart {
+  type: 'tool-error';
   toolCallId: string;
   toolName: string;
-  input?: {
-    [key: string]: unknown;
-  };
   error: Error;
-  dynamic?: boolean;
 }
 
-function isToolError(obj: unknown): obj is ToolError {
+interface ToolResultPart {
+  type: 'tool-result';
+  toolCallId: string;
+  toolName: string;
+}
+
+function isToolErrorPart(obj: unknown): obj is ToolErrorPart {
   if (typeof obj !== 'object' || obj === null) {
     return false;
   }
 
   const candidate = obj as Record<string, unknown>;
   return (
-    'type' in candidate &&
-    'error' in candidate &&
-    'toolName' in candidate &&
-    'toolCallId' in candidate &&
     candidate.type === 'tool-error' &&
+    typeof candidate.toolName === 'string' &&
+    typeof candidate.toolCallId === 'string' &&
     candidate.error instanceof Error
   );
 }
 
+function isToolResultPart(obj: unknown): obj is ToolResultPart {
+  if (typeof obj !== 'object' || obj === null) {
+    return false;
+  }
+
+  const candidate = obj as Record<string, unknown>;
+  return (
+    candidate.type === 'tool-result' &&
+    typeof candidate.toolName === 'string' &&
+    typeof candidate.toolCallId === 'string'
+  );
+}
+
 /**
  * Check for tool errors in the result and capture them
  * Tool errors are not rejected in Vercel V5, it is added as metadata to the result content
@@ -79,59 +91,65 @@ function checkResultForToolErrors(result: unknown): void {
     return;
   }
 
-  const resultObj = result as { content: Array<object> };
+  const resultObj = result as { content: unknown };
   if (!Array.isArray(resultObj.content)) {
     return;
   }
 
   for (const item of resultObj.content) {
-    if (isToolError(item)) {
-      // Try to get the span associated with this tool call ID
-      const associatedSpan = _INTERNAL_getSpanForToolCallId(item.toolCallId) as Span;
-
-      if (associatedSpan) {
-        // We have the span, so link the error using span and trace IDs from the span
-        const spanContext = associatedSpan.spanContext();
-
-        withScope(scope => {
-          // Set the span and trace context for proper linking
-          scope.setContext('trace', {
-            trace_id: spanContext.traceId,
-            span_id: spanContext.spanId,
-          });
-
-          scope.setTag('vercel.ai.tool.name', item.toolName);
-          scope.setTag('vercel.ai.tool.callId', item.toolCallId);
-
-          scope.setLevel('error');
-
-          captureException(item.error, {
-            mechanism: {
-              type: 'auto.vercelai.otel',
-              handled: false,
-            },
-          });
+    // Successful tool calls should not keep toolCallId -> span context mappings alive.
+    if (isToolResultPart(item)) {
+      _INTERNAL_cleanupToolCallSpan(item.toolCallId);
+      continue;
+    }
+
+    if (!isToolErrorPart(item)) {
+      continue;
+    }
+
+    // Try to get the span context associated with this tool call ID
+    const spanContext = _INTERNAL_getSpanForToolCallId(item.toolCallId);
+
+    if (spanContext) {
+      // We have a span context, so link the error using span and trace IDs from the span
+      withScope(scope => {
+        // Set the span and trace context for proper linking
+        scope.setContext('trace', {
+          trace_id: spanContext.traceId,
+          span_id: spanContext.spanId,
         });
 
-        // Clean up the span mapping since we've processed this tool error
-        // We won't get multiple { type: 'tool-error' } parts for the same toolCallId.
-        _INTERNAL_cleanupToolCallSpan(item.toolCallId);
-      } else {
-        // Fallback: capture without span linking
-        withScope(scope => {
-          scope.setTag('vercel.ai.tool.name', item.toolName);
-          scope.setTag('vercel.ai.tool.callId', item.toolCallId);
-          scope.setLevel('error');
-
-          captureException(item.error, {
-            mechanism: {
-              type: 'auto.vercelai.otel',
-              handled: false,
-            },
-          });
+        scope.setTag('vercel.ai.tool.name', item.toolName);
+        scope.setTag('vercel.ai.tool.callId', item.toolCallId);
+
+        scope.setLevel('error');
+
+        captureException(item.error, {
+          mechanism: {
+            type: 'auto.vercelai.otel',
+            handled: false,
+          },
         });
-      }
+      });
+    } else {
+      // Fallback: capture without span linking
+      withScope(scope => {
+        scope.setTag('vercel.ai.tool.name', item.toolName);
+        scope.setTag('vercel.ai.tool.callId', item.toolCallId);
+        scope.setLevel('error');
+
+        captureException(item.error, {
+          mechanism: {
+            type: 'auto.vercelai.otel',
+            handled: false,
+          },
+        });
+      });
     }
+
+    // Clean up the span mapping since we've processed this tool error
+    // We won't get multiple { type: 'tool-error' } parts for the same toolCallId.
+    _INTERNAL_cleanupToolCallSpan(item.toolCallId);
   }
 }
 