From 8464e82c2dfce1a89ac52c00dcd17d1fe4f5363a Mon Sep 17 00:00:00 2001 From: Antonio Sejas Date: Wed, 8 Apr 2026 21:31:39 +0100 Subject: [PATCH 1/3] Show context compaction events in studio code CLI The Agent SDK emits system messages when it shrinks context (compacting status, compact_boundary, microcompact_boundary) but the CLI UI ignored them, so long sessions appeared to stop suddenly while the SDK was silently managing context. Surface these events as informational messages and as a loader status during compaction. Co-Authored-By: Claude Opus 4.6 (1M context) --- apps/cli/ai/ui.ts | 63 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/apps/cli/ai/ui.ts b/apps/cli/ai/ui.ts index 06d703620d..12ee8a5095 100644 --- a/apps/cli/ai/ui.ts +++ b/apps/cli/ai/ui.ts @@ -2034,6 +2034,69 @@ export class AiChatUI { | { sessionId: string; maxTurnsReached: true; numTurns: number } | undefined { switch ( message.type ) { + case 'system': { + // Surface context-management events the SDK emits but that would + // otherwise be invisible to the user (compaction, micro-compaction, + // compacting status). Without this, long turns can appear to "stop + // suddenly" while the SDK is silently shrinking context. + if ( ! ( 'subtype' in message ) ) { + return undefined; + } + + if ( message.subtype === 'status' ) { + if ( message.status === 'compacting' ) { + this.setLoaderMessage( __( 'Compacting context…' ) ); + } + return undefined; + } + + if ( message.subtype === 'compact_boundary' ) { + const meta = message.compact_metadata; + const trigger = meta?.trigger ?? 'auto'; + const preTokens = meta?.pre_tokens ?? 0; + this.showInfo( + sprintf( + /* translators: 1: trigger (auto|manual), 2: token count before compaction */ + __( 'Context compacted (%1$s, %2$d tokens summarized into a shorter form).' ), + trigger, + preTokens + ) + ); + return undefined; + } + + // `microcompact_boundary` is emitted by the SDK runtime but not in + // the public type, so narrow via a defensive cast. It drops old + // tool-result attachments (e.g. screenshots) to free tokens without + // summarizing the whole transcript. + const systemMessage = message as { + subtype?: string; + microcompactMetadata?: { + tokensSaved?: number; + clearedAttachmentUUIDs?: string[]; + }; + }; + if ( systemMessage.subtype === 'microcompact_boundary' ) { + const tokensSaved = systemMessage.microcompactMetadata?.tokensSaved ?? 0; + const cleared = systemMessage.microcompactMetadata?.clearedAttachmentUUIDs?.length ?? 0; + this.showInfo( + sprintf( + /* translators: 1: number of old attachments dropped, 2: tokens freed */ + _n( + 'Dropped %1$d old attachment to free %2$d tokens.', + 'Dropped %1$d old attachments to free %2$d tokens.', + cleared + ), + cleared, + tokensSaved + ) + ); + return undefined; + } + + return undefined; + } + case 'assistant': { for ( const block of message.message.content ) { if ( block.type === 'text' ) { From 001657b11c2e1162fb0819ee9e45d916b33d74dc Mon Sep 17 00:00:00 2001 From: Antonio Sejas Date: Wed, 8 Apr 2026 21:38:33 +0100 Subject: [PATCH 2/3] Show context usage percentage in studio code footer Track the model's input + cache token counts against the reported contextWindow on every result message and render the percentage in the prompt footer alongside the existing status message, so users can see how full the context window is before it triggers compaction. Co-Authored-By: Claude Opus 4.6 (1M context) --- apps/cli/ai/ui.ts | 70 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 69 insertions(+), 1 deletion(-) diff --git a/apps/cli/ai/ui.ts b/apps/cli/ai/ui.ts index 12ee8a5095..965966a7db 100644 --- a/apps/cli/ai/ui.ts +++ b/apps/cli/ai/ui.ts @@ -83,6 +83,7 @@ class PromptEditor implements Component, Focusable { busyMessage: string | null = null; hints: string[] = []; statusMessage: string | null = null; + contextUsageLabel: string | null = null; showBottomBar = true; get focused(): boolean { @@ -193,7 +194,17 @@ class PromptEditor implements Component, Focusable { activeHints.length > 0 ? ' ' + activeHints.map( ( h ) => chalk.dim( h ) ).join( chalk.dim( ' · ' ) ) : ''; - const rightPart = this.statusMessage ? chalk.dim( this.statusMessage ) + ' ' : ''; + const rightSegments: string[] = []; + if ( this.contextUsageLabel ) { + rightSegments.push( this.contextUsageLabel ); + } + if ( this.statusMessage ) { + rightSegments.push( this.statusMessage ); + } + const rightPart = + rightSegments.length > 0 + ? rightSegments.map( ( s ) => chalk.dim( s ) ).join( chalk.dim( ' · ' ) ) + ' ' + : ''; if ( leftPart || rightPart ) { const leftLen = visibleWidth( leftPart ); const rightLen = visibleWidth( rightPart ); @@ -1597,6 +1608,62 @@ export class AiChatUI { this.tui.requestRender(); } + /** + * Update the context-usage indicator shown in the footer. + * + * `modelUsage` comes from an SDK `result` message. We sum the input token + * flavours (regular + cache reads + cache creations) because all of them + * count against the model's context window, and divide by `contextWindow` + * to surface a rough "how full is the context" percentage to the user. + */ + updateContextUsage( + modelUsage: + | Record< + string, + { + inputTokens?: number; + cacheReadInputTokens?: number; + cacheCreationInputTokens?: number; + contextWindow?: number; + } + > + | undefined + ): void { + if ( ! modelUsage ) { + return; + } + + // Use the entry with the largest window — resume/fork sessions can + // list multiple models, and we want the one whose budget is actually + // driving the next turn. + let best: { used: number; window: number } | null = null; + for ( const usage of Object.values( modelUsage ) ) { + const window = usage.contextWindow ?? 0; + if ( window <= 0 ) { + continue; + } + const used = + ( usage.inputTokens ?? 0 ) + + ( usage.cacheReadInputTokens ?? 0 ) + + ( usage.cacheCreationInputTokens ?? 0 ); + if ( ! best || window > best.window ) { + best = { used, window }; + } + } + + if ( ! best ) { + return; + } + + const percent = Math.min( 100, Math.round( ( best.used / best.window ) * 100 ) ); + this.editor.contextUsageLabel = sprintf( + /* translators: %d: percentage of context window consumed */ + __( 'Context %d%%' ), + percent + ); + this.tui.requestRender(); + } + private busyTimer: ReturnType< typeof setInterval > | null = null; private busyFrameIndex = 0; private static readonly BUSY_FRAMES = [ '⠋', '⠙', '⠹', '⠸', '⠼', '⠴', '⠦', '⠧', '⠇', '⠏' ]; @@ -2186,6 +2253,7 @@ export class AiChatUI { } case 'result': { this.hideLoader(); + this.updateContextUsage( message.modelUsage ); if ( message.subtype === 'success' ) { const thinkingSec = Math.round( ( this.nowMs() - this.turnStartTime ) / 1000 ); if ( ! this.hasShownResponseMarker ) { From 9c8350353f72df6b85adfbb50ff3cb05cd435ab4 Mon Sep 17 00:00:00 2001 From: Antonio Sejas Date: Wed, 8 Apr 2026 22:01:38 +0100 Subject: [PATCH 3/3] Fix context usage percentage to use per-call prompt size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous version divided result.modelUsage cumulative totals by contextWindow, which inflated the percentage in multi-iteration turns (each iteration's cached reads were summed, so an 11-iteration turn could report > 100% even when no single API call was close to the window). Track the last assistant message's usage snapshot instead — that reflects the actual prompt size sent on the most recent API call — and cache the context window from result messages so the label can update mid-turn as iterations stream in. Co-Authored-By: Claude Opus 4.6 (1M context) --- apps/cli/ai/ui.ts | 90 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 64 insertions(+), 26 deletions(-) diff --git a/apps/cli/ai/ui.ts b/apps/cli/ai/ui.ts index 965966a7db..18a2994b34 100644 --- a/apps/cli/ai/ui.ts +++ b/apps/cli/ai/ui.ts @@ -1609,21 +1609,57 @@ export class AiChatUI { } /** - * Update the context-usage indicator shown in the footer. - * - * `modelUsage` comes from an SDK `result` message. We sum the input token - * flavours (regular + cache reads + cache creations) because all of them - * count against the model's context window, and divide by `contextWindow` - * to surface a rough "how full is the context" percentage to the user. + * Tracks the last-call prompt size (input + cache reads + cache creations) + * from the most recent assistant message. Unlike `result.modelUsage` — which + * is summed across all agentic iterations in a turn and therefore overstates + * how full the context actually is — this mirrors what was actually sent to + * the model on the latest API call. That's the closest available signal for + * "how full is the context right now". + */ + private lastPromptTokens: number | null = null; + + /** + * Largest `contextWindow` observed in any `result.modelUsage` this session. + * Cached so the footer can render mid-turn (while only `assistant` messages + * are streaming in) without waiting for the next `result`. + */ + private knownContextWindow: number | null = null; + + /** + * Record the prompt-size snapshot from an assistant message's `usage` block. + * Called from `handleMessage` on every `assistant` message so the indicator + * reflects the latest iteration even mid-turn. + */ + recordAssistantUsage( usage: unknown ): void { + if ( ! usage || typeof usage !== 'object' ) { + return; + } + const u = usage as { + input_tokens?: number; + cache_read_input_tokens?: number; + cache_creation_input_tokens?: number; + }; + const total = + ( u.input_tokens ?? 0 ) + + ( u.cache_read_input_tokens ?? 0 ) + + ( u.cache_creation_input_tokens ?? 0 ); + if ( total > 0 ) { + this.lastPromptTokens = total; + this.renderContextUsageLabel(); + } + } + + /** + * Cache the largest `contextWindow` from an SDK `result.modelUsage` map. + * Called on every `result` message. We pick the largest window because + * resume/fork sessions can list multiple models and we want the main + * agent's budget, not a side sub-agent's. */ updateContextUsage( modelUsage: | Record< string, { - inputTokens?: number; - cacheReadInputTokens?: number; - cacheCreationInputTokens?: number; contextWindow?: number; } > @@ -1633,29 +1669,28 @@ export class AiChatUI { return; } - // Use the entry with the largest window — resume/fork sessions can - // list multiple models, and we want the one whose budget is actually - // driving the next turn. - let best: { used: number; window: number } | null = null; + let window = this.knownContextWindow ?? 0; for ( const usage of Object.values( modelUsage ) ) { - const window = usage.contextWindow ?? 0; - if ( window <= 0 ) { - continue; - } - const used = - ( usage.inputTokens ?? 0 ) + - ( usage.cacheReadInputTokens ?? 0 ) + - ( usage.cacheCreationInputTokens ?? 0 ); - if ( ! best || window > best.window ) { - best = { used, window }; + const candidate = usage.contextWindow ?? 0; + if ( candidate > window ) { + window = candidate; } } - if ( ! best ) { - return; + if ( window > 0 ) { + this.knownContextWindow = window; + this.renderContextUsageLabel(); } + } - const percent = Math.min( 100, Math.round( ( best.used / best.window ) * 100 ) ); + private renderContextUsageLabel(): void { + if ( this.lastPromptTokens === null || ! this.knownContextWindow ) { + return; + } + const percent = Math.min( + 100, + Math.round( ( this.lastPromptTokens / this.knownContextWindow ) * 100 ) + ); this.editor.contextUsageLabel = sprintf( /* translators: %d: percentage of context window consumed */ __( 'Context %d%%' ), @@ -2165,6 +2200,9 @@ export class AiChatUI { } case 'assistant': { + // Capture per-call prompt size so the footer reflects the latest + // iteration's context fill, not the per-turn cumulative total. + this.recordAssistantUsage( ( message.message as { usage?: unknown } ).usage ); for ( const block of message.message.content ) { if ( block.type === 'text' ) { this.hideLoader();