Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions apps/docs/content/docs/en/execution/costs.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,21 @@ Use your own API keys for AI model providers instead of Sim's hosted keys to pay

When configured, workflows use your key instead of Sim's hosted keys. If removed, workflows automatically fall back to hosted keys with the multiplier.

## Voice Input

Voice input uses ElevenLabs Scribe v2 Realtime for speech-to-text transcription. It is available in the Mothership chat and in deployed chat voice mode.

| Context | Cost per session | Max duration |
|---------|-----------------|--------------|
| Mothership (workspace) | ~5 credits ($0.024) | 3 minutes |
| Deployed chat (voice mode) | ~2 credits ($0.008) | 1 minute |

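Each voice session is billed a flat amount when it starts, based on the maximum duration for its context rather than on how long you actually speak. In deployed chat voice mode, each conversation turn (speak → agent responds → speak again) is a separate session, so multi-turn conversations are billed per turn. Deployed chat sessions are billed to the owner of the chat.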

<Callout type="info">
Voice input requires `ELEVENLABS_API_KEY` to be configured. When the key is not set, voice input controls are hidden.
</Callout>

## Plans

Sim has two paid plan tiers — **Pro** and **Max**. Either can be used individually or with a team. Team plans pool credits across all seats in the organization.
Expand Down
6 changes: 3 additions & 3 deletions apps/sim/app/api/a2a/serve/[agentId]/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import {
import { type AuthResult, AuthType, checkHybridAuth } from '@/lib/auth/hybrid'
import { acquireLock, getRedisClient, releaseLock } from '@/lib/core/config/redis'
import { validateUrlWithDNS } from '@/lib/core/security/input-validation.server'
import { getClientIp } from '@/lib/core/utils/request'
import { SSE_HEADERS } from '@/lib/core/utils/sse'
import { getBaseUrl } from '@/lib/core/utils/urls'
import { generateId } from '@/lib/core/utils/uuid'
Expand Down Expand Up @@ -52,10 +53,9 @@ function getCallerFingerprint(request: NextRequest, userId?: string | null): str
return `user:${userId}`
}

const forwardedFor = request.headers.get('x-forwarded-for')?.split(',')[0]?.trim()
const realIp = request.headers.get('x-real-ip')?.trim()
const clientIp = getClientIp(request)
const userAgent = request.headers.get('user-agent')?.trim() || 'unknown'
return `public:${forwardedFor || realIp || 'unknown'}:${userAgent}`
return `public:${clientIp}:${userAgent}`
}

function hasCallerAccessToTask(
Expand Down
4 changes: 2 additions & 2 deletions apps/sim/app/api/demo-requests/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import { type NextRequest, NextResponse } from 'next/server'
import { env } from '@/lib/core/config/env'
import type { TokenBucketConfig } from '@/lib/core/rate-limiter'
import { RateLimiter } from '@/lib/core/rate-limiter'
import { generateRequestId } from '@/lib/core/utils/request'
import { generateRequestId, getClientIp } from '@/lib/core/utils/request'
import { getEmailDomain } from '@/lib/core/utils/urls'
import { sendEmail } from '@/lib/messaging/email/mailer'
import { getFromEmailAddress } from '@/lib/messaging/email/utils'
Expand All @@ -25,7 +25,7 @@ export async function POST(req: NextRequest) {
const requestId = generateRequestId()

try {
const ip = req.headers.get('x-forwarded-for')?.split(',')[0]?.trim() ?? 'unknown'
const ip = getClientIp(req)
const storageKey = `public:demo-request:${ip}`

const { allowed, remaining, resetAt } = await rateLimiter.checkRateLimitDirect(
Expand Down
4 changes: 2 additions & 2 deletions apps/sim/app/api/help/integration-request/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import { z } from 'zod'
import { env } from '@/lib/core/config/env'
import type { TokenBucketConfig } from '@/lib/core/rate-limiter'
import { RateLimiter } from '@/lib/core/rate-limiter'
import { generateRequestId } from '@/lib/core/utils/request'
import { generateRequestId, getClientIp } from '@/lib/core/utils/request'
import { getEmailDomain } from '@/lib/core/utils/urls'
import { sendEmail } from '@/lib/messaging/email/mailer'
import {
Expand Down Expand Up @@ -37,7 +37,7 @@ export async function POST(req: NextRequest) {
const requestId = generateRequestId()

try {
const ip = req.headers.get('x-forwarded-for')?.split(',')[0]?.trim() ?? 'unknown'
const ip = getClientIp(req)
const storageKey = `public:integration-request:${ip}`

const { allowed, remaining, resetAt } = await rateLimiter.checkRateLimitDirect(
Expand Down
11 changes: 11 additions & 0 deletions apps/sim/app/api/settings/voice/route.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import { NextResponse } from 'next/server'
import { hasSTTService } from '@/lib/speech/config'

/**
 * Reports whether server-side speech-to-text (STT) is configured.
 *
 * The endpoint is intentionally unauthenticated: it exposes only a single
 * boolean flag, and visitors of deployed chats need it to decide whether
 * to show voice controls.
 *
 * @returns A JSON response of the shape `{ sttAvailable: boolean }`.
 */
export async function GET() {
  const sttAvailable = hasSTTService()
  return NextResponse.json({ sttAvailable })
}
171 changes: 171 additions & 0 deletions apps/sim/app/api/speech/token/route.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
import { db } from '@sim/db'
import { chat } from '@sim/db/schema'
import { createLogger } from '@sim/logger'
import { eq } from 'drizzle-orm'
import { type NextRequest, NextResponse } from 'next/server'
import { getSession } from '@/lib/auth'
import { hasExceededCostLimit } from '@/lib/billing/core/subscription'
import { recordUsage } from '@/lib/billing/core/usage-log'
import { env } from '@/lib/core/config/env'
import { getCostMultiplier, isBillingEnabled } from '@/lib/core/config/feature-flags'
import { RateLimiter } from '@/lib/core/rate-limiter'
import { validateAuthToken } from '@/lib/core/security/deployment'
import { getClientIp } from '@/lib/core/utils/request'

// Module-scoped logger; every log line from this route is tagged 'SpeechTokenAPI'.
const logger = createLogger('SpeechTokenAPI')

// Next.js route segment config: always evaluate this handler per request
// (tokens are single-use, so a cached response would be useless).
export const dynamic = 'force-dynamic'

// ElevenLabs endpoint that issues a single-use token for realtime Scribe
// transcription. Called server-side with the `xi-api-key` header in POST below.
const ELEVENLABS_TOKEN_URL = 'https://api.elevenlabs.io/v1/single-use-token/realtime_scribe'

// Per-minute rate used to price a voice session (presumably USD — confirm
// against the billing docs). POST multiplies it by the max minutes for the
// context, so a session is charged a flat amount up front.
const VOICE_SESSION_COST_PER_MIN = 0.008
// Maximum session length, in minutes, when no chatId is supplied (session-authenticated caller).
const WORKSPACE_SESSION_MAX_MINUTES = 3
// Maximum session length, in minutes, when a chatId is supplied (deployed chat).
const CHAT_SESSION_MAX_MINUTES = 1

// Rate-limit configuration passed to RateLimiter.checkRateLimitDirect for
// token issuance. NOTE(review): presumably token-bucket semantics — a burst of
// up to 30 requests, refilled by 3 every 72 seconds (~150/hour); confirm
// against the RateLimiter implementation.
const STT_TOKEN_RATE_LIMIT = {
maxTokens: 30,
refillRate: 3,
refillIntervalMs: 72 * 1000,
} as const

// Single limiter instance shared by all requests handled by this module.
const rateLimiter = new RateLimiter()

/**
 * Checks whether the incoming request may request an STT token on behalf of
 * a deployed chat.
 *
 * A request is accepted when the chat exists, is active, and either has
 * `authType === 'public'` or carries a `chat_auth_<chatId>` cookie that
 * passes `validateAuthToken`. Any lookup failure is logged and treated as
 * "not authorized" instead of being rethrown.
 *
 * @param request - Incoming request; only its cookies are inspected.
 * @param chatId - Identifier of the deployed chat to validate against.
 * @returns `{ valid: true, ownerId }` (the chat's `userId`) on success,
 *          otherwise `{ valid: false }`.
 */
async function validateChatAuth(
  request: NextRequest,
  chatId: string
): Promise<{ valid: boolean; ownerId?: string }> {
  try {
    const rows = await db
      .select({
        id: chat.id,
        userId: chat.userId,
        isActive: chat.isActive,
        authType: chat.authType,
        password: chat.password,
      })
      .from(chat)
      .where(eq(chat.id, chatId))
      .limit(1)

    const record = rows[0]
    if (!record || !record.isActive) {
      return { valid: false }
    }

    // Non-public chats require a valid per-chat auth cookie.
    if (record.authType !== 'public') {
      const cookie = request.cookies.get(`chat_auth_${chatId}`)
      if (!cookie) {
        return { valid: false }
      }
      if (!validateAuthToken(cookie.value, chatId, record.password)) {
        return { valid: false }
      }
    }

    return { valid: true, ownerId: record.userId }
  } catch (error) {
    logger.error('Error validating chat auth for STT:', error)
    return { valid: false }
  }
}

/**
 * Reduces an upstream error payload to a plain string for the client.
 *
 * The `detail` field may be a string or a nested object carrying its own
 * `message`; forwarding an object verbatim would put a non-string value in
 * the `error` field of our response.
 */
function extractUpstreamErrorMessage(errBody: unknown, status: number): string {
  const fallback = `Token request failed (${status})`
  if (typeof errBody !== 'object' || errBody === null) return fallback

  const { detail, message } = errBody as { detail?: unknown; message?: unknown }
  if (typeof detail === 'string' && detail) return detail
  if (typeof detail === 'object' && detail !== null) {
    const nested = (detail as { message?: unknown }).message
    if (typeof nested === 'string' && nested) return nested
  }
  if (typeof message === 'string' && message) return message
  return fallback
}

/**
 * Issues a single-use ElevenLabs realtime-transcription token.
 *
 * Flow:
 *  1. Authorize: with a `chatId` in the JSON body the request is validated
 *     against that deployed chat and billed to the chat owner; without one
 *     a logged-in session is required and the session user is billed.
 *  2. When billing is enabled, apply the STT token rate limit (keyed per
 *     chat + client IP, or per user) and the cost-limit check.
 *  3. Request the token from ElevenLabs.
 *  4. Record a flat usage entry for the session, then return the token.
 *
 * Responses: 200 `{ token }`, 400 (malformed `chatId`), 401 (unauthorized),
 * 402 (usage limit exceeded), 429 (rate limited, with `Retry-After`),
 * 502 (upstream failure or malformed upstream response), 503 (no API key
 * configured), 500 (unexpected error).
 */
export async function POST(request: NextRequest) {
  try {
    // A missing or invalid JSON body is treated as an empty object.
    const body: unknown = await request.json().catch(() => ({}))
    const rawChatId =
      typeof body === 'object' && body !== null
        ? (body as { chatId?: unknown }).chatId
        : undefined

    // The body is untrusted: only a string may flow into the DB lookup and
    // the rate-limit key. An empty string still means "no chat".
    let chatId: string | undefined
    if (typeof rawChatId === 'string') {
      chatId = rawChatId.length > 0 ? rawChatId : undefined
    } else if (rawChatId != null) {
      return NextResponse.json({ error: 'Invalid chatId' }, { status: 400 })
    }

    let billingUserId: string | undefined

    if (chatId) {
      const chatAuth = await validateChatAuth(request, chatId)
      if (!chatAuth.valid) {
        return NextResponse.json({ error: 'Unauthorized' }, { status: 401 })
      }
      billingUserId = chatAuth.ownerId
    } else {
      const session = await getSession()
      if (!session?.user?.id) {
        return NextResponse.json({ error: 'Unauthorized' }, { status: 401 })
      }
      billingUserId = session.user.id
    }

    // Rate limiting is only enforced when billing is enabled.
    if (isBillingEnabled) {
      const rateLimitKey = chatId
        ? `stt-token:chat:${chatId}:${getClientIp(request)}`
        : `stt-token:user:${billingUserId}`

      const rateCheck = await rateLimiter.checkRateLimitDirect(rateLimitKey, STT_TOKEN_RATE_LIMIT)
      if (!rateCheck.allowed) {
        return NextResponse.json(
          { error: 'Voice input rate limit exceeded. Please try again later.' },
          {
            status: 429,
            headers: {
              'Retry-After': String(Math.ceil((rateCheck.retryAfterMs ?? 60000) / 1000)),
            },
          }
        )
      }
    }

    if (billingUserId && isBillingEnabled) {
      const exceeded = await hasExceededCostLimit(billingUserId)
      if (exceeded) {
        return NextResponse.json(
          { error: 'Usage limit exceeded. Please upgrade your plan to continue.' },
          { status: 402 }
        )
      }
    }

    const apiKey = env.ELEVENLABS_API_KEY
    if (!apiKey?.trim()) {
      return NextResponse.json(
        { error: 'Speech-to-text service is not configured' },
        { status: 503 }
      )
    }

    const response = await fetch(ELEVENLABS_TOKEN_URL, {
      method: 'POST',
      headers: { 'xi-api-key': apiKey },
    })

    if (!response.ok) {
      const errBody: unknown = await response.json().catch(() => ({}))
      const message = extractUpstreamErrorMessage(errBody, response.status)
      logger.error('ElevenLabs token request failed', { status: response.status, message })
      return NextResponse.json({ error: message }, { status: 502 })
    }

    const data: unknown = await response.json()
    const token =
      typeof data === 'object' && data !== null ? (data as { token?: unknown }).token : undefined

    // Never bill for, or hand back, a session the client cannot start.
    if (typeof token !== 'string' || token.length === 0) {
      logger.error('ElevenLabs token response did not include a token', {
        status: response.status,
      })
      return NextResponse.json(
        { error: 'Speech-to-text service returned an invalid response' },
        { status: 502 }
      )
    }

    if (billingUserId) {
      // Flat charge: per-minute rate times the maximum session length for the context.
      const maxMinutes = chatId ? CHAT_SESSION_MAX_MINUTES : WORKSPACE_SESSION_MAX_MINUTES
      const sessionCost = VOICE_SESSION_COST_PER_MIN * maxMinutes

      // Best-effort: a failure to record usage must not block the voice session.
      await recordUsage({
        userId: billingUserId,
        entries: [
          {
            category: 'fixed',
            source: 'voice-input',
            description: `Voice input session (${maxMinutes} min)`,
            cost: sessionCost * getCostMultiplier(),
          },
        ],
      }).catch((err) => {
        logger.warn('Failed to record voice input usage, continuing:', err)
      })
    }

    return NextResponse.json({ token })
  } catch (error) {
    const message = error instanceof Error ? error.message : 'Failed to generate speech token'
    logger.error('Speech token error:', error)
    return NextResponse.json({ error: message }, { status: 500 })
  }
}
13 changes: 12 additions & 1 deletion apps/sim/app/chat/[identifier]/chat.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,14 @@ export default function ChatClient({ identifier }: { identifier: string }) {
const [authRequired, setAuthRequired] = useState<'password' | 'email' | 'sso' | null>(null)

const [isVoiceFirstMode, setIsVoiceFirstMode] = useState(false)
const [sttAvailable, setSttAvailable] = useState(false)

useEffect(() => {
fetch('/api/settings/voice')
.then((r) => (r.ok ? r.json() : { sttAvailable: false }))
.then((data) => setSttAvailable(data.sttAvailable === true))
.catch(() => setSttAvailable(false))
}, [])
const { isStreamingResponse, abortControllerRef, stopStreaming, handleStreamedResponse } =
useChatStreaming()
const audioContextRef = useRef<AudioContext | null>(null)
Expand Down Expand Up @@ -443,8 +451,9 @@ export default function ChatClient({ identifier }: { identifier: string }) {
}, [isStreamingResponse, stopStreaming, setMessages, stopAudio])

const handleVoiceStart = useCallback(() => {
if (!sttAvailable) return
setIsVoiceFirstMode(true)
}, [])
}, [sttAvailable])

const handleExitVoiceMode = useCallback(() => {
setIsVoiceFirstMode(false)
Expand Down Expand Up @@ -494,6 +503,7 @@ export default function ChatClient({ identifier }: { identifier: string }) {
isStreaming={isStreamingResponse}
isPlayingAudio={isPlayingAudio}
audioContextRef={audioContextRef}
chatId={chatConfig?.id}
messages={messages.map((msg) => ({
content: typeof msg.content === 'string' ? msg.content : JSON.stringify(msg.content),
type: msg.type,
Expand Down Expand Up @@ -529,6 +539,7 @@ export default function ChatClient({ identifier }: { identifier: string }) {
isStreaming={isStreamingResponse}
onStopStreaming={() => stopStreaming(setMessages)}
onVoiceStart={handleVoiceStart}
sttAvailable={sttAvailable}
/>
</div>
</div>
Expand Down
22 changes: 11 additions & 11 deletions apps/sim/app/chat/components/input/input.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,6 @@ const logger = createLogger('ChatInput')

const MAX_TEXTAREA_HEIGHT = 200

const IS_STT_AVAILABLE =
typeof window !== 'undefined' &&
!!(
(window as Window & { SpeechRecognition?: unknown; webkitSpeechRecognition?: unknown })
.SpeechRecognition ||
(window as Window & { webkitSpeechRecognition?: unknown }).webkitSpeechRecognition
)

interface AttachedFile {
id: string
name: string
Expand All @@ -37,7 +29,15 @@ export const ChatInput: React.FC<{
onStopStreaming?: () => void
onVoiceStart?: () => void
voiceOnly?: boolean
}> = ({ onSubmit, isStreaming = false, onStopStreaming, onVoiceStart, voiceOnly = false }) => {
sttAvailable?: boolean
}> = ({
onSubmit,
isStreaming = false,
onStopStreaming,
onVoiceStart,
voiceOnly = false,
sttAvailable = false,
}) => {
const fileInputRef = useRef<HTMLInputElement>(null)
const textareaRef = useRef<HTMLTextAreaElement>(null)
const [inputValue, setInputValue] = useState('')
Expand Down Expand Up @@ -142,7 +142,7 @@ export const ChatInput: React.FC<{
return (
<Tooltip.Provider>
<div className='flex items-center justify-center'>
{IS_STT_AVAILABLE && (
{sttAvailable && (
<Tooltip.Root>
<Tooltip.Trigger asChild>
<div>
Expand Down Expand Up @@ -295,7 +295,7 @@ export const ChatInput: React.FC<{

{/* Right: mic + send */}
<div className='flex items-center gap-1.5'>
{IS_STT_AVAILABLE && (
{sttAvailable && (
<Tooltip.Root>
<Tooltip.Trigger asChild>
<button
Expand Down
Loading
Loading