Build With X

Voice Agent

WebSocket-based real-time voice conversations with Grok

Live Voice API

Grok's Live Voice API enables real-time voice conversations via WebSocket.

The Live Voice API is currently in beta. API surface may change.

WebSocket Connection

const ws = new WebSocket("wss://api.x.ai/v1/live")

ws.onopen = () => {
  // The first frame on the socket must authenticate the connection.
  const authMessage = {
    type: "auth",
    api_key: process.env.XAI_API_KEY,
  }
  ws.send(JSON.stringify(authMessage))

  // Immediately follow up with the session configuration:
  // model, voice, and server-side voice-activity detection.
  const sessionMessage = {
    type: "session.update",
    session: {
      model: "grok-3-live",
      voice: "alloy",
      turn_detection: {
        type: "server_vad",
        threshold: 0.5,
      },
    },
  }
  ws.send(JSON.stringify(sessionMessage))
}

Sending Audio

// Send audio chunks as base64
// Encode a raw PCM chunk as base64 and stream it to the server.
function sendAudioChunk(audioBuffer: ArrayBuffer) {
  const encoded = Buffer.from(audioBuffer).toString("base64")
  const message = {
    type: "input_audio_buffer.append",
    audio: encoded,
  }
  ws.send(JSON.stringify(message))
}

// Commit audio when done speaking
// Signal end-of-utterance so the server processes the buffered audio.
function commitAudio() {
  const message = { type: "input_audio_buffer.commit" }
  ws.send(JSON.stringify(message))
}

Receiving Responses

// Dispatch server events by their `type` tag.
ws.onmessage = (event) => {
  const data = JSON.parse(event.data)

  switch (data.type) {
    case "response.audio.delta": {
      // Braces scope the `const` to this clause; without them the
      // lexical declaration leaks into the sibling cases
      // (ESLint no-case-declarations).
      const audioChunk = Buffer.from(data.delta, "base64")
      playAudio(audioChunk)
      break
    }

    case "response.audio_transcript.delta":
      // Received transcript of assistant's speech
      console.log("Assistant:", data.delta)
      break

    case "input_audio_buffer.speech_started":
      // User started speaking (VAD detected)
      break

    case "input_audio_buffer.speech_stopped":
      // User stopped speaking
      break

    case "error":
      console.error("Error:", data.error)
      break
  }
}

Voice Options

| Voice   | Description          |
| ------- | -------------------- |
| alloy   | Neutral, balanced    |
| echo    | Warm, conversational |
| fable   | Expressive, dynamic  |
| onyx    | Deep, authoritative  |
| nova    | Bright, energetic    |
| shimmer | Clear, professional  |

Audio Format

// Input audio requirements
// Input audio requirements: 16-bit PCM, mono, 24 kHz.
const audioConfig = {
  format: "pcm16",
  channels: 1,
  sampleRate: 24000,
}

// Output audio format
// Same as input: 24kHz mono PCM16

Full Example

import WebSocket from "ws"

/**
 * Open a Live Voice WebSocket, authenticate, and configure the session.
 *
 * Resolves with the ready-to-use socket once the connection opens and the
 * auth + session frames have been sent. Rejects with an Error if the
 * connection fails before opening.
 */
async function createVoiceSession(): Promise<WebSocket> {
  const ws = new WebSocket("wss://api.x.ai/v1/live")

  return new Promise<WebSocket>((resolve, reject) => {
    ws.onopen = () => {
      ws.send(
        JSON.stringify({
          type: "auth",
          api_key: process.env.XAI_API_KEY,
        })
      )

      ws.send(
        JSON.stringify({
          type: "session.update",
          session: {
            model: "grok-3-live",
            voice: "nova",
            instructions:
              "You are a helpful assistant. Keep responses brief and conversational.",
            turn_detection: {
              type: "server_vad",
              threshold: 0.5,
              silence_duration_ms: 500,
            },
          },
        })
      )

      // Detach the connect-phase error handler: once the promise is
      // settled, reject() is a no-op, so leaving it attached would
      // silently swallow later socket errors. The caller installs its
      // own onerror on the resolved socket.
      ws.onerror = null
      resolve(ws)
    }

    // Reject with a real Error (not the raw error event) so callers get
    // a stack trace and a conventional rejection reason.
    ws.onerror = () => {
      reject(new Error("WebSocket connection failed"))
    }
  })
}

// Usage
// Usage: open a configured session, then wire up message handling.
const session = await createVoiceSession()

session.onmessage = (event) => {
  const payload = JSON.parse(event.data.toString())
  // Handle messages...
}

Error Handling

// Surface transport-level failures.
ws.onerror = (error) => {
  console.error("WebSocket error:", error)
}

// Close code 1000 is a normal closure; anything else is unexpected.
ws.onclose = (event) => {
  const NORMAL_CLOSURE = 1000
  if (event.code === NORMAL_CLOSURE) return
  console.error("Unexpected close:", event.code, event.reason)
  // Implement reconnection logic
}

On this page