Run Quickstart First
Use the clone-and-run flow first, then come back here for implementation details.
Configure backend environment and dependencies
The backend uses two processes: token server and agent worker.
backend/.env
Copy
LIVEKIT_URL=wss://your-project.livekit.cloud # https://cloud.livekit.io
LIVEKIT_API_KEY=your_api_key # https://cloud.livekit.io
LIVEKIT_API_SECRET=your_api_secret # https://cloud.livekit.io
GOOGLE_API_KEY=your_google_api_key # https://aistudio.google.com/api-keys
E2E_GOOGLE_MODEL=gemini-2.5-flash-native-audio-preview-12-2025
E2E_GOOGLE_VOICE=Puck
SPATIALREAL_API_KEY=your_api_key # https://app.spatialreal.ai/apps
SPATIALREAL_APP_ID=your_app_id # https://app.spatialreal.ai/apps
SPATIALREAL_AVATAR_ID=6aed28f9-674c-4ffb-89ee-b447b28aa3ed # https://app.spatialreal.ai/avatars/library
backend/pyproject.toml
Copy
[project]
name = "spatialreal-agent-quickstart-backend"
version = "0.1.0"
requires-python = ">=3.10,<3.15"
dependencies = [
"flask>=3.0.0",
"flask-cors>=4.0.0",
"python-dotenv>=1.0.0",
"livekit-api>=1.1.0",
"livekit-agents==1.4.5",
"livekit-plugins-google==1.4.5",
"livekit-plugins-spatialreal==1.4.5",
]
Implement token endpoint and dispatch logic
token_server.py issues the browser token and dispatches the LiveKit agent. File: backend/token_server.py
Copy
import asyncio
import os
from datetime import timedelta
from uuid import uuid4
from dotenv import load_dotenv
from flask import Flask, jsonify, request
from flask_cors import CORS
from livekit import api
load_dotenv()
app = Flask(__name__)
CORS(app)
LIVEKIT_URL = os.getenv("LIVEKIT_URL")
LIVEKIT_API_KEY = os.getenv("LIVEKIT_API_KEY")
LIVEKIT_API_SECRET = os.getenv("LIVEKIT_API_SECRET")
async def create_room_and_dispatch(room_name: str) -> None:
    """Create the LiveKit room (if needed) and dispatch the voice agent into it.

    Args:
        room_name: Name of the room the browser client will join.
    """
    lkapi = api.LiveKitAPI(LIVEKIT_URL, LIVEKIT_API_KEY, LIVEKIT_API_SECRET)
    try:
        try:
            await lkapi.room.create_room(api.CreateRoomRequest(name=room_name))
        except Exception as exc:
            # Best-effort: the room typically already exists. Log instead of
            # silently swallowing so real failures (bad credentials, network)
            # remain visible in the server output.
            print(f"Note: create_room failed (continuing): {exc}")
        await lkapi.agent_dispatch.create_dispatch(
            api.CreateAgentDispatchRequest(room=room_name, agent_name="voice-assistant")
        )
    finally:
        # Always release the underlying HTTP session.
        await lkapi.aclose()
@app.route("/token", methods=["POST"])
def token():
    """Issue a LiveKit access token for the browser and dispatch the agent.

    Accepts an optional JSON body: {"room": str, "identity": str}.
    Returns JSON with the signed token, the LiveKit server URL, the room
    name, and the resolved participant identity. Responds 500 when the
    LiveKit credentials are not configured.
    """
    if not LIVEKIT_API_KEY or not LIVEKIT_API_SECRET:
        return jsonify({"error": "LiveKit credentials not configured"}), 500
    # silent=True: tolerate a missing/invalid JSON body or a missing
    # Content-Type header instead of letting Flask abort with 400/415,
    # so the `or {}` fallback actually applies.
    body = request.get_json(silent=True) or {}
    room_name = body.get("room", "voice-agent-room")
    requested_identity = body.get("identity")
    # Use the caller-supplied identity when it is a non-empty string;
    # otherwise generate a short random browser identity.
    identity = (
        requested_identity.strip()
        if isinstance(requested_identity, str) and requested_identity.strip()
        else f"browser-{uuid4().hex[:8]}"
    )
    jwt = (
        api.AccessToken(LIVEKIT_API_KEY, LIVEKIT_API_SECRET)
        .with_identity(identity)
        .with_name(identity)
        .with_ttl(timedelta(hours=1))
        .with_grants(
            api.VideoGrants(
                room_join=True,
                room=room_name,
                can_publish=True,
                can_subscribe=True,
                can_publish_data=True,
            )
        )
        .to_jwt()
    )
    # Dispatch is best-effort: the browser can still join with the token
    # even if agent dispatch fails (e.g. transient API error).
    try:
        asyncio.run(create_room_and_dispatch(room_name))
    except Exception as exc:
        print(f"Warning: Failed to dispatch agent: {exc}")
    return jsonify(
        {
            "token": jwt,
            "url": LIVEKIT_URL,
            "room": room_name,
            "identity": identity,
        }
    )
if __name__ == "__main__":
    # Development server only: debug=True enables the reloader/debugger
    # and must not be used in production.
    app.run(host="0.0.0.0", port=8080, debug=True)
Implement agent worker + AvatarSession
agent.py runs the realtime LLM and starts SpatialReal avatar publishing in the same room. File: backend/agent.py
Copy
import os
from dotenv import load_dotenv
from livekit.agents import Agent, AgentSession, AutoSubscribe, JobContext, WorkerOptions, cli
from livekit.plugins import google
from livekit.plugins.spatialreal import AvatarSession
load_dotenv()
class VoiceAssistant(Agent):
    """Minimal voice assistant agent with a concise, natural-reply persona."""

    def __init__(self) -> None:
        persona = "You are a helpful voice assistant. Keep replies short and natural."
        super().__init__(instructions=persona)
async def entrypoint(ctx: JobContext) -> None:
    """Worker entrypoint: join the room, then start the LLM session and avatar."""
    # Only audio is needed from the browser participant.
    await ctx.connect(auto_subscribe=AutoSubscribe.AUDIO_ONLY)

    # Model/voice are overridable via env; defaults mirror the quickstart.
    model_name = os.getenv("E2E_GOOGLE_MODEL", "gemini-2.5-flash")
    voice_name = os.getenv("E2E_GOOGLE_VOICE", "Puck")
    realtime_llm = google.realtime.RealtimeModel(
        model=model_name,
        voice=voice_name,
        api_key=os.getenv("GOOGLE_API_KEY"),
    )
    agent_session = AgentSession(llm=realtime_llm)

    # Start avatar publishing first, then attach the agent to the session.
    avatar = AvatarSession()
    await avatar.start(agent_session, room=ctx.room)
    await agent_session.start(agent=VoiceAssistant(), room=ctx.room)
if __name__ == "__main__":
    # Register this worker under agent_name "voice-assistant" so the token
    # server's dispatch request (CreateAgentDispatchRequest) targets it.
    cli.run_app(WorkerOptions(entrypoint_fnc=entrypoint, agent_name="voice-assistant"))
Configure frontend env and Vite
Frontend reads the token endpoint and room name from env and proxies /token locally. File: frontend/.env
Copy
VITE_SPATIALREAL_APP_ID=your_app_id # https://app.spatialreal.ai/apps
VITE_SPATIALREAL_AVATAR_ID=6aed28f9-674c-4ffb-89ee-b447b28aa3ed # https://app.spatialreal.ai/avatars/library
VITE_TOKEN_ENDPOINT=http://localhost:8080/token
VITE_ROOM_NAME=voice-agent-room
frontend/vite.config.ts
Copy
import { defineConfig } from 'vite'
import react from '@vitejs/plugin-react'
import { avatarkitVitePlugin } from '@spatialwalk/avatarkit/vite'
// Vite dev-server config: React + AvatarKit plugins, with /token proxied
// to the local Flask token server so the browser avoids CORS in dev.
export default defineConfig({
  plugins: [react(), avatarkitVitePlugin()],
  server: {
    port: 3000,
    proxy: {
      // Forward /token requests to the backend token server on port 8080.
      '/token': {
        target: 'http://localhost:8080',
        changeOrigin: true,
      },
    },
  },
})
Implement AvatarKit UI frontend page
App.tsx requests a token, mounts SpatialRealAvatarProvider, and controls microphone state. File: frontend/src/App.tsx
Copy
import { useState } from 'react'
import '@livekit/components-styles'
import { Button } from '@/components/ui/button'
import { Track } from 'livekit-client'
import {
SpatialRealAvatarCanvas,
SpatialRealAvatarError,
SpatialRealAvatarFrame,
SpatialRealAvatarLoading,
SpatialRealAvatarProvider,
SpatialRealAvatarStatus,
useSpatialRealAvatarContext,
} from '@/components/spatialreal-avatar'
// Connection details handed to SpatialRealAvatarProvider once a token is issued.
type AvatarConnection = {
  url: string
  token: string
  roomName: string
}
// Shape of the JSON payload returned by the backend /token endpoint.
type TokenResponse = {
  url: string
  token: string
  room: string
}
// Renders the avatar canvas plus mic/disconnect controls. Must be mounted
// inside SpatialRealAvatarProvider (uses its context for room/connection state).
function AvatarPanel({ onExit }: { onExit: () => void }) {
  const avatar = useSpatialRealAvatarContext()
  // Guards against overlapping async mic/disconnect operations.
  const [pending, setPending] = useState(false)
  const micPublication = avatar.room?.localParticipant.getTrackPublication(Track.Source.Microphone)
  const hasPublishedMic = Boolean(micPublication?.track)
  const isMicMuted = micPublication?.isMuted ?? false
  // First click publishes the microphone track; subsequent clicks toggle mute.
  const toggleMicrophone = async () => {
    if (pending || !avatar.isConnected) return
    setPending(true)
    try {
      if (!hasPublishedMic) {
        await avatar.startPublishingMicrophone()
      } else if (isMicMuted) {
        await micPublication?.unmute()
      } else {
        await micPublication?.mute()
      }
    } finally {
      setPending(false)
    }
  }
  // Disconnects the avatar session; onExit runs even if disconnect throws,
  // returning the UI to the connect screen.
  const disconnect = async () => {
    if (pending) return
    setPending(true)
    try {
      await avatar.disconnect()
    } finally {
      setPending(false)
      onExit()
    }
  }
  return (
    <div className="flex w-full max-w-[760px] flex-col gap-3">
      <SpatialRealAvatarFrame className="overflow-hidden">
        <SpatialRealAvatarCanvas minHeight={420} />
        <SpatialRealAvatarLoading />
        <SpatialRealAvatarError />
      </SpatialRealAvatarFrame>
      <div className="flex flex-wrap items-center gap-2">
        <Button disabled={!avatar.isConnected || pending} onClick={() => void toggleMicrophone()} type="button">
          {!hasPublishedMic ? 'Enable Mic' : isMicMuted ? 'Unmute Mic' : 'Mute Mic'}
        </Button>
        <Button disabled={pending} onClick={() => void disconnect()} type="button" variant="outline">
          Disconnect
        </Button>
        <SpatialRealAvatarStatus />
        {/* Status line: error takes priority, then connected mic state. */}
        <span className="text-sm">
          {avatar.error
            ? avatar.error.message
            : avatar.isConnected
            ? !hasPublishedMic
              ? 'Connected. Mic is off, enable mic to talk.'
              : isMicMuted
              ? 'Connected. Mic is muted.'
              : 'Connected. Mic is on, start speaking.'
            : 'Connecting...'}
        </span>
      </div>
    </div>
  )
}
// Root page: fetches a LiveKit token from the backend, then mounts the
// avatar provider/panel with the returned connection details.
export default function App() {
  const appId = import.meta.env.VITE_SPATIALREAL_APP_ID
  const avatarId = import.meta.env.VITE_SPATIALREAL_AVATAR_ID
  // Falls back to the Vite proxy path when no explicit endpoint is set.
  const tokenEndpoint = import.meta.env.VITE_TOKEN_ENDPOINT || '/token'
  const roomName = import.meta.env.VITE_ROOM_NAME || 'voice-agent-room'
  const [connection, setConnection] = useState<AvatarConnection | null>(null)
  const [connecting, setConnecting] = useState(false)
  const [status, setStatus] = useState('Click Connect to start')
  // POSTs to the token endpoint and stores the connection details;
  // the provider below connects once `connection` is non-null.
  const requestConnection = async () => {
    if (connecting || connection) return
    if (!appId || !avatarId) {
      setStatus('Missing VITE_SPATIALREAL_APP_ID or VITE_SPATIALREAL_AVATAR_ID in .env')
      return
    }
    setConnecting(true)
    setStatus('Requesting token...')
    try {
      const response = await fetch(tokenEndpoint, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ room: roomName }),
      })
      if (!response.ok) throw new Error('Failed to fetch token')
      const payload = (await response.json()) as TokenResponse
      if (!payload.url || !payload.token || !payload.room) {
        throw new Error('Token response is missing url, token, or room')
      }
      setConnection({
        url: payload.url,
        token: payload.token,
        roomName: payload.room,
      })
      setStatus('Connecting avatar...')
    } catch (error) {
      setStatus(error instanceof Error ? error.message : 'Failed to request token')
    } finally {
      setConnecting(false)
    }
  }
  // Hard stop when required env vars are absent (nothing below can work).
  if (!appId || !avatarId) {
    return <div className="p-4">Missing required environment variables. Check `.env`.</div>
  }
  return (
    <div className="flex min-h-screen items-center justify-center p-4">
      {connection ? (
        <SpatialRealAvatarProvider
          appId={appId}
          avatarId={avatarId}
          connection={connection}
          onConnected={() => setStatus('Connected. Click Enable Mic to talk.')}
          onDisconnected={() => setStatus('Disconnected')}
          onAvatarError={(error) => setStatus(error.message)}
        >
          <AvatarPanel
            onExit={() => {
              setConnection(null)
              setStatus('Disconnected')
            }}
          />
        </SpatialRealAvatarProvider>
      ) : (
        <div className="flex w-full max-w-[560px] flex-col gap-2.5">
          <Button disabled={connecting} onClick={() => void requestConnection()} type="button">
            {connecting ? 'Connecting...' : 'Connect'}
          </Button>
          <span className="text-sm">{status}</span>
        </div>
      )}
    </div>
  )
}
Understand runtime sequence
Use this sequence to trace issues across backend, agent, and frontend.
Copy
frontend -> /token -> token_server creates JWT + dispatches agent
agent worker joins room -> starts Gemini session + AvatarSession
frontend connects with token -> avatar renders -> mic publish/unpublish drives conversation

