"use client"; import { Button } from "@/components/ui/button"; import { Spinner } from "@/components/ui/spinner"; import { cn } from "@/lib/utils"; import { MicIcon, SquareIcon } from "lucide-react"; import type { ComponentProps } from "react"; import { useCallback, useEffect, useRef, useState } from "react"; interface SpeechRecognition extends EventTarget { continuous: boolean; interimResults: boolean; lang: string; start(): void; stop(): void; onstart: ((this: SpeechRecognition, ev: Event) => void) | null; onend: ((this: SpeechRecognition, ev: Event) => void) | null; onresult: | ((this: SpeechRecognition, ev: SpeechRecognitionEvent) => void) | null; onerror: | ((this: SpeechRecognition, ev: SpeechRecognitionErrorEvent) => void) | null; } interface SpeechRecognitionEvent extends Event { results: SpeechRecognitionResultList; resultIndex: number; } interface SpeechRecognitionResultList { readonly length: number; item(index: number): SpeechRecognitionResult; [index: number]: SpeechRecognitionResult; } interface SpeechRecognitionResult { readonly length: number; item(index: number): SpeechRecognitionAlternative; [index: number]: SpeechRecognitionAlternative; isFinal: boolean; } interface SpeechRecognitionAlternative { transcript: string; confidence: number; } interface SpeechRecognitionErrorEvent extends Event { error: string; } declare global { interface Window { SpeechRecognition: new () => SpeechRecognition; webkitSpeechRecognition: new () => SpeechRecognition; } } type SpeechInputMode = "speech-recognition" | "media-recorder" | "none"; export type SpeechInputProps = ComponentProps & { onTranscriptionChange?: (text: string) => void; /** * Callback for when audio is recorded using MediaRecorder fallback. * This is called in browsers that don't support the Web Speech API (Firefox, Safari). * The callback receives an audio Blob that should be sent to a transcription service. * Return the transcribed text, which will be passed to onTranscriptionChange. */ onAudioRecorded?: (audioBlob: Blob) => Promise; lang?: string; }; const detectSpeechInputMode = (): SpeechInputMode => { if (typeof window === "undefined") { return "none"; } if ("SpeechRecognition" in window || "webkitSpeechRecognition" in window) { return "speech-recognition"; } if ("MediaRecorder" in window && "mediaDevices" in navigator) { return "media-recorder"; } return "none"; }; export const SpeechInput = ({ className, onTranscriptionChange, onAudioRecorded, lang = "en-US", ...props }: SpeechInputProps) => { const [isListening, setIsListening] = useState(false); const [isProcessing, setIsProcessing] = useState(false); const [mode] = useState(detectSpeechInputMode); const [isRecognitionReady, setIsRecognitionReady] = useState(false); const recognitionRef = useRef(null); const mediaRecorderRef = useRef(null); const streamRef = useRef(null); const audioChunksRef = useRef([]); const onTranscriptionChangeRef = useRef< SpeechInputProps["onTranscriptionChange"] >(onTranscriptionChange); const onAudioRecordedRef = useRef(onAudioRecorded); // Keep refs in sync onTranscriptionChangeRef.current = onTranscriptionChange; onAudioRecordedRef.current = onAudioRecorded; // Initialize Speech Recognition when mode is speech-recognition useEffect(() => { if (mode !== "speech-recognition") { return; } const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition; const speechRecognition = new SpeechRecognition(); speechRecognition.continuous = true; speechRecognition.interimResults = true; speechRecognition.lang = lang; const handleStart = () => { setIsListening(true); }; const handleEnd = () => { setIsListening(false); }; const handleResult = (event: Event) => { const speechEvent = event as SpeechRecognitionEvent; let finalTranscript = ""; for ( let i = speechEvent.resultIndex; i < speechEvent.results.length; i += 1 ) { const result = speechEvent.results[i]; if (result.isFinal) { finalTranscript += result[0]?.transcript ?? ""; } } if (finalTranscript) { onTranscriptionChangeRef.current?.(finalTranscript); } }; const handleError = () => { setIsListening(false); }; speechRecognition.addEventListener("start", handleStart); speechRecognition.addEventListener("end", handleEnd); speechRecognition.addEventListener("result", handleResult); speechRecognition.addEventListener("error", handleError); recognitionRef.current = speechRecognition; setIsRecognitionReady(true); return () => { speechRecognition.removeEventListener("start", handleStart); speechRecognition.removeEventListener("end", handleEnd); speechRecognition.removeEventListener("result", handleResult); speechRecognition.removeEventListener("error", handleError); speechRecognition.stop(); recognitionRef.current = null; setIsRecognitionReady(false); }; }, [mode, lang]); // Cleanup MediaRecorder and stream on unmount useEffect( () => () => { if (mediaRecorderRef.current?.state === "recording") { mediaRecorderRef.current.stop(); } if (streamRef.current) { for (const track of streamRef.current.getTracks()) { track.stop(); } } }, [] ); // Start MediaRecorder recording const startMediaRecorder = useCallback(async () => { if (!onAudioRecordedRef.current) { return; } try { const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); streamRef.current = stream; const mediaRecorder = new MediaRecorder(stream); audioChunksRef.current = []; const handleDataAvailable = (event: BlobEvent) => { if (event.data.size > 0) { audioChunksRef.current.push(event.data); } }; const handleStop = async () => { for (const track of stream.getTracks()) { track.stop(); } streamRef.current = null; const audioBlob = new Blob(audioChunksRef.current, { type: "audio/webm", }); if (audioBlob.size > 0 && onAudioRecordedRef.current) { setIsProcessing(true); try { const transcript = await onAudioRecordedRef.current(audioBlob); if (transcript) { onTranscriptionChangeRef.current?.(transcript); } } catch { // Error handling delegated to the onAudioRecorded caller } finally { setIsProcessing(false); } } }; const handleError = () => { setIsListening(false); for (const track of stream.getTracks()) { track.stop(); } streamRef.current = null; }; mediaRecorder.addEventListener("dataavailable", handleDataAvailable); mediaRecorder.addEventListener("stop", handleStop); mediaRecorder.addEventListener("error", handleError); mediaRecorderRef.current = mediaRecorder; mediaRecorder.start(); setIsListening(true); } catch { setIsListening(false); } }, []); // Stop MediaRecorder recording const stopMediaRecorder = useCallback(() => { if (mediaRecorderRef.current?.state === "recording") { mediaRecorderRef.current.stop(); } setIsListening(false); }, []); const toggleListening = useCallback(() => { if (mode === "speech-recognition" && recognitionRef.current) { if (isListening) { recognitionRef.current.stop(); } else { recognitionRef.current.start(); } } else if (mode === "media-recorder") { if (isListening) { stopMediaRecorder(); } else { startMediaRecorder(); } } }, [mode, isListening, startMediaRecorder, stopMediaRecorder]); // Determine if button should be disabled const isDisabled = mode === "none" || (mode === "speech-recognition" && !isRecognitionReady) || (mode === "media-recorder" && !onAudioRecorded) || isProcessing; return (
{/* Animated pulse rings */} {isListening && [0, 1, 2].map((index) => (
))} {/* Main record button */}
); };