I'm trying to modify application for conversational agent from this github. I want to implement interruption of text-to-speech model by user.The idea is following:There is the class TextToSpeech
with method speak
that uses subprocess to play audio translated from text chunk by chunk. Befor playing each chunk of audio it's checking if interruption
which is asyncio.Event
is set, and if it's True
than function terminates subprocess and returns:
class TextToSpeech: # Set your Deepgram API Key and desired voice model DG_API_KEY = os.getenv("DEEPGRAM_API_KEY") MODEL_NAME = "aura-helios-en" # Example model name, change as needed @staticmethod def is_installed(lib_name: str) -> bool: lib = shutil.which(lib_name) return lib is not None def speak(self, text): if not self.is_installed("ffplay"): raise ValueError("ffplay not found, necessary to stream audio.") DEEPGRAM_URL = f"https://api.deepgram.com/v1/speak?model={self.MODEL_NAME}&performance=some&encoding=linear16&sample_rate=24000" headers = {"Authorization": f"Token {self.DG_API_KEY}","Content-Type": "application/json" } payload = {"text": text } player_command = ["ffplay", "-autoexit", "-", "-nodisp"] player_process = subprocess.Popen( player_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, ) start_time = time.time() # Record the time before sending the request first_byte_time = None # Initialize a variable to store the time when the first byte is received with requests.post(DEEPGRAM_URL, stream=True, headers=headers, json=payload) as r: for chunk in r.iter_content(chunk_size=1024): if interruption.is_set(): player_process.kill() return if chunk: if first_byte_time is None: # Check if this is the first chunk received first_byte_time = time.time() # Record the time when the first byte is received ttfb = int((first_byte_time - start_time)*1000) # Calculate the time to first byte print(f"TTS Time to First Byte (TTFB): {ttfb}ms\n") player_process.stdin.write(chunk) player_process.stdin.flush() if player_process.stdin: player_process.stdin.close() player_process.wait()
Async function get_transcript
performs speech-to-text transcription. It contains function on_message
to check if received audio is user's speech and sets the interruption
Event in these cases:
async def get_transcript(callback): try: # example of setting up a client config. logging values: WARNING, VERBOSE, DEBUG, SPAM config = DeepgramClientOptions(options={"keepalive": "true"}) deepgram: DeepgramClient = DeepgramClient("", config) dg_connection = deepgram.listen.asynclive.v("1") print ("Listening...") async def on_message(self, result, **kwargs): sentence = result.channel.alternatives[0].transcript if not result.speech_final: transcript_collector.add_part(sentence) interruption.set() transcription_complete.clear() else: # This is the final part of the current sentence transcript_collector.add_part(sentence) full_sentence = transcript_collector.get_full_transcript() # Check if the full_sentence is not empty before printing if len(full_sentence.strip()) > 0: full_sentence = full_sentence.strip() print(f"Human: {full_sentence}") callback(full_sentence) # Call the callback with the full_sentence transcript_collector.reset() transcription_complete.set() # Signal to stop transcription and exit dg_connection.on(LiveTranscriptionEvents.Transcript, on_message) options = LiveOptions( model="nova-2", punctuate=True, language="en-US", encoding="linear16", channels=1, sample_rate=16000, endpointing=300, smart_format=True, ) await dg_connection.start(options) # Open a microphone stream on the default input device microphone = Microphone(dg_connection.send) microphone.start() # start indefinite loop to listen continuously while not conversation_over.is_set(): await asyncio.sleep(0.01) # Wait for the microphone to close microphone.finish() # Indicate that we've finished await dg_connection.finish() except Exception as e: print(f"Could not open socket: {e}") return
I need to run this function in indefinite loop in the background, simultaneously with TextToSpeech().speak()
to continuously listen to user and catch interruptions.This is the class with main async function:
class ConversationManager: def __init__(self): self.transcription_response = "" self.llm = LanguageModelProcessor() async def main(self): def handle_full_sentence(full_sentence): self.transcription_response = full_sentence asyncio.create_task(get_transcript(handle_full_sentence)) # Loop indefinitely until "goodbye" is detected while not conversation_over.is_set(): await transcription_complete.wait() interruption.clear() if not transcription_complete.is_set(): continue # Check for "goodbye" to exit the loop if "goodbye" in self.transcription_response.lower(): conversation_over.set() transcription_complete.clear() llm_response = self.llm.process(self.transcription_response) tts = TextToSpeech() tts.speak(llm_response) # Reset transcription_response for the next loop iteration self.transcription_response = ""
It runs with the following code:
if __name__ == "__main__": manager = ConversationManager() asyncio.run(manager.main())
I'm creating Events outside of all functions this way:
transcription_complete = asyncio.Event() # Event to signal transcription completionconversation_over = asyncio.Event() # Event to signal end of conversationinterruption = asyncio.Event() # Event to signal interruption
As I understand task asyncio.create_task(get_transcript(handle_full_sentence))
runs a function get_transcript
every time await
appears. It's recording and transcribing my voice in the background, because if I'm talking while TextToSpeech().speak()
then after speak()
returns I can see all of my transcribed phrases in the console. But the problem is that it's not interrupting the TextToSpeech().speak()
.