Source code for strauss.tts_caption

"""The :obj:`tts_caption` submodule: tool for generating spoken captions

This uses text-to-speech via the ``TTS`` module to allow captions
represented as strings to be converted to spoken audio to precede the
sonification.
"""

from scipy.io import wavfile
from scipy.interpolate import interp1d
import numpy as np
import strauss.utilities as utils
import re
import ffmpeg as ff
import os
import warnings
from pathlib import Path

# Default voice selection for the active TTS backend. This placeholder is
# overwritten further down once a backend has been imported successfully,
# and stays None when no backend is available.
default_tts_voice = None


[docs]
class NoTTSAPI(Exception):
    """Raised when no API key is found for the coqui-TTS module."""

# Select the text-to-speech backend once, at import time. coqui-TTS is
# preferred; pyttsx3 is the fallback; if neither can be imported a stub
# ``TTS`` callable is installed that raises as soon as it is used.
try:
    from TTS.api import TTS
    ttsMode = 'coqui-tts'
    default_tts_voice = Path('tts_models', 'en', 'jenny', 'jenny')
    supported_voices = utils.get_supported_coqui_voices()
except (OSError, ModuleNotFoundError, NoTTSAPI):
    try:
        import pyttsx3
        ttsMode = 'pyttsx3'

        class TTS:
            """Minimal stand-in for the coqui ``TTS`` class under pyttsx3.

            It accepts and ignores any constructor arguments, and only
            provides ``list_models`` so voices can be listed the same way
            in both modes.
            """

            def __init__(self, *args, **kwargs):
                pass

            # A static method so that both ``TTS.list_models()`` and
            # ``TTS().list_models()`` work.
            @staticmethod
            def list_models():
                """Print and return the available pyttsx3 voices."""
                return getVoices(True)

        warnings.warn("Default TTS module coqui not found, using pyttsx3 instead. Note this is platform \n"
                      "dependent and can be problematic for linux-based systems (using the espeak engine)")
        default_tts_voice = {}  # i.e. system default TTS (if exists)
    except (OSError, ModuleNotFoundError):
        # The string 'None' (not the None object) marks "no backend".
        ttsMode = 'None'

        def TTS(*args, **kwargs):
            """Stub that reports missing text-to-speech support when called.

            Raises:
              TTSIsNotSupported: always.
            """
            raise TTSIsNotSupported("strauss has not been installed with text-to-speech support. \n"
                  "This is not installed by default, due to some specific module requirements of the TTS module.\n"
                  "Reinstalling strauss with 'pip install strauss[AI-TTS]' will give you access to this function\n"
                  "If you run into issues with the TTS package, you can also install pyttsx3. Currently the most\n"
                  "compatible version is not published on PyPI, but you can install from the test repo with \n"
                  "'pip install --no-cache-dir --extra-index-url https://test.pypi.org/simple/ pyttsx3==2.99'")


[docs]
class TTSIsNotSupported(Exception):
    """Raised when text-to-speech is requested but no supported
    text-to-speech package could be imported."""


def get_ttsMode():
    '''Report which text-to-speech backend was selected at import time.

    Returns:
      ttsMode (:obj:`str`): ``'coqui-tts'``, ``'pyttsx3'``, or the string
      ``'None'`` when no backend could be imported.
    '''
    return ttsMode


[docs]
def getVoices(info=False):
    '''Get available voices for text-to-speech.

    When ``info`` is true, this also prints out the properties of each
    voice option, preceded by its index in the returned collection.

    Args:
      info (:obj:`bool`): Print out voice information when True,
        by default False

    Returns:
      voices (:obj:`list`): List of ``pyttsx3.voice.Voice`` objects
      or ``dict`` objects. When no text-to-speech backend is available
      this is the single placeholder entry ``{"voices": "None"}``.
    '''
    # ``getter`` turns one voice entry into a mapping of its properties:
    # pyttsx3 voices are objects (read via ``vars``), the others are
    # converted with ``dict``.
    if ttsMode == 'pyttsx3':
        engine = pyttsx3.init()
        voices = engine.getProperty('voices')
        getter = vars
    elif ttsMode == 'coqui-tts':
        voices = supported_voices
        getter = dict
    else:
        voices = [{"voices": "None"}]
        getter = dict

    if info:
        print('Text-to-speech voice options')
        for ind, voice in enumerate(voices):
            print('\nVoice index:', ind)
            for key, value in getter(voice).items():
                print(f'{key}: {value}')
    return voices



[docs]
def render_caption(caption, samprate, model, caption_path):
    '''Generate a spoken audio caption from text and write it as a wav file.

    The caption is rendered with whichever text-to-speech backend was
    selected at import time. The written file is then read back; if its
    sample rate differs from ``samprate`` it is resampled to the requested
    rate and the file is re-written.

    If Coqui-AI is installed, text from user input is converted with text-to-
    speech software from Coqui-AI - https://pypi.org/project/TTS/ .
    You can view publicly available voice models with 'TTS.list_models()'

    If Coqui-AI is not installed but pyttsx3 (https://pypi.org/project/pyttsx3/)
    is installed, text from user input is converted offline using pyttsx3.

    Note:
    STRAUSS checks if Coqui-AI is available. If it is, ``ttsMode`` is set to
    ``coqui-tts``. If it is unavailable, STRAUSS checks whether pyttsx3 is
    available. If it is, ``ttsMode`` is set to ``pyttsx3``.

    Args:
      caption (:obj:`str`): script to be spoken by the TTS voice
      samprate (:obj:`int`): samples per second
      model (:obj:`str` for Coqui-AI; :obj:`dict` for pyttsx3): for Coqui-AI:
        valid name of TTS voice from the underlying TTS module; for pyttsx3:
        dictionary with keys of 'rate' (percent of speed, signed int16),
        'volume' (float from 0 to 1), and/or 'voice' (the voice 'id' that can
        be chosen from the list given by the TTS.list_models() function).
      caption_path (:obj:`str`): filepath for spoken caption output

    Raises:
      TTSIsNotSupported: if neither Coqui-AI nor pyttsx3 is available.
    '''

    # TODO: allow uniform indexing and/or language querying approaches for more consistency between tts modes...
    if ttsMode == 'coqui-tts':

        # TODO: do this better with logging. We can filter TTS function output, e.g. alert to downloading models...
        print('Rendering caption (this can take a while if the caption is long, or if the TTS model needs downloading)...')

        # capture stdout from the talkative TTS module
        with utils.Capturing():
            # Load in the tts model
            tts = TTS(str(model), progress_bar=False, gpu=False)

            # render to speech, and write as a wav file
            tts.tts_to_file(text=caption, file_path=caption_path)

    elif ttsMode == 'pyttsx3':

        # Setup voice model for pyttsx3
        engine = pyttsx3.init()

        # Apply only the properties present in the model dictionary; any
        # that are absent keep the engine defaults.
        for key in ('rate', 'volume', 'voice'):
            if key in model:
                engine.setProperty(key, model[key])

        engine.save_to_file(caption, caption_path, name='caption')
        engine.runAndWait()

    else:
        # No backend: calling the dummy TTS raises the explanatory error.
        TTS()

    # Read the file back in to check the sample rate
    try:
        # Try to read in directly...
        rate_in, wavobj = wavfile.read(caption_path)
    except Exception:
        # ...but pyttsx3 TTS can produce audio files incompatible
        # with scipy - convert to standard WAV using ffmpeg.
        # splitext strips only the final extension, so dots elsewhere in
        # the path (e.g. './out/caption.wav') are preserved.
        cpre = os.path.splitext(caption_path)[0] + '_pre.wav'
        # os.replace overwrites an intermediate file left by an earlier
        # run on every platform.
        os.replace(caption_path, cpre)
        ff.input(cpre).output(caption_path).run(quiet=1)
        rate_in, wavobj = wavfile.read(caption_path)

    # If it doesn't match the required rate, resample and re-write
    if rate_in != samprate:
        new_wavobj = utils.resample(rate_in, samprate, wavobj)
        wavfile.write(caption_path, samprate, new_wavobj)