This article is a deep dive into how episodes for the Bedtime stories podcast are generated. Specifically, how the speech is generated and how it is composed with background audio. To learn more about the podcast, check this overview article - Bedtime stories generated by AI.
For the speech generation, I use the TensorFlowTTS library and the pre-trained models. Unfortunately, this library provides only one voice but hopefully in the future there will be more voices available.
%%capture
%%bash
pip install pydub
pip install git+https://github.com/TensorSpeech/TensorFlowTTS.git
pip install git+https://github.com/repodiac/german_transliterate.git#egg=german_transliterate
import os
import re
import numpy as np
from pydub import AudioSegment
from pydub.playback import play
import soundfile
import subprocess
import tempfile
import IPython.display as ipd
from tqdm import tqdm
import tensorflow as tf
from tensorflow_tts.inference import TFAutoModel
from tensorflow_tts.inference import AutoConfig
from tensorflow_tts.inference import AutoProcessor
Second, we need to download the speech models Tacotron 2 and Melgan which were trained on the LJ Speech Dataset.
# Pre-trained Tacotron 2 checkpoint; passed to text2speech as the text-to-mel model.
tacotron2 = TFAutoModel.from_pretrained("tensorspeech/tts-tacotron2-ljspeech-en", name="tacotron2")
# Pre-trained MelGAN checkpoint; passed to text2speech as the vocoder.
melgan = TFAutoModel.from_pretrained("tensorspeech/tts-melgan-ljspeech-en", name="melgan")
# Text processor loaded from the same repository as the Tacotron 2 checkpoint;
# text2speech uses it to turn text into input ids.
processor = AutoProcessor.from_pretrained("tensorspeech/tts-tacotron2-ljspeech-en")
def text2speech(input_text, text2mel_model, vocoder_model):
    """Synthesize speech for ``input_text``.

    The text is converted to ids with the module-level ``processor``, fed to
    ``text2mel_model.inference`` to obtain mel outputs, and the mel outputs
    are passed to ``vocoder_model`` to obtain the waveform.

    Args:
        input_text: Text to synthesize.
        text2mel_model: Model exposing ``inference(...)`` (Tacotron 2 here).
        vocoder_model: Callable mapping mel outputs to audio (MelGAN here).

    Returns:
        A tuple ``(mel_outputs, alignment_history, audio)`` of NumPy arrays.
    """
    token_ids = processor.text_to_sequence(input_text)

    # text2mel part: a batch holding one sequence, plus that sequence's length.
    batched_ids = tf.expand_dims(tf.convert_to_tensor(token_ids, dtype=tf.int32), 0)
    id_lengths = tf.convert_to_tensor([len(token_ids)], tf.int32)
    # NOTE(review): third positional argument, presumably the speaker id
    # (always 0 here) -- confirm against the TensorFlowTTS inference signature.
    speaker_ids = tf.convert_to_tensor([0], dtype=tf.int32)
    inference_outputs = text2mel_model.inference(batched_ids, id_lengths, speaker_ids)
    _, mel_outputs, _, alignment_history = inference_outputs

    # vocoder part: keep the first batch item and the first channel.
    waveform = vocoder_model(mel_outputs)[0, :, 0]

    return mel_outputs.numpy(), alignment_history.numpy(), waveform.numpy()
Because I could not perform speech synthesis on large text, I needed a helper function that will chunk a large text into smaller chunks.
def split_into_chunks(text: str, max_length):
    """Split ``text`` into chunks of about ``max_length`` characters.

    Newlines are replaced by spaces and the text is split into sentences at
    whitespace that follows a period and precedes an uppercase letter.
    Sentences are then packed greedily into chunks. A sentence is never cut,
    so a single sentence longer than ``max_length`` becomes its own
    (oversized) chunk.

    Args:
        text: The text to split.
        max_length: Target maximum number of characters per chunk. Each
            sentence is counted with one extra character for the joining
            space.

    Returns:
        A list of non-empty chunk strings in original order. The list is
        empty when ``text`` contains nothing but whitespace.
    """
    sentences = [
        sentence
        for sentence in re.split(r"(?<=\.)\s+(?=[A-Z])", text.replace("\n", " "))
        # Blank input yields a single blank "sentence"; drop it so no empty
        # chunk is ever handed to the speech model.
        if sentence.strip()
    ]
    chunks = []
    current_chunk = []
    chunk_length = 0
    for sentence in sentences:
        sentence_length = len(sentence)
        # Flush only when there is something to flush. Without the
        # ``current_chunk`` guard, a sentence too long for an empty chunk
        # would emit an empty string as a chunk.
        if current_chunk and chunk_length + sentence_length + 1 > max_length:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            chunk_length = 0
        current_chunk.append(sentence)
        chunk_length += sentence_length + 1
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
%%bash
# Remove audio files from the current working directory before generating new ones.
# NOTE: this deletes every .mp3 and .wav file in that directory, not only
# files produced by this notebook.
rm -rf *.mp3
rm -rf *.wav
Next, paste the story text into the placeholder variable.
# Story text to narrate; replace the placeholder line below with the real story.
story = """
replace with actual story
"""
Then, create chunks from the story text and place those chunks between an introductory text and a closing text.
# Episode number; interpolated into the spoken introduction below.
number = 1
# Spoken introduction. Unlike `end`, this text is not stripped, so it keeps
# its leading and trailing newlines.
begining = f"""
Welcome to Episode {number} of the Bedtime short stories podcast, the AI generated podcast with short stories to help you sleep.
I am Ex Machina, and I will be narrating your story tonight.
"""
# Spoken sign-off.
# NOTE(review): "See next time." presumably should read "See you next time."
# -- confirm with the author before changing the narration text.
end = f"""
I hope you didn't make it so far and you are already asleep. If not then I hope you have enjoyed this short story. See next time.
""".strip()
# Split the story using max_length=1000 characters per chunk.
chunks = split_into_chunks(story, 1000)
# Narration order: introduction, story chunks, sign-off.
chunks = [begining] + chunks + [end]
# setup window for tacotron2 if you want to try
# NOTE(review): presumably this constrains the attention window used during
# inference -- confirm in the TensorFlowTTS documentation.
tacotron2.setup_window(win_front=10, win_back=10)
Now we can generate the speech for every chunk and save it in a separate WAV file
# Sample rate (Hz) passed to soundfile when writing the WAV files.
sr = 22050
# File names of the generated WAV chunks, in narration order.
chunk_names = []
# enumerate() has no length, so tqdm is given the total explicitly.
for index, chunk in tqdm(enumerate(chunks), total=len(chunks)):
    # Only the waveform (`audios`) is used in this loop; the mel outputs and
    # the alignment history are not used here.
    mels, alignment_history, audios = text2speech(chunk, tacotron2, melgan)
    chunk_name = f'voice_{number}_part_{index}.wav'
    # One WAV file per chunk, written with the 'PCM_24' subtype.
    soundfile.write(chunk_name, audios, sr, 'PCM_24')
    chunk_names.append(chunk_name)
First, we generate a WAV file with 3 seconds of silence
# Generate the silence at the same sample rate as the voice chunks (`sr`).
# AudioSegment.silent() defaults to 11025 Hz, while concatenate_tracks ignores
# the sample rate stored in each file and writes everything at `sr` (22050 Hz).
# Silence generated at the default rate would therefore last 1.5 s, not 3 s.
silence_segment = AudioSegment.silent(duration=3000, frame_rate=sr)
# The trailing semicolon keeps the notebook from echoing export()'s return value.
silence_segment.export('silence.wav', format="wav");
Second, we place the silence audio with the rest of the episode audio
# chunk_names[0] is the introduction and chunk_names[-1] is the sign-off, so
# `first` and `last` mark the boundaries of the story chunks in between.
first = 1
last = len(chunk_names) - 1
# Resulting order: introduction, silence, story chunks, silence, sign-off.
# NOTE: chunk_names is rebuilt from itself, so running this cell twice
# inserts the silence a second time.
chunk_names = chunk_names[:first] + ['silence.wav'] + chunk_names[first: last] + ['silence.wav'] + chunk_names[last:]
Then we concatenate different audio chunks to generate the speech file of the episode
def concatenate_tracks(chunk_names, output):
    """Concatenate multiple audio tracks into one.

    Reads every file in ``chunk_names`` in order, joins the samples end to
    end, and writes the result to ``output`` with the 'PCM_24' subtype at the
    module-level sample rate ``sr``.

    The sample rate stored in each input file is ignored, so every input is
    expected to already be sampled at ``sr``; otherwise it will play back at
    the wrong speed and duration.

    Args:
        chunk_names: Paths of the audio files to join, in playback order.
        output: Path of the audio file to write.
    """
    # Seed with an empty array so an empty ``chunk_names`` still writes an
    # (empty) output instead of failing in np.concatenate.
    tracks = [np.array([])]
    for chunk_name in tqdm(chunk_names):
        audio, _ = soundfile.read(chunk_name)
        tracks.append(audio)
    # Concatenate once at the end rather than inside the loop, which would
    # re-copy all previously accumulated audio on every iteration.
    audios = np.concatenate(tracks)
    soundfile.write(output, audios, sr, 'PCM_24')
def wav2mp3(input, output):
    """Convert the WAV file at ``input`` into an MP3 file at ``output``."""
    AudioSegment.from_wav(input).export(output, format="mp3")
# Join all files listed in chunk_names, in order, into a single voice track.
concatenate_tracks(chunk_names, 'voice.wav')
For convenience when adding the background later, I convert the WAV audio file into the MP3 format.
# Write an MP3 copy of the voice track; voice.wav itself is left in place.
wav2mp3('voice.wav', 'voice.mp3')
Background sound
To make the episode more interesting, I add a background sound that matches the theme of the episode. I use freesound.org, which is a great resource for royalty-free audio. For instance, some interesting audio clips: Ocean waves, rain with thunder.
First, download the audio that best matches the episode theme
# Download the background clip to base_background.mp3 (-s: silent mode, -o: output file).
!curl -s -o base_background.mp3 https://freesound.org/data/previews/237/237729_3839718-lq.mp3
# Decode each input once and reuse it.
voice_duration = AudioSegment.from_wav('voice.wav').duration_seconds
base_background = AudioSegment.from_mp3("base_background.mp3")
background_duration = base_background.duration_seconds
print(f"Voice duration is {voice_duration} seconds vs base background in {background_duration} seconds.")
# add_background_track pads the voice with 7 s of silence before it and 8 s
# after it, so the looped background must cover the voice plus that padding.
# Without the padding in the calculation, a base clip shorter than the padding
# could leave the background shorter than the padded voice track.
padding_seconds = 7 + 8
repeats = int((voice_duration + padding_seconds) / background_duration) + 1
# Multiplying an AudioSegment by an integer repeats it that many times in one
# step, instead of growing the track with repeated `+` concatenations.
background = base_background * repeats
background.export("background.mp3", format="mp3")
# From here on background_duration refers to the looped track.
background_duration = background.duration_seconds
print(f"Voice duration is {voice_duration} seconds vs background in {background_duration} seconds.")
def add_background_track(episode_file, background_file, output):
    """Mix a voice track with a background track into ``output`` using ffmpeg.

    The episode is padded with 7 s of silence before and 8 s of silence after
    it. The background is cut to the padded episode's length, faded in over
    3 s and out over 5 s, and lowered by 10 dB. Both tracks are written to
    temporary MP3 files and merged by ffmpeg (``amerge`` followed by
    ``acompressor``) into a 2-channel MP3.

    Args:
        episode_file: Path of the MP3 file holding the narration.
        background_file: Path of the MP3 file holding the background sound;
            it should be at least as long as the padded episode.
        output: Path of the MP3 file to create. ffmpeg is run with ``-y``, so
            an existing file is overwritten.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    # mkstemp() returns an already-open OS-level file descriptor together with
    # the path. Only the path is needed, so close the descriptor right away
    # instead of leaking it.
    bg_fd, tempbg = tempfile.mkstemp()
    os.close(bg_fd)
    episode_fd, tempepisode = tempfile.mkstemp()
    os.close(episode_fd)
    try:
        episode = AudioSegment.from_mp3(episode_file)
        background = AudioSegment.from_mp3(background_file)
        padded_episode = AudioSegment.silent(duration=7000) + episode + AudioSegment.silent(duration=8000)
        padded_episode.export(tempepisode, format='mp3')
        # Slice positions are in milliseconds.
        cut_bg = background[: padded_episode.duration_seconds * 1000].fade_in(3000).fade_out(5000)
        # Lower the background track volume.
        lower_volume_cut_bg = cut_bg - 10
        lower_volume_cut_bg.export(tempbg, format='mp3')
        subprocess.run(
            [
                "ffmpeg",
                "-y",
                "-i",
                tempbg,
                "-i",
                tempepisode,
                "-filter_complex",
                "amerge,acompressor=threshold=-21dB:ratio=12:attack=100:release=500",
                "-ac",
                "2",
                "-c:a",
                "libmp3lame",
                "-q:a",
                "4",
                output,
            ],
            # Fail loudly if ffmpeg could not produce the output, rather than
            # silently continuing without an episode file.
            check=True,
        )
    finally:
        # Remove the temporary files even when an export or ffmpeg fails.
        os.unlink(tempbg)
        os.unlink(tempepisode)
Finally, add the background to the voice file and display the final result.
# Mix the narration and the looped background into the final episode file.
add_background_track('voice.mp3', 'background.mp3', 'episode.mp3')
# Last expression of the cell, so the notebook shows the loaded episode as the
# cell output (presumably as an audio player -- confirm in your environment).
AudioSegment.from_mp3('episode.mp3')
That's all folks
You can give the podcast a try; all episodes are published here: https://anchor.fm/exmachina
I would love to hear any feedback, suggestions or ideas for improvement. So feel free to leave a comment or reach out on Twitter @bachiirc
