Spaces:
Sleeping
Sleeping
| import os | |
| import torch | |
| from huggingface_hub import hf_hub_download | |
| from InferenceInterfaces.ToucanTTSInterface import ToucanTTSInterface | |
| from Modules.ControllabilityGAN.GAN import GanWrapper | |
class ControllableInterface:
    """Front end that couples a ToucanTTS model with a speaker-embedding GAN.

    The speaker identity used for synthesis comes either from a reference
    audio handed to ``read`` or from a latent stored in the GAN wrapper.

    Attributes set in ``__init__``:
        device: ``"cpu"`` when ``gpu_id == "cpu"``, otherwise ``"cuda"``.
        model: the ``ToucanTTSInterface`` instance used for synthesis.
        wgan: the ``GanWrapper`` instance that provides artificial voices.
        generated_speaker_embeds, available_artificial_voices,
        current_language, current_accent: bookkeeping fields; they are only
            initialised here and not read by the methods of this class.
    """

    # Latent index of the single artificial voice used when no seed is given.
    _DEFAULT_VOICE_INDEX = 7
    # Latent indices whose renditions are averaged when a voice seed is given.
    # NOTE(review): assumes the GAN wrapper caches at least 8 voices - confirm
    # against GanWrapper when available_artificial_voices is set below 8.
    _ENSEMBLE_VOICE_INDICES = range(3, 8)
    # Length of the (all-zero) controllability vector handed to the GAN.
    _NUM_CONTROLLABILITY_DIMS = 6

    def __init__(self,
                 gpu_id="cpu",
                 available_artificial_voices=50,
                 tts_model_path=None,
                 vocoder_model_path=None,
                 embedding_gan_path=None):
        """Select the compute device and build the TTS model and GAN wrapper.

        Args:
            gpu_id: ``"cpu"``, ``"cuda"``, or a device identifier (hopefully a
                number) that is written to ``CUDA_VISIBLE_DEVICES``.
            available_artificial_voices: forwarded to ``GanWrapper`` as
                ``num_cached_voices`` and stored on the instance.
            tts_model_path: forwarded to ``ToucanTTSInterface``.
            vocoder_model_path: forwarded to ``ToucanTTSInterface``.
            embedding_gan_path: checkpoint for the embedding GAN; when ``None``
                the file ``embedding_gan.pt`` is fetched from the
                ``Flux9665/ToucanTTS`` hub repository.
        """
        # The environment variables must be set before any CUDA-using object
        # is created below, so this stays the first step.
        if gpu_id == "cpu":
            os.environ["CUDA_VISIBLE_DEVICES"] = ""
        elif gpu_id != "cuda":  # in this case we hopefully got a number.
            os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
            os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id}"

        if embedding_gan_path is None:
            embedding_gan_path = hf_hub_download(repo_id="Flux9665/ToucanTTS", filename="embedding_gan.pt")

        self.device = "cuda" if gpu_id != "cpu" else "cpu"
        self.model = ToucanTTSInterface(device=self.device,
                                        tts_model_path=tts_model_path,
                                        vocoder_model_path=vocoder_model_path)
        self.wgan = GanWrapper(embedding_gan_path,
                               num_cached_voices=available_artificial_voices,
                               device=self.device)
        self.generated_speaker_embeds = list()
        self.available_artificial_voices = available_artificial_voices
        self.current_language = ""
        self.current_accent = ""

    def _use_artificial_voice(self, latent_index):
        """Make the GAN voice at ``latent_index`` the model's utterance embedding."""
        self.wgan.set_latent(latent_index)
        # All controllability dimensions are left at zero, i.e. unmodified.
        neutral_controls = torch.tensor([0.0] * self._NUM_CONTROLLABILITY_DIMS, dtype=torch.float32)
        embedding = self.wgan.modify_embed(neutral_controls)
        self.model.set_utterance_embedding(embedding=embedding)

    def _synthesize(self, prompt, prosody_creativity, loudness_in_db, **prosody_overrides):
        """Call the TTS model with the fixed scaling settings used by ``read``.

        ``prosody_overrides`` is passed through untouched; it is used to hand
        ``pitch`` / ``energy`` / ``durations`` to the model when they should
        be reused from an earlier call. Returns whatever the model returns
        (unpacked by callers as ``wav, sr, pitch, energy, durations``).
        """
        return self.model(prompt,
                          input_is_phones=True,
                          duration_scaling_factor=1.0,
                          pitch_variance_scale=1.0,
                          energy_variance_scale=1.0,
                          pause_duration_scaling_factor=1.0,
                          return_plot_as_filepath=False,
                          prosody_creativity=prosody_creativity,
                          loudness_in_db=loudness_in_db,
                          **prosody_overrides)

    def _read_with_voice_ensemble(self, prompt, prosody_creativity, loudness_in_db):
        """Synthesize the prompt with several artificial voices and average them.

        The first voice is synthesized freely; the pitch, energy and durations
        returned by each call are fed into the next one (with a leading axis
        added via ``unsqueeze(0)``).
        NOTE(review): the averaging presumably relies on the shared durations
        making all waveforms the same length - confirm against the model.
        """
        wavs = []
        sr = None
        pitch = energy = durations = None
        for latent_index in self._ENSEMBLE_VOICE_INDICES:
            self._use_artificial_voice(latent_index)
            wav, sr, pitch, energy, durations = self._synthesize(
                prompt,
                prosody_creativity,
                loudness_in_db,
                pitch=pitch.unsqueeze(0) if pitch is not None else pitch,
                energy=energy.unsqueeze(0) if energy is not None else energy,
                durations=durations.unsqueeze(0) if durations is not None else durations,
            )
            wavs.append(wav)
        return sr, sum(wavs) / len(wavs)

    def read(self,
             prompt,
             reference_audio,
             voice_seed,
             prosody_creativity,
             loudness_in_db
             ):
        """Synthesize ``prompt`` and return ``(sr, wav)`` as produced by the model.

        Args:
            prompt: text to speak; it is printed and passed to the model with
                ``input_is_phones=True`` (presumably a phoneme string - TODO
                confirm against ToucanTTSInterface).
            reference_audio: when not ``None`` it is handed to the model as the
                source of the utterance embedding and takes precedence over
                ``voice_seed``.
            voice_seed: only its truthiness is used. Falsy selects the single
                default artificial voice; truthy (without reference audio)
                selects the averaged multi-voice rendition.
            prosody_creativity: forwarded to the model unchanged.
            loudness_in_db: forwarded to the model unchanged.

        Returns:
            Tuple ``(sr, wav)`` taken from the model's return values.
        """
        print(prompt + "\n\n")

        if reference_audio is None and voice_seed:
            return self._read_with_voice_ensemble(prompt, prosody_creativity, loudness_in_db)

        if reference_audio is None:
            self._use_artificial_voice(self._DEFAULT_VOICE_INDEX)
        else:
            self.model.set_utterance_embedding(reference_audio)

        wav, sr, _pitch, _energy, _durations = self._synthesize(prompt, prosody_creativity, loudness_in_db)
        return sr, wav