# Style-Bert-Vits2

# I. Introduction

The Style-Bert-Vits2 open-source project ships as a Windows-based all-in-one package that performs model training and inference through an interactive front end, which makes it inconvenient to run on Linux. This repository therefore splits the code apart so the individual steps can be scripted.

The data-processing part is complete, covering audio slicing, ASR labeling, audio resampling, text-to-phoneme conversion, text feature extraction, and speaker style feature extraction.

Not yet done: model fine-tuning and speaker emotion fine-tuning.

# II. Basic Fine-Tuning Workflow

## 1. Splitting the Raw Audio

The model constrains the length of training samples, so recordings that are too long must first be split into short clips. The author uses litagin02/silero-vad (a voice activity detection model, VAD) to locate the parts of the audio signal that contain speech, filter out silence and background noise, and then split the audio. Use the `split_wav` function in `slice.py`; its parameters are:

* vad_model: the loaded VAD model
* utils: the helper functions that ship with the VAD model
* audio_file: path of the audio file
* target_dir: directory where the split clips are saved
* min_sec: minimum clip length
* max_sec: maximum clip length
* min_silence_dur_ms: minimum silence duration
* time_suffix: whether to append a timestamp to the file name

```python
import torch
from slice import *
from pathlib import Path

# Load the silero-vad model via torch.hub (ONNX version)
vad_model, utils = torch.hub.load(
    repo_or_dir="litagin02/silero-vad",
    model="silero_vad",
    onnx=True,
    trust_repo=True,
)

file = Path("./Data/sxc/xxx.mp3")
input_dir = Path("./Data/sxc/")
output_dir = Path("./Data/sxc/raw")
min_sec = 1
max_sec = 3
min_silence_dur_ms = 700
time_suffix = False

rel_path = file.relative_to(input_dir)
time_sec, count = split_wav(
    vad_model=vad_model,
    utils=utils,
    audio_file=file,
    target_dir=output_dir / rel_path.parent,
    min_sec=min_sec,
    max_sec=max_sec,
    min_silence_dur_ms=min_silence_dur_ms,
    time_suffix=time_suffix,
)
```

## 2. ASR Labeling

Alibaba's ASR model transcribes each clip to text: initialize the model with the `AutoModel` class from the funasr library, then call `generate` to produce the text. The ASR loop lives in the `transcribe_files` function, whose parameters are:

* audio_files: list of audio file paths
* input_dir: directory containing the audio files
* output_file: path of the output label file
* model_name: model name (not the ASR model's name, but the name you use to identify your fine-tuned model)
* initial_prompt: prompt text that is stripped from the beginning of the ASR output
* language: language of the audio
* device: device used for inference

```python
import os
from pathlib import Path

from funasr import AutoModel


def transcribe_files(
    audio_files: list[Path],
    input_dir: Path,
    output_file: Path,
    model_name: str,
    initial_prompt: str,
    language: str = "zh",
    device: str = "cuda",
) -> list[str]:
    if language == "zh":
        language_id = "ZH"
    else:
        raise ValueError(f"Unsupported language: {language}")
    results = []
    for input_file in audio_files:
        # `model` is the module-level AutoModel created below
        text: str = model.generate(input=str(input_file))[0]["text"]
        if text.startswith(f" {initial_prompt}"):
            text = text[len(f" {initial_prompt}") :]
        with open(output_file, "a", encoding="utf-8") as f:
            wav_rel_path = input_file.relative_to(input_dir)
            f.write(f"{wav_rel_path}|{model_name}|{language_id}|{text}\n")
        results.append(text)
    return results


# The ASR / VAD / punctuation models must be downloaded beforehand;
# fall back to the ModelScope model IDs if the local copies are missing.
root = "/data/gaojinpeng/03_CQU_Project/05_TTS/GPT-SoVITS-main/"
path_asr = root + "tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
path_vad = root + "tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch"
path_punc = root + "tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
path_asr = path_asr if os.path.exists(path_asr) else "iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
path_vad = path_vad if os.path.exists(path_vad) else "iic/speech_fsmn_vad_zh-cn-16k-common-pytorch"
path_punc = path_punc if os.path.exists(path_punc) else "iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"

model = AutoModel(
    model=path_asr,
    model_revision="v2.0.4",
    vad_model=path_vad,
    vad_model_revision="v2.0.4",
    punc_model=path_punc,
    punc_model_revision="v2.0.4",
)

input_dir = Path("./Data/sxc/raw/")
output_file = Path("./Data/sxc/esd.list")
wav_files = [f for f in input_dir.rglob("*.wav") if f.is_file()]
transcribe_files(wav_files, input_dir, output_file, "sxc", initial_prompt=" ")
```
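Every line written by `transcribe_files` has the format `wav_relative_path|model_name|language_id|text`, and the preprocessing steps below assume exactly those four fields. Here is a minimal sanity check for the label file (my own addition, not part of the repo; the paths reuse the example paths above):

```python
from pathlib import Path

# Hypothetical sanity check: each esd.list line should have exactly four
# "|"-separated fields: wav_relative_path|model_name|language_id|text
label_file = Path("./Data/sxc/esd.list")
wav_root = Path("./Data/sxc/raw")

with label_file.open("r", encoding="utf-8") as f:
    for i, line in enumerate(f, start=1):
        fields = line.strip().split("|")
        if len(fields) != 4:
            print(f"line {i}: expected 4 fields, got {len(fields)}")
            continue
        wav_rel, model_name, language_id, text = fields
        if not (wav_root / wav_rel).exists():
            print(f"line {i}: missing audio file {wav_rel}")
        if not text:
            print(f"line {i}: empty transcription")
```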
## 3. Data Preprocessing

### 3.1 Audio Resampling

The raw audio is resampled to 44100 Hz so that every clip has the uniform sample rate the model expects; `librosa.load` performs the resampling. The implementation lives in the `resample` function, whose parameters are:

* files: list of audio file paths
* input_dir: input directory
* output_dir: output directory
* target_sr: target sample rate
* normalize: whether to apply loudness normalization
* trim: whether to trim silence from the beginning and end of the signal

```python
import logging
from pathlib import Path

import librosa
import pyloudnorm as pyln
import soundfile

logger = logging.getLogger(__name__)

DEFAULT_BLOCK_SIZE: float = 0.400  # seconds


class BlockSizeException(Exception):
    pass


# Loudness normalization to -23 LUFS
def normalize_audio(data, sr: int):
    meter = pyln.Meter(sr, block_size=DEFAULT_BLOCK_SIZE)  # create BS.1770 meter
    try:
        loudness = meter.integrated_loudness(data)
    except ValueError as e:
        raise BlockSizeException(e)
    data = pyln.normalize.loudness(data, loudness, -23.0)
    return data


# Resample every file to target_sr (44100 Hz here)
def resample(
    files: list[Path],
    input_dir: Path,
    output_dir: Path,
    target_sr: int,
    normalize: bool,
    trim: bool,
):
    """
    Load each file, convert it to a target_sr wav file, and save it under
    output_dir, preserving its path relative to input_dir.
    """
    for file in files:
        try:
            # librosa can read mp3, ogg, flac, etc. in addition to wav
            wav, sr = librosa.load(file, sr=target_sr)
            if normalize:
                try:
                    wav = normalize_audio(wav, sr)
                except BlockSizeException:
                    logger.info(
                        f"Skip normalize due to less than {DEFAULT_BLOCK_SIZE} second audio: {file}"
                    )
            if trim:
                wav, _ = librosa.effects.trim(wav, top_db=30)
            relative_path = file.relative_to(input_dir)
            # The extension is replaced with .wav even if it was not .wav
            output_path = output_dir / relative_path.with_suffix(".wav")
            output_path.parent.mkdir(parents=True, exist_ok=True)
            soundfile.write(output_path, wav, sr)
        except Exception as e:
            logger.warning(f"Cannot load file, so skipping: {file}, {e}")


input_dir = Path("./Data/sxc/raw/")
output_dir = Path("./Data/sxc/wavs")
wav_files = [f for f in input_dir.rglob("*.wav") if f.is_file()]
# The resampled files are saved under the wavs folder
resample(wav_files, input_dir, output_dir, 44100, False, False)
```

### 3.2 Text to Phonemes

The `clean_text` function from `nlp.py` in the `style_bert_vits2` folder converts text into phonemes. The driver is the `process_line` function, with parameters:

* line: one line of the ASR label file, in the format audio_file|model_name|language|text, e.g. `Data/Erwin/wavs/Erwin-0.wav|Erwin|ZH|如果这个作战顺利,你也许`
* transcription_path: path where the ASR label file is saved
* correct_path: whether to convert the bare audio file name into a path under the wavs folder
* use_jp_extra: whether to use the JP-Extra (extra Japanese) variant
* yomi_error: how to handle Japanese reading errors ("use" keeps the possibly wrong reading; anything else raises)

```python
from pathlib import Path

from style_bert_vits2.nlp import clean_text


def process_line(
    line: str,
    transcription_path: Path,
    correct_path: bool,
    use_jp_extra: bool,
    yomi_error: str,
):
    splitted_line = line.strip().split("|")
    if len(splitted_line) != 4:
        raise ValueError(f"Invalid line format: {line.strip()}")
    utt, spk, language, text = splitted_line
    norm_text, phones, tones, word2ph = clean_text(
        text=text,
        language=language,  # type: ignore
        use_jp_extra=use_jp_extra,
        raise_yomi_error=(yomi_error != "use"),
    )
    if correct_path:
        utt = str(transcription_path.parent / "wavs" / utt)

    return "{}|{}|{}|{}|{}|{}|{}\n".format(
        utt,
        spk,
        language,
        norm_text,
        " ".join(phones),
        " ".join([str(i) for i in tones]),
        " ".join([str(i) for i in word2ph]),
    )


transcription_path = Path("./Data/sxc/esd.list")
processed_data = []
with transcription_path.open("r", encoding="utf-8") as trans_file:
    for line in trans_file.readlines():
        line = line.strip()
        processed_line = process_line(line, transcription_path, True, False, "raise")
        processed_data.append(processed_line)

cleaned_path = transcription_path.with_name(transcription_path.name + ".cleaned")
with cleaned_path.open("w", encoding="utf-8") as out_file:
    for line in processed_data:
        out_file.write(line)
```
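Both feature-extraction steps below consume the `.cleaned` file, so its shape matters: seven `|`-separated fields, where phones, tones, and word2ph must stay aligned (the BERT step later asserts `bert.shape[-1] == len(phone)`). A small consistency check, my own addition rather than part of the repo:

```python
from pathlib import Path

# Hypothetical consistency check for esd.list.cleaned. Each line should have
# 7 fields; phones and tones should have the same length; and word2ph should
# (for this pipeline) sum to the number of phones, since it maps each
# normalized-text character to its phoneme count.
cleaned_path = Path("./Data/sxc/esd.list.cleaned")

with cleaned_path.open("r", encoding="utf-8") as f:
    for i, line in enumerate(f, start=1):
        fields = line.strip().split("|")
        assert len(fields) == 7, f"line {i}: expected 7 fields"
        utt, spk, language, norm_text, phones, tones, word2ph = fields
        n_phones = len(phones.split(" "))
        n_tones = len(tones.split(" "))
        w2p = [int(x) for x in word2ph.split(" ")]
        assert n_phones == n_tones, f"line {i}: phones/tones length mismatch"
        assert sum(w2p) == n_phones, f"line {i}: word2ph does not cover all phones"
```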
### 3.3 Extracting Text Features

For Chinese text, BERT features are extracted with chinese-roberta-wwm-ext-large; the models for other languages are defined in `style_bert_vits2/constants.py`. The BERT models must be placed in the `Bert` folder.

The implementation is in `process_line_bert`, which takes a `(line, add_blank)` tuple:

* line: the cleaned label line to process
* add_blank: whether to insert a blank token between phonemes

```python
from pathlib import Path

import torch

from style_bert_vits2.models import commons
from style_bert_vits2.nlp import cleaned_text_to_sequence, extract_bert_feature


def process_line_bert(x: tuple[str, bool]):
    line, add_blank = x
    # device = "cuda:0"
    device = "cpu"
    wav_path, _, language_str, text, phones, tone, word2ph = line.strip().split("|")
    phone = phones.split(" ")
    tone = [int(i) for i in tone.split(" ")]
    word2ph = [int(i) for i in word2ph.split(" ")]
    # Map phoneme / tone / language symbols to integer ID sequences
    phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)

    # Optionally insert a blank token (0) between every pair of symbols
    if add_blank:
        phone = commons.intersperse(phone, 0)
        tone = commons.intersperse(tone, 0)
        language = commons.intersperse(language, 0)
        for i in range(len(word2ph)):
            word2ph[i] = word2ph[i] * 2
        word2ph[0] += 1

    bert_path = wav_path.replace(".WAV", ".wav").replace(".wav", ".bert.pt")
    try:
        # Reuse a cached feature if it already exists and matches
        bert = torch.load(bert_path)
        assert bert.shape[-1] == len(phone)
    except Exception:
        bert = extract_bert_feature(text, word2ph, language_str, device)
        assert bert.shape[-1] == len(phone)
        torch.save(bert, bert_path)


cleaned_file = Path("./Data/sxc/esd.list.cleaned")
add_blank = True
lines = []
with open(cleaned_file, encoding="utf-8") as f:
    for line in f.readlines():
        lines.append(line.strip())
for line in lines:
    process_line_bert((line, add_blank))
```

### 3.4 Extracting Speaker Style Features

The author uses pyannote/wespeaker-voxceleb-resnet34-LM (a speaker-recognition model based on the ResNet34 architecture) to extract speaker style features. Each feature vector is saved next to its audio file with an `.npy` suffix. The implementation:

```python
import logging
from pathlib import Path

import numpy as np
import torch
from pyannote.audio import Inference, Model

logger = logging.getLogger(__name__)


class NaNValueError(ValueError):
    """Custom exception raised when a NaN value is found."""


def save_style_vector(wav_path: str):
    try:
        style_vec = inference(wav_path)
    except Exception as e:
        logger.error(f"Error occurred with file: {wav_path}, Details:\n{e}\n")
        raise
    # NaN values in the vector would be harmful downstream, so check for them
    if np.isnan(style_vec).any():
        logger.warning(f"NaN value found in style vector: {wav_path}")
        raise NaNValueError(f"NaN value found in style vector: {wav_path}")
    np.save(f"{wav_path}.npy", style_vec)  # `test.wav` -> `test.wav.npy`


def process_line_style(line: str):
    wav_path = line.split("|")[0]
    try:
        save_style_vector(wav_path)
        return line, None
    except NaNValueError:
        return line, "nan_error"


# Initialize the speaker-embedding model
model = Model.from_pretrained("pyannote/wespeaker-voxceleb-resnet34-LM")
inference = Inference(model, window="whole")
device = torch.device("cpu")
inference.to(device)

cleaned_file = Path("./Data/sxc/esd.list.cleaned")
lines = []
with open(cleaned_file, encoding="utf-8") as f:
    for line in f.readlines():
        lines.append(line.strip())
for line in lines:
    process_line_style(line)
```

## 4. Model Fine-Tuning (in progress...)
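Although the fine-tuning code has not been ported yet, one preparatory step can already be run on the outputs above: in the upstream Style-Bert-VITS2 project, the per-file style vectors from section 3.4 are aggregated, and their mean serves as the default "Neutral" style. A minimal sketch of that aggregation, assuming the `.npy` files from section 3.4 exist; the output file name `style_vectors.npy` is an assumption for illustration:

```python
from pathlib import Path

import numpy as np

# Collect every per-file style vector written in section 3.4
wav_dir = Path("./Data/sxc/wavs")
npy_files = sorted(wav_dir.rglob("*.wav.npy"))

# Stack into an (N, dim) matrix; every vector should have the same dimension
style_vecs = np.stack([np.load(f) for f in npy_files])

# The mean vector acts as the default "Neutral" style in the upstream project;
# the output file name and (1, dim) layout here are assumptions
mean_vec = style_vecs.mean(axis=0)
np.save("./Data/sxc/style_vectors.npy", np.stack([mean_vec]))
print(f"{len(npy_files)} vectors, dim {style_vecs.shape[1]}")
```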