# Style-Bert-Vits2

# I. Introduction

The Style-Bert-Vits2 open-source project ships as a Windows-based all-in-one package that performs model training and inference through an interactive front end, which makes it inconvenient to run on Linux. This repository therefore splits the code apart so the individual steps can be scripted.

The data-processing part is complete, covering audio slicing, ASR labeling, audio resampling, text-to-phoneme conversion, text feature extraction, and speaker style feature extraction.

Not yet done: model fine-tuning and speaker emotion fine-tuning.

# II. Basic Fine-Tuning Workflow

## 1. Splitting the Raw Audio

The model constrains the length of training samples, so recordings that are too long must first be split into short clips. The author uses litagin02/silero-vad (a voice activity detection model, VAD) to locate the parts of the audio signal that contain speech, filter out silence and background noise, and then split the audio. Use the `split_wav` function in `slice.py`; its parameters are:

* vad_model: the loaded VAD model
* utils: the helper functions that ship with the VAD model
* audio_file: path of the audio file
* target_dir: directory where the split clips are saved
* min_sec: minimum clip length
* max_sec: maximum clip length
* min_silence_dur_ms: minimum silence duration
* time_suffix: whether to append a timestamp to the file name

```python
import torch
from slice import *
from pathlib import Path

# Load the silero-vad model via torch.hub (ONNX version)
vad_model, utils = torch.hub.load(
    repo_or_dir="litagin02/silero-vad",
    model="silero_vad",
    onnx=True,
    trust_repo=True,
)

file = Path("./Data/sxc/xxx.mp3")
input_dir = Path("./Data/sxc/")
output_dir = Path("./Data/sxc/raw")
min_sec = 1
max_sec = 3
min_silence_dur_ms = 700
time_suffix = False

rel_path = file.relative_to(input_dir)
time_sec, count = split_wav(
    vad_model=vad_model,
    utils=utils,
    audio_file=file,
    target_dir=output_dir / rel_path.parent,
    min_sec=min_sec,
    max_sec=max_sec,
    min_silence_dur_ms=min_silence_dur_ms,
    time_suffix=time_suffix,
)
```

## 2. ASR Labeling

Alibaba's ASR model transcribes each clip to text: initialize the model with the `AutoModel` class from the funasr library, then call `generate` to produce the text. The ASR loop lives in the `transcribe_files` function, whose parameters are:

* audio_files: list of audio file paths
* input_dir: directory containing the audio files
* output_file: path of the output label file
* model_name: model name (not the ASR model's name, but the name you use to identify your fine-tuned model)
* initial_prompt: prompt text that is stripped from the beginning of the ASR output
* language: language of the audio
* device: device used for inference

```python
import os
from pathlib import Path

from funasr import AutoModel


def transcribe_files(
    audio_files: list[Path],
    input_dir: Path,
    output_file: Path,
    model_name: str,
    initial_prompt: str,
    language: str = "zh",
    device: str = "cuda",
) -> list[str]:
    if language == "zh":
        language_id = "ZH"
    else:
        raise ValueError(f"Unsupported language: {language}")
    results = []
    for input_file in audio_files:
        # `model` is the module-level AutoModel created below
        text: str = model.generate(input=str(input_file))[0]["text"]
        if text.startswith(f" {initial_prompt}"):
            text = text[len(f" {initial_prompt}") :]
        with open(output_file, "a", encoding="utf-8") as f:
            wav_rel_path = input_file.relative_to(input_dir)
            f.write(f"{wav_rel_path}|{model_name}|{language_id}|{text}\n")
        results.append(text)
    return results


# The ASR / VAD / punctuation models must be downloaded beforehand;
# fall back to the ModelScope model IDs if the local copies are missing.
root = "/data/gaojinpeng/03_CQU_Project/05_TTS/GPT-SoVITS-main/"
path_asr = root + "tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
path_vad = root + "tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch"
path_punc = root + "tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
path_asr = path_asr if os.path.exists(path_asr) else "iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
path_vad = path_vad if os.path.exists(path_vad) else "iic/speech_fsmn_vad_zh-cn-16k-common-pytorch"
path_punc = path_punc if os.path.exists(path_punc) else "iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"

model = AutoModel(
    model=path_asr,
    model_revision="v2.0.4",
    vad_model=path_vad,
    vad_model_revision="v2.0.4",
    punc_model=path_punc,
    punc_model_revision="v2.0.4",
)

input_dir = Path("./Data/sxc/raw/")
output_file = Path("./Data/sxc/esd.list")
wav_files = [f for f in input_dir.rglob("*.wav") if f.is_file()]
transcribe_files(wav_files, input_dir, output_file, "sxc", initial_prompt=" ")
```
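Every line written by `transcribe_files` has the format `wav_relative_path|model_name|language_id|text`, and the preprocessing steps below assume exactly those four fields. Here is a minimal sanity check for the label file (my own addition, not part of the repo; the paths reuse the example paths above):

```python
from pathlib import Path

# Hypothetical sanity check: each esd.list line should have exactly four
# "|"-separated fields: wav_relative_path|model_name|language_id|text
label_file = Path("./Data/sxc/esd.list")
wav_root = Path("./Data/sxc/raw")

with label_file.open("r", encoding="utf-8") as f:
    for i, line in enumerate(f, start=1):
        fields = line.strip().split("|")
        if len(fields) != 4:
            print(f"line {i}: expected 4 fields, got {len(fields)}")
            continue
        wav_rel, model_name, language_id, text = fields
        if not (wav_root / wav_rel).exists():
            print(f"line {i}: missing audio file {wav_rel}")
        if not text:
            print(f"line {i}: empty transcription")
```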
## 3. Data Preprocessing

### 3.1 Audio Resampling

The raw audio is resampled to 44100 Hz so that every clip has the uniform sample rate the model expects; `librosa.load` performs the resampling. The implementation lives in the `resample` function, whose parameters are:

* files: list of audio file paths
* input_dir: input directory
* output_dir: output directory
* target_sr: target sample rate
* normalize: whether to apply loudness normalization
* trim: whether to trim silence from the beginning and end of the signal

```python
import logging
from pathlib import Path

import librosa
import pyloudnorm as pyln
import soundfile

logger = logging.getLogger(__name__)

DEFAULT_BLOCK_SIZE: float = 0.400  # seconds


class BlockSizeException(Exception):
    pass


# Loudness normalization to -23 LUFS
def normalize_audio(data, sr: int):
    meter = pyln.Meter(sr, block_size=DEFAULT_BLOCK_SIZE)  # create BS.1770 meter
    try:
        loudness = meter.integrated_loudness(data)
    except ValueError as e:
        raise BlockSizeException(e)
    data = pyln.normalize.loudness(data, loudness, -23.0)
    return data


# Resample every file to target_sr (44100 Hz here)
def resample(
    files: list[Path],
    input_dir: Path,
    output_dir: Path,
    target_sr: int,
    normalize: bool,
    trim: bool,
):
    """
    Load each file, convert it to a target_sr wav file, and save it under
    output_dir, preserving its path relative to input_dir.
    """
    for file in files:
        try:
            # librosa can read mp3, ogg, flac, etc. in addition to wav
            wav, sr = librosa.load(file, sr=target_sr)
            if normalize:
                try:
                    wav = normalize_audio(wav, sr)
                except BlockSizeException:
                    logger.info(
                        f"Skip normalize due to less than {DEFAULT_BLOCK_SIZE} second audio: {file}"
                    )
            if trim:
                wav, _ = librosa.effects.trim(wav, top_db=30)
            relative_path = file.relative_to(input_dir)
            # The extension is replaced with .wav even if it was not .wav
            output_path = output_dir / relative_path.with_suffix(".wav")
            output_path.parent.mkdir(parents=True, exist_ok=True)
            soundfile.write(output_path, wav, sr)
        except Exception as e:
            logger.warning(f"Cannot load file, so skipping: {file}, {e}")


input_dir = Path("./Data/sxc/raw/")
output_dir = Path("./Data/sxc/wavs")
wav_files = [f for f in input_dir.rglob("*.wav") if f.is_file()]
# The resampled files are saved under the wavs folder
resample(wav_files, input_dir, output_dir, 44100, False, False)
```

### 3.2 Text to Phonemes

The `clean_text` function from `nlp.py` in the `style_bert_vits2` folder converts text into phonemes. The driver is the `process_line` function, with parameters:

* line: one line of the ASR label file, in the format audio_file|model_name|language|text, e.g. `Data/Erwin/wavs/Erwin-0.wav|Erwin|ZH|如果这个作战顺利,你也许`
* transcription_path: path where the ASR label file is saved
* correct_path: whether to convert the bare audio file name into a path under the wavs folder
* use_jp_extra: whether to use the JP-Extra (extra Japanese) variant
* yomi_error: how to handle Japanese reading errors ("use" keeps the possibly wrong reading; anything else raises)

```python
from pathlib import Path

from style_bert_vits2.nlp import clean_text


def process_line(
    line: str,
    transcription_path: Path,
    correct_path: bool,
    use_jp_extra: bool,
    yomi_error: str,
):
    splitted_line = line.strip().split("|")
    if len(splitted_line) != 4:
        raise ValueError(f"Invalid line format: {line.strip()}")
    utt, spk, language, text = splitted_line
    norm_text, phones, tones, word2ph = clean_text(
        text=text,
        language=language,  # type: ignore
        use_jp_extra=use_jp_extra,
        raise_yomi_error=(yomi_error != "use"),
    )
    if correct_path:
        utt = str(transcription_path.parent / "wavs" / utt)

    return "{}|{}|{}|{}|{}|{}|{}\n".format(
        utt,
        spk,
        language,
        norm_text,
        " ".join(phones),
        " ".join([str(i) for i in tones]),
        " ".join([str(i) for i in word2ph]),
    )


transcription_path = Path("./Data/sxc/esd.list")
processed_data = []
with transcription_path.open("r", encoding="utf-8") as trans_file:
    for line in trans_file.readlines():
        line = line.strip()
        processed_line = process_line(line, transcription_path, True, False, "raise")
        processed_data.append(processed_line)

cleaned_path = transcription_path.with_name(transcription_path.name + ".cleaned")
with cleaned_path.open("w", encoding="utf-8") as out_file:
    for line in processed_data:
        out_file.write(line)
```
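Both feature-extraction steps below consume the `.cleaned` file, so its shape matters: seven `|`-separated fields, where phones, tones, and word2ph must stay aligned (the BERT step later asserts `bert.shape[-1] == len(phone)`). A small consistency check, my own addition rather than part of the repo:

```python
from pathlib import Path

# Hypothetical consistency check for esd.list.cleaned. Each line should have
# 7 fields; phones and tones should have the same length; and word2ph should
# (for this pipeline) sum to the number of phones, since it maps each
# normalized-text character to its phoneme count.
cleaned_path = Path("./Data/sxc/esd.list.cleaned")

with cleaned_path.open("r", encoding="utf-8") as f:
    for i, line in enumerate(f, start=1):
        fields = line.strip().split("|")
        assert len(fields) == 7, f"line {i}: expected 7 fields"
        utt, spk, language, norm_text, phones, tones, word2ph = fields
        n_phones = len(phones.split(" "))
        n_tones = len(tones.split(" "))
        w2p = [int(x) for x in word2ph.split(" ")]
        assert n_phones == n_tones, f"line {i}: phones/tones length mismatch"
        assert sum(w2p) == n_phones, f"line {i}: word2ph does not cover all phones"
```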
### 3.3 Extracting Text Features

For Chinese text, BERT features are extracted with chinese-roberta-wwm-ext-large; the models for other languages are defined in `style_bert_vits2/constants.py`. The BERT models must be placed in the `Bert` folder.

The implementation is in `process_line_bert`, which takes a `(line, add_blank)` tuple:

* line: the cleaned label line to process
* add_blank: whether to insert a blank token between phonemes

```python
from pathlib import Path

import torch

from style_bert_vits2.models import commons
from style_bert_vits2.nlp import cleaned_text_to_sequence, extract_bert_feature


def process_line_bert(x: tuple[str, bool]):
    line, add_blank = x
    # device = "cuda:0"
    device = "cpu"
    wav_path, _, language_str, text, phones, tone, word2ph = line.strip().split("|")
    phone = phones.split(" ")
    tone = [int(i) for i in tone.split(" ")]
    word2ph = [int(i) for i in word2ph.split(" ")]
    # Map phoneme / tone / language symbols to integer ID sequences
    phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)

    # Optionally insert a blank token (0) between every pair of symbols
    if add_blank:
        phone = commons.intersperse(phone, 0)
        tone = commons.intersperse(tone, 0)
        language = commons.intersperse(language, 0)
        for i in range(len(word2ph)):
            word2ph[i] = word2ph[i] * 2
        word2ph[0] += 1

    bert_path = wav_path.replace(".WAV", ".wav").replace(".wav", ".bert.pt")
    try:
        # Reuse a cached feature if it already exists and matches
        bert = torch.load(bert_path)
        assert bert.shape[-1] == len(phone)
    except Exception:
        bert = extract_bert_feature(text, word2ph, language_str, device)
        assert bert.shape[-1] == len(phone)
        torch.save(bert, bert_path)


cleaned_file = Path("./Data/sxc/esd.list.cleaned")
add_blank = True
lines = []
with open(cleaned_file, encoding="utf-8") as f:
    for line in f.readlines():
        lines.append(line.strip())
for line in lines:
    process_line_bert((line, add_blank))
```

### 3.4 Extracting Speaker Style Features

The author uses pyannote/wespeaker-voxceleb-resnet34-LM (a speaker-recognition model based on the ResNet34 architecture) to extract speaker style features. Each feature vector is saved next to its audio file with an `.npy` suffix. The implementation:

```python
import logging
from pathlib import Path

import numpy as np
import torch
from pyannote.audio import Inference, Model

logger = logging.getLogger(__name__)


class NaNValueError(ValueError):
    """Custom exception raised when a NaN value is found."""


def save_style_vector(wav_path: str):
    try:
        style_vec = inference(wav_path)
    except Exception as e:
        logger.error(f"Error occurred with file: {wav_path}, Details:\n{e}\n")
        raise
    # NaN values in the vector would be harmful downstream, so check for them
    if np.isnan(style_vec).any():
        logger.warning(f"NaN value found in style vector: {wav_path}")
        raise NaNValueError(f"NaN value found in style vector: {wav_path}")
    np.save(f"{wav_path}.npy", style_vec)  # `test.wav` -> `test.wav.npy`


def process_line_style(line: str):
    wav_path = line.split("|")[0]
    try:
        save_style_vector(wav_path)
        return line, None
    except NaNValueError:
        return line, "nan_error"


# Initialize the speaker-embedding model
model = Model.from_pretrained("pyannote/wespeaker-voxceleb-resnet34-LM")
inference = Inference(model, window="whole")
device = torch.device("cpu")
inference.to(device)

cleaned_file = Path("./Data/sxc/esd.list.cleaned")
lines = []
with open(cleaned_file, encoding="utf-8") as f:
    for line in f.readlines():
        lines.append(line.strip())
for line in lines:
    process_line_style(line)
```

## 4. Model Fine-Tuning (in progress...)
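Although the fine-tuning code has not been ported yet, one preparatory step can already be run on the outputs above: in the upstream Style-Bert-VITS2 project, the per-file style vectors from section 3.4 are aggregated, and their mean serves as the default "Neutral" style. A minimal sketch of that aggregation, assuming the `.npy` files from section 3.4 exist; the output file name `style_vectors.npy` is an assumption for illustration:

```python
from pathlib import Path

import numpy as np

# Collect every per-file style vector written in section 3.4
wav_dir = Path("./Data/sxc/wavs")
npy_files = sorted(wav_dir.rglob("*.wav.npy"))

# Stack into an (N, dim) matrix; every vector should have the same dimension
style_vecs = np.stack([np.load(f) for f in npy_files])

# The mean vector acts as the default "Neutral" style in the upstream project;
# the output file name and (1, dim) layout here are assumptions
mean_vec = style_vecs.mean(axis=0)
np.save("./Data/sxc/style_vectors.npy", np.stack([mean_vec]))
print(f"{len(npy_files)} vectors, dim {style_vecs.shape[1]}")
```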