From b7674f21f45a93a48e7334dd0daa5c2f375767bb Mon Sep 17 00:00:00 2001
From: pu-zhe
Date: Thu, 14 Aug 2025 16:39:47 +0800
Subject: [PATCH 1/6] add cosyvoice2 FAQ

---
 .../built-in/audio/CosyVoice2/README.md       | 47 ++++++++++++++-----
 .../audio/CosyVoice2/requirements.txt         |  2 +-
 2 files changed, 36 insertions(+), 13 deletions(-)

diff --git a/ACL_PyTorch/built-in/audio/CosyVoice2/README.md b/ACL_PyTorch/built-in/audio/CosyVoice2/README.md
index 833da1e741..3c0baf4bda 100644
--- a/ACL_PyTorch/built-in/audio/CosyVoice2/README.md
+++ b/ACL_PyTorch/built-in/audio/CosyVoice2/README.md
@@ -69,18 +69,17 @@ cd ModelZoo-PyTorch/ACL_PyTorch/built-in/audio/CosyVoice/CosyVoice2
 文件目录结构大致如下:
 ```text
-    📁 CosyVoice/
-    ├── 📁 CosyVoice2/
-    |   |── 📁 300I
-    |   |── 📄 diff_CosyVoice_300I.patch
-    |   |── 📄 modeling_qwen2.py
-    |   |── 📁 800I
-    |   |── 📄 diff_CosyVoice_800I.patch
-    |   |── 📄 modeling_qwen2.py
-    |   |── 📁 CosyVoice
-    |   |── 📁 cosyVoice源码文件      # cosyVoice的源码文件,此处不一一列举
-    │   ├── 📁 CosyVoice-0.5B/       # 权重文件
-    │   ├── 📁 transformers/         # transformers库,里面修改modeling_qwen2.py文件
+    📁 CosyVoice2/
+    |── 📁 300I
+    |── 📄 diff_CosyVoice_300I.patch
+    |── 📄 modeling_qwen2.py
+    |── 📁 800I
+    |── 📄 diff_CosyVoice_800I.patch
+    |── 📄 modeling_qwen2.py
+    |── 📁 CosyVoice
+    |── 📁 cosyVoice源码文件      # cosyVoice的源码文件,此处不一一列举
+    ├── 📁 CosyVoice-0.5B/       # 权重文件
+    ├── 📁 transformers/         # transformers库,里面修改modeling_qwen2.py文件
     │── 📄 requirements.txt      # 依赖库
     |── 📄 infer.py              # 推理脚本
     └── 📄 modify_onnx.py        # 模型转换脚本
 ```
@@ -200,3 +199,27 @@
 | cosyvoice |800I A2|0.28|
 | cosyvoice |300I DUO|0.75|
+
+# FAQ
+ 1. 环境安装依赖
+
+ (1)安装requirements.txt中的python库时,提示pynini编译失败:
+ pynini是WeTextProcessing的安装依赖项,编译报错时,需要按照获取源码章节,第2小节,手动编译安装WeTextProcessing。
+
+ (2)如提示未安装tokenizers库或版本冲突,可使用0.15.1版本tokenizers。
+
+ 2. 如在openEuler系统运行模型推理的过程中提示,fatal error: 'cstdint' file not found:
+
+ 确保gcc,g++已安装成功
+ 导入如下环境变量
+ export CPLUS_INCLUDE_PATH=/usr/include/c++/12:/usr/include/c++/12/aarch64-openEuler-linux:$CPLUS_INCLUDE_PATH
+
+ 3. 推理过程需确保ATC转换生成OM文件的过程,和推理过程的CANN版本保持一致。
+
+ 4. ATC转换时,提示Soc version is invalid.
+
+ atc命令--soc_version,需加入Ascend前缀,如--soc_version=Ascend310P3,具体型号以npu-smi info查询结果为准。
+
+ 5. 
运行modify_onnx.py时,如提示ModuleNotFoundError: No module named 'auto_optimizer': + + 需先安装[msit](https://gitee.com/ascend/msit)工具。 diff --git a/ACL_PyTorch/built-in/audio/CosyVoice2/requirements.txt b/ACL_PyTorch/built-in/audio/CosyVoice2/requirements.txt index fb8c778f62..eda3d5f19e 100644 --- a/ACL_PyTorch/built-in/audio/CosyVoice2/requirements.txt +++ b/ACL_PyTorch/built-in/audio/CosyVoice2/requirements.txt @@ -26,7 +26,7 @@ soundfile==0.12.1 tensorboard==2.14.0 torch==2.3.1 torch_npu==2.3.1.post6 -torchaudio==2.4.0 +torchaudio==2.3.1 uvicorn==0.30.0 wget==3.2 fastapi==0.111.0 -- Gitee From cae7b65a615de7f5ede31b4de163c838771bc09d Mon Sep 17 00:00:00 2001 From: pu-zhe Date: Sat, 30 Aug 2025 11:02:30 +0800 Subject: [PATCH 2/6] cosyvoice2 support 313T 800T A2 --- .../CosyVoice2/800I/diff_CosyVoice_800T.patch | 674 ++++++++++++++++++ .../built-in/audio/CosyVoice2/README.md | 10 +- 2 files changed, 680 insertions(+), 4 deletions(-) create mode 100644 ACL_PyTorch/built-in/audio/CosyVoice2/800I/diff_CosyVoice_800T.patch diff --git a/ACL_PyTorch/built-in/audio/CosyVoice2/800I/diff_CosyVoice_800T.patch b/ACL_PyTorch/built-in/audio/CosyVoice2/800I/diff_CosyVoice_800T.patch new file mode 100644 index 0000000000..d53dbf3121 --- /dev/null +++ b/ACL_PyTorch/built-in/audio/CosyVoice2/800I/diff_CosyVoice_800T.patch @@ -0,0 +1,674 @@ +diff --git a/cosyvoice/cli/cosyvoice.py b/cosyvoice/cli/cosyvoice.py +index e2d62e2..0af241c 100644 +--- a/cosyvoice/cli/cosyvoice.py ++++ b/cosyvoice/cli/cosyvoice.py +@@ -13,11 +13,15 @@ + # limitations under the License. + import os + import time ++import platform ++import datetime + from typing import Generator + from tqdm import tqdm + from hyperpyyaml import load_hyperpyyaml + from modelscope import snapshot_download + import torch ++import acl ++from ais_bench.infer.interface import InferSession + from cosyvoice.cli.frontend import CosyVoiceFrontEnd + from cosyvoice.cli.model import CosyVoiceModel, CosyVoice2Model + from cosyvoice.utils.file_utils import logging +@@ -126,7 +130,7 @@ class CosyVoice: + + class CosyVoice2(CosyVoice): + +- def __init__(self, model_dir, load_jit=False, load_trt=False, fp16=False): ++ def __init__(self, model_dir, load_jit=False, load_trt=False, fp16=False, load_om=False): + self.instruct = True if '-Instruct' in model_dir else False + self.model_dir = model_dir + self.fp16 = fp16 +@@ -155,6 +159,18 @@ class CosyVoice2(CosyVoice): + self.model.load_trt('{}/flow.decoder.estimator.{}.mygpu.plan'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'), + '{}/flow.decoder.estimator.fp32.onnx'.format(model_dir), + self.fp16) ++ if load_om: ++ arch = platform.machine() ++ system = platform.system().lower() ++ context, _ = acl.rt.get_context() ++ flow_om = InferSession(0, '{}/flow_{}_{}.om'.format(model_dir, system ,arch)) ++ flow_om_static = InferSession(0, '{}/flow_static.om'.format(model_dir)) ++ speech_om = InferSession(0, '{}/speech_{}_{}.om'.format(model_dir, system ,arch)) ++ _ = acl.rt.set_context(context) ++ self.frontend.speech_om = speech_om ++ self.frontend.flow_om = flow_om ++ self.model.flow.decoder.flow_om_static = flow_om_static ++ self.model.flow.decoder.flow_om = flow_om + del configs + + def inference_instruct(self, *args, **kwargs): +@@ -171,3 +187,19 @@ class CosyVoice2(CosyVoice): + logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len)) + yield model_output + start_time = time.time() ++ ++ def inference_sft_streaming_input(self, tts_text, char_idx, spk_id, user_id, input_end, 
stream=False, speed=1.0, text_frontend=True): ++ for i in [tts_text]: ++ model_input = self.frontend.frontend_sft(i, spk_id) ++ model_input["user_id"] = user_id ++ model_input["input_end"] = input_end ++ model_input['char_idx'] = char_idx ++ ++ start_time = time.time() ++ # print('synthesis text {}'.format(i)) ++ for model_output in self.model.tts_streaming_input(**model_input, stream=stream, speed=speed): ++ speech_len = model_output['tts_speech'].shape[1] / self.sample_rate ++ print("finish 1 chunk inference ", datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')) ++ logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len)) ++ yield model_output ++ start_time = time.time() +diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py +index 6e10f00..25ad767 100644 +--- a/cosyvoice/cli/frontend.py ++++ b/cosyvoice/cli/frontend.py +@@ -71,6 +71,8 @@ class CosyVoiceFrontEnd: + self.zh_tn_model = ZhNormalizer(remove_erhua=False, full_to_half=False, overwrite_cache=True) + self.en_tn_model = EnNormalizer() + self.inflect_parser = inflect.engine() ++ self.speech_om = None ++ self.flow_om = None + + def _extract_text_token(self, text): + if isinstance(text, Generator): +@@ -92,11 +94,16 @@ class CosyVoiceFrontEnd: + def _extract_speech_token(self, speech): + assert speech.shape[1] / 16000 <= 30, 'do not support extract speech token for audio longer than 30s' + feat = whisper.log_mel_spectrogram(speech, n_mels=128) +- speech_token = self.speech_tokenizer_session.run(None, +- {self.speech_tokenizer_session.get_inputs()[0].name: +- feat.detach().cpu().numpy(), +- self.speech_tokenizer_session.get_inputs()[1].name: +- np.array([feat.shape[2]], dtype=np.int32)})[0].flatten().tolist() ++ if torch.npu.is_available() and self.speech_om: ++ feed = [feat.detach().cpu().numpy(), np.array([feat.shape[2]], dtype=np.int32)] ++ speech_token = self.speech_om.infer(feed, mode='dymshape', custom_sizes=[100000000])[0].flatten().tolist() ++ self.flow_om.set_context() ++ else: ++ speech_token = self.speech_tokenizer_session.run(None, ++ {self.speech_tokenizer_session.get_inputs()[0].name: ++ feat.detach().cpu().numpy(), ++ self.speech_tokenizer_session.get_inputs()[1].name: ++ np.array([feat.shape[2]], dtype=np.int32)})[0].flatten().tolist() + speech_token = torch.tensor([speech_token], dtype=torch.int32).to(self.device) + speech_token_len = torch.tensor([speech_token.shape[1]], dtype=torch.int32).to(self.device) + return speech_token, speech_token_len +diff --git a/cosyvoice/cli/model.py b/cosyvoice/cli/model.py +index 9ebf8cb..a8775a1 100644 +--- a/cosyvoice/cli/model.py ++++ b/cosyvoice/cli/model.py +@@ -99,7 +99,7 @@ class CosyVoiceModel: + self.flow.decoder.estimator = self.flow.decoder.estimator_engine.create_execution_context() + + def llm_job(self, text, prompt_text, llm_prompt_speech_token, llm_embedding, uuid): +- with self.llm_context: ++ with self.llm_context(): + if isinstance(text, Generator): + assert isinstance(self, CosyVoice2Model), 'streaming input text is only implemented for CosyVoice2!' 
+ for i in self.llm.inference_bistream(text=text, +@@ -307,13 +307,25 @@ class CosyVoice2Model(CosyVoiceModel): + self.speech_window = np.hamming(2 * self.source_cache_len) + # rtf and decoding related + self.stream_scale_factor = 1 +- self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext() ++ if torch.cuda.is_available(): ++ stream = torch.cuda.Stream(device=self.device) ++ self.llm_context = lambda: torch.cuda.stream(stream) ++ else: ++ self.llm_context = lambda: contextlib.nullcontext() + self.lock = threading.Lock() + # dict used to store session related variable + self.tts_speech_token_dict = {} + self.llm_end_dict = {} + self.hift_cache_dict = {} + ++ # add for support streaming input ++ self.first_chunk_size = 20 ++ self.token_offset_dict = {} ++ self.prompt_text_dict = {} ++ self.prompt_speech_token_dict = {} ++ self.speech_feat_dict = {} ++ self.embedding_dict = {} ++ + def load_jit(self, flow_encoder_model): + flow_encoder = torch.jit.load(flow_encoder_model, map_location=self.device) + self.flow.encoder = flow_encoder +@@ -362,12 +374,17 @@ class CosyVoice2Model(CosyVoiceModel): + with self.lock: + self.tts_speech_token_dict[this_uuid], self.llm_end_dict[this_uuid] = [], False + self.hift_cache_dict[this_uuid] = None +- p = threading.Thread(target=self.llm_job, args=(text, prompt_text, llm_prompt_speech_token, llm_embedding, this_uuid)) +- p.start() + if stream is True: + token_offset = 0 +- while True: +- time.sleep(0.1) ++ # 删除线程操作,串行执行推理,加速首包时延 ++ for i in self.llm.inference(text=text.to(self.device), ++ text_len=torch.tensor([text.shape[1]], dtype=torch.int32).to(self.device), ++ prompt_text=prompt_text.to(self.device), ++ prompt_text_len=torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(self.device), ++ prompt_speech_token=llm_prompt_speech_token.to(self.device), ++ prompt_speech_token_len=torch.tensor([llm_prompt_speech_token.shape[1]], dtype=torch.int32).to(self.device), ++ embedding=llm_embedding.to(self.device)): ++ self.tts_speech_token_dict[this_uuid].append(i) + if len(self.tts_speech_token_dict[this_uuid]) - token_offset >= self.token_hop_len + self.flow.pre_lookahead_len: + this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:token_offset + self.token_hop_len + self.flow.pre_lookahead_len]).unsqueeze(dim=0) + this_tts_speech = self.token2wav(token=this_tts_speech_token, +@@ -379,10 +396,6 @@ class CosyVoice2Model(CosyVoiceModel): + finalize=False) + token_offset += self.token_hop_len + yield {'tts_speech': this_tts_speech.cpu()} +- if self.llm_end_dict[this_uuid] is True and len(self.tts_speech_token_dict[this_uuid]) - token_offset < self.token_hop_len + self.flow.pre_lookahead_len: +- break +- p.join() +- # deal with remain tokens, make sure inference remain token len equals token_hop_len when cache_speech is not None + this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0) + this_tts_speech = self.token2wav(token=this_tts_speech_token, + prompt_token=flow_prompt_speech_token, +@@ -393,6 +406,8 @@ class CosyVoice2Model(CosyVoiceModel): + finalize=True) + yield {'tts_speech': this_tts_speech.cpu()} + else: ++ p = threading.Thread(target=self.llm_job, args=(text, prompt_text, llm_prompt_speech_token, llm_embedding, this_uuid)) ++ p.start() + # deal with all tokens + p.join() + this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0) +@@ -409,3 +424,83 @@ class CosyVoice2Model(CosyVoiceModel): + 
self.tts_speech_token_dict.pop(this_uuid) + self.llm_end_dict.pop(this_uuid) + torch.cuda.empty_cache() ++ ++ def tts_streaming_input(self, text, char_idx, flow_embedding, llm_embedding=torch.zeros(0, 192), ++ prompt_text=torch.zeros(1, 0, dtype=torch.int32), ++ llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), ++ flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), ++ prompt_speech_feat=torch.zeros(1, 0, 80), stream=False, speed=1.0, **kwargs): ++ this_uuid = kwargs.get("user_id", "AscendDefaultUser") ++ if this_uuid not in self.tts_speech_token_dict: ++ self.tts_speech_token_dict[this_uuid], self.llm_end_dict[this_uuid] = [], False ++ self.hift_cache_dict[this_uuid] = None ++ self.token_offset_dict[this_uuid] = 0 ++ ++ self.prompt_text_dict[this_uuid] = prompt_text ++ self.prompt_speech_token_dict[this_uuid] = flow_prompt_speech_token ++ self.speech_feat_dict[this_uuid] = prompt_speech_feat ++ self.embedding_dict[this_uuid] = flow_embedding ++ else: ++ prompt_text = self.prompt_text_dict[this_uuid] ++ llm_prompt_speech_token = self.prompt_speech_token_dict[this_uuid] ++ flow_prompt_speech_token = self.prompt_speech_token_dict[this_uuid] ++ flow_embedding = self.embedding_dict[this_uuid] ++ llm_embedding = self.embedding_dict[this_uuid] ++ prompt_speech_feat = self.speech_feat_dict[this_uuid] ++ ++ for i in self.llm.inference_bistream_streaming_input(text=text, ++ char_idx=torch.tensor([char_idx]).to(self.device), ++ prompt_text=prompt_text.to(self.device), ++ prompt_text_len=torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(self.device), ++ prompt_speech_token=llm_prompt_speech_token.to(self.device), ++ prompt_speech_token_len=torch.tensor([llm_prompt_speech_token.shape[1]], dtype=torch.int32).to(self.device), ++ embedding=llm_embedding.to(self.device), ++ uuid=this_uuid, input_end=kwargs['input_end']): ++ self.tts_speech_token_dict[this_uuid].append(i) ++ ++ assert stream is True, "output must be streaming" ++ ++ while True: ++ is_first_chunk_ready = (self.token_offset_dict[this_uuid] == 0 and len(self.tts_speech_token_dict[this_uuid]) >= self.first_chunk_size + self.flow.pre_lookahead_len) ++ is_next_chunk_ready = (self.token_offset_dict[this_uuid] > 0 and len(self.tts_speech_token_dict[this_uuid]) - self.token_offset_dict[this_uuid] >= self.token_hop_len + self.flow.pre_lookahead_len) ++ if is_first_chunk_ready or is_next_chunk_ready: ++ if self.token_offset_dict[this_uuid] == 0: ++ this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:self.first_chunk_size + self.flow.pre_lookahead_len]).unsqueeze(dim=0) ++ else: ++ this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:self.token_offset_dict[this_uuid] + self.token_hop_len + self.flow.pre_lookahead_len]).unsqueeze(dim=0) # 0-53, 0-103, 0-153... 
++ this_tts_speech = self.token2wav(token=this_tts_speech_token, ++ prompt_token=flow_prompt_speech_token, ++ prompt_feat=prompt_speech_feat, ++ embedding=flow_embedding, ++ uuid=this_uuid, ++ token_offset=self.token_offset_dict[this_uuid], ++ finalize=False) ++ if self.token_offset_dict[this_uuid] == 0: ++ self.token_offset_dict[this_uuid] += self.first_chunk_size ++ else: ++ self.token_offset_dict[this_uuid] += self.token_hop_len ++ yield {'tts_speech': this_tts_speech.cpu()} ++ # 是否需要退出循环(token 不够下一次推理) ++ if len(self.tts_speech_token_dict[this_uuid]) - self.token_offset_dict[this_uuid] < self.token_hop_len + self.flow.pre_lookahead_len: ++ break ++ ++ if kwargs['input_end'] is True: ++ this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0) ++ this_tts_speech = self.token2wav(token=this_tts_speech_token, ++ prompt_token=flow_prompt_speech_token, ++ prompt_feat=prompt_speech_feat, ++ embedding=flow_embedding, ++ uuid=this_uuid, ++ token_offset=self.token_offset_dict[this_uuid], ++ finalize=True) ++ yield {'tts_speech': this_tts_speech.cpu()} ++ ++ self.tts_speech_token_dict.pop(this_uuid) ++ self.llm_end_dict.pop(this_uuid) ++ self.hift_cache_dict.pop(this_uuid) ++ ++ self.token_offset_dict.pop(this_uuid) ++ self.prompt_text_dict.pop(this_uuid) ++ self.prompt_speech_token_dict.pop(this_uuid) ++ self.speech_feat_dict.pop(this_uuid) ++ self.embedding_dict.pop(this_uuid) +diff --git a/cosyvoice/flow/flow_matching.py b/cosyvoice/flow/flow_matching.py +index 6a60f6d..fbe7545 100644 +--- a/cosyvoice/flow/flow_matching.py ++++ b/cosyvoice/flow/flow_matching.py +@@ -14,6 +14,7 @@ + import threading + import torch + import torch.nn.functional as F ++import numpy as np + from matcha.models.components.flow_matching import BASECFM + + +@@ -32,6 +33,8 @@ class ConditionalCFM(BASECFM): + # Just change the architecture of the estimator here + self.estimator = estimator + self.lock = threading.Lock() ++ self.flow_om = None ++ self.flow_om_static = None + + @torch.inference_mode() + def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None, prompt_len=0, flow_cache=torch.zeros(1, 80, 0, 2)): +@@ -105,12 +108,26 @@ class ConditionalCFM(BASECFM): + t_in[:] = t.unsqueeze(0) + spks_in[0] = spks + cond_in[0] = cond +- dphi_dt = self.forward_estimator( +- x_in, mask_in, +- mu_in, t_in, +- spks_in, +- cond_in +- ) ++ # 动态分档推理, 在流式输出中,每次输出的token数目固定,可以采取动态分档模型执行推理 ++ if torch.npu.is_available() and self.flow_om_static and x.size(2)%100==0 and x.size(2)<800: ++ feed_list = [x_in, mask_in, mu_in, t_in, spks_in, cond_in] ++ feed = [i.cpu().detach().numpy().astype(np.float32) for i in feed_list] ++ dphi_dt = self.flow_om_static.infer(feed, mode="dymdims") ++ self.flow_om.set_context() ++ dphi_dt = torch.from_numpy(dphi_dt[0]).npu() ++ # 输出的token数目不固定场景采用动态模型推理 ++ elif torch.npu.is_available() and self.flow_om: ++ feed_list = [x_in, mask_in, mu_in, t_in, spks_in, cond_in] ++ feed = [i.cpu().detach().numpy().astype(np.float32) for i in feed_list] ++ dphi_dt = self.flow_om.infer(feed, mode="dymshape", custom_sizes=10000000) ++ dphi_dt = torch.from_numpy(dphi_dt[0]).npu() ++ else: ++ dphi_dt = self.forward_estimator( ++ x_in, mask_in, ++ mu_in, t_in, ++ spks_in, ++ cond_in ++ ) + dphi_dt, cfg_dphi_dt = torch.split(dphi_dt, [x.size(0), x.size(0)], dim=0) + dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt - self.inference_cfg_rate * cfg_dphi_dt) + x = x + dt * dphi_dt +diff --git a/cosyvoice/hifigan/generator.py b/cosyvoice/hifigan/generator.py +index 
c47bf05..7f3e4ae 100644 +--- a/cosyvoice/hifigan/generator.py ++++ b/cosyvoice/hifigan/generator.py +@@ -23,6 +23,7 @@ import torch.nn.functional as F + from torch.nn import Conv1d + from torch.nn import ConvTranspose1d + from torch.nn.utils import remove_weight_norm ++from torch.nn.utils.parametrize import remove_parametrizations + from torch.nn.utils.parametrizations import weight_norm + from torch.distributions.uniform import Uniform + +@@ -99,8 +100,8 @@ class ResBlock(torch.nn.Module): + + def remove_weight_norm(self): + for idx in range(len(self.convs1)): +- remove_weight_norm(self.convs1[idx]) +- remove_weight_norm(self.convs2[idx]) ++ remove_parametrizations(self.convs1[idx], "weight") ++ remove_parametrizations(self.convs2[idx], "weight") + + + class SineGen(torch.nn.Module): +@@ -319,14 +320,11 @@ class HiFTGenerator(nn.Module): + def remove_weight_norm(self): + print('Removing weight norm...') + for l in self.ups: +- remove_weight_norm(l) ++ remove_parametrizations(l, 'weight') + for l in self.resblocks: + l.remove_weight_norm() +- remove_weight_norm(self.conv_pre) +- remove_weight_norm(self.conv_post) +- self.m_source.remove_weight_norm() +- for l in self.source_downs: +- remove_weight_norm(l) ++ remove_parametrizations(self.conv_pre, 'weight') ++ remove_parametrizations(self.conv_post, 'weight') + for l in self.source_resblocks: + l.remove_weight_norm() + +@@ -346,9 +344,7 @@ class HiFTGenerator(nn.Module): + self.istft_params["n_fft"], window=self.stft_window.to(magnitude.device)) + return inverse_transform + +- def decode(self, x: torch.Tensor, s: torch.Tensor = torch.zeros(1, 1, 0)) -> torch.Tensor: +- s_stft_real, s_stft_imag = self._stft(s.squeeze(1)) +- s_stft = torch.cat([s_stft_real, s_stft_imag], dim=1) ++ def decode(self, x: torch.Tensor, s_stft: torch.Tensor, index: torch.int) -> torch.Tensor: + + x = self.conv_pre(x) + for i in range(self.num_upsamples): +@@ -356,7 +352,7 @@ class HiFTGenerator(nn.Module): + x = self.ups[i](x) + + if i == self.num_upsamples - 1: +- x = self.reflection_pad(x) ++ x = torch.cat((x, x[:,:,-2:-1]), -1) + + # fusion + si = self.source_downs[i](s_stft) +@@ -373,12 +369,10 @@ class HiFTGenerator(nn.Module): + + x = F.leaky_relu(x) + x = self.conv_post(x) +- magnitude = torch.exp(x[:, :self.istft_params["n_fft"] // 2 + 1, :]) +- phase = torch.sin(x[:, self.istft_params["n_fft"] // 2 + 1:, :]) # actually, sin is redundancy ++ magnitude = torch.exp(x[:, :index, :]) ++ phase = torch.sin(x[:, index:, :]) # actually, sin is redundancy + +- x = self._istft(magnitude, phase) +- x = torch.clamp(x, -self.audio_limit, self.audio_limit) +- return x ++ return magnitude, phase + + def forward( + self, +@@ -407,5 +401,12 @@ class HiFTGenerator(nn.Module): + # use cache_source to avoid glitch + if cache_source.shape[2] != 0: + s[:, :, :cache_source.shape[2]] = cache_source +- generated_speech = self.decode(x=speech_feat, s=s) ++ # torchair编译,对decode函数做部分适配 ++ s_stft_real, s_stft_imag = self._stft(s.squeeze(1)) ++ s_stft = torch.cat([s_stft_real, s_stft_imag], dim=1) ++ # 字典取值操作无法被dynamo编译,把decode内部的index拿到外面计算 ++ index = self.istft_params["n_fft"] // 2 + 1 ++ magnitude, phase = self.decode(x=speech_feat, s_stft=s_stft, index=index) ++ x = self._istft(magnitude, phase) ++ generated_speech = torch.clamp(x, -self.audio_limit, self.audio_limit) + return generated_speech, s +diff --git a/cosyvoice/llm/llm.py b/cosyvoice/llm/llm.py +index bbd3305..7eb32ad 100644 +--- a/cosyvoice/llm/llm.py ++++ b/cosyvoice/llm/llm.py +@@ -229,16 +229,17 @@ class 
Qwen2Encoder(torch.nn.Module): + super().__init__() + self.model = Qwen2ForCausalLM.from_pretrained(pretrain_path) + +- def forward_one_step(self, xs, masks, cache=None): +- input_masks = masks[:, -1, :] +- outs = self.model( +- inputs_embeds=xs, +- attention_mask=input_masks, +- output_hidden_states=True, +- return_dict=True, +- use_cache=True, +- past_key_values=cache, +- ) ++ def forward_one_step(self, xs, masks, prompt_length, cache=None): ++ with torch.no_grad(): ++ outs = self.model( ++ inputs_embeds=xs, ++ attention_mask=masks, ++ prompt_length=prompt_length, ++ output_hidden_states=True, ++ return_dict=True, ++ use_cache=True, ++ past_key_values=cache, ++ ) + xs = outs.hidden_states[-1] + new_cache = outs.past_key_values + return xs, new_cache +@@ -283,6 +284,15 @@ class Qwen2LM(TransformerLM): + self.sampling = sampling + self.mix_ratio = mix_ratio + ++ # 5. added for support streaming input ++ self.prompt_speech_token_emb_dict = {} ++ self.lm_input_dict = {} ++ self.out_tokens_dict = {} ++ self.cache_dict = {} ++ self.text_cache_dict = {} ++ self.next_fill_index = {} ++ self.prompt_length = {} ++ + @torch.inference_mode() + def inference( + self, +@@ -318,9 +328,16 @@ class Qwen2LM(TransformerLM): + # 5. step by step decode + out_tokens = [] + cache = None ++ input_length = lm_input.shape[1] + for i in range(max_len): ++ prompt_length = input_length + i ++ if i == 0: ++ masks = torch.tril(torch.ones((1, lm_input.shape[1], lm_input.shape[1]), device=lm_input.device)).to(torch.bool).logical_not() ++ else: ++ masks = None + y_pred, cache = self.llm.forward_one_step(lm_input, +- masks=torch.tril(torch.ones((1, lm_input.shape[1], lm_input.shape[1]), device=lm_input.device)).to(torch.bool), ++ masks=masks, ++ prompt_length=prompt_length, + cache=cache) + logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1) + top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True if i < min_len else False).item() +@@ -331,7 +348,7 @@ class Qwen2LM(TransformerLM): + # in stream mode, yield token one by one + yield top_ids + out_tokens.append(top_ids) +- lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1) ++ lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1).detach().clone() + + @torch.inference_mode() + def inference_bistream( +@@ -432,3 +449,144 @@ class Qwen2LM(TransformerLM): + # in stream mode, yield token one by one + yield top_ids + lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1) ++ ++ @torch.inference_mode() ++ def inference_bistream_streaming_input( ++ self, ++ text: torch.Tensor, ++ char_idx: torch.Tensor, ++ prompt_text: torch.Tensor, ++ prompt_text_len: torch.Tensor, ++ prompt_speech_token: torch.Tensor, ++ prompt_speech_token_len: torch.Tensor, ++ embedding: torch.Tensor, ++ uuid: str, ++ input_end: bool, ++ sampling: int = 25, ++ max_token_text_ratio: float = 20, ++ min_token_text_ratio: float = 2, ++ ) -> Generator[torch.Tensor, None, None]: ++ ++ def build_causal_mask(query_len, key_len, devices): ++ num_past = key_len - query_len ++ assert num_past >= 0 ++ causal_mask = torch.triu(torch.ones((query_len, query_len), device=devices), diagonal=1).to(torch.bool) ++ left_padding = torch.zeros((query_len, num_past), dtype=torch.bool, device=devices) ++ full_masks = torch.cat([left_padding, causal_mask], dim=-1) ++ return full_masks.unsqueeze(0) ++ ++ device = prompt_text.device ++ ++ if uuid not in self.cache_dict: ++ sos_eos_emb = self.llm_embedding.weight[self.sos_eos].reshape(1, 1, -1) ++ if 
prompt_speech_token_len != 0: ++ self.prompt_speech_token_emb_dict[uuid] = self.speech_embedding(prompt_speech_token) ++ else: ++ self.prompt_speech_token_emb_dict[uuid] = torch.zeros(1, 0, self.llm_input_size, dtype=prompt_text.dtype).to(device) ++ ++ self.lm_input_dict[uuid] = torch.concat([sos_eos_emb], dim=1) # [1,1,896] ++ ++ self.out_tokens_dict[uuid] = [] ++ self.cache_dict[uuid] = None ++ ++ self.text_cache_dict[uuid] = self.llm.model.model.embed_tokens(prompt_text) # [1, prompt_text, 896] ++ self.next_fill_index[uuid] = -1 ++ self.prompt_length[uuid] = 0 ++ ++ text_emb = self.llm.model.model.embed_tokens(text) ++ ++ for i in range(text_emb.size(1)): ++ self.text_cache_dict[uuid] = torch.concat([self.text_cache_dict[uuid], text_emb[:, i].unsqueeze(1)], dim=1) ++ index = 0 ++ while self.prompt_speech_token_emb_dict[uuid].size(1) != 0: ++ if self.text_cache_dict[uuid].size(1) >= self.mix_ratio[0]: ++ lm_input_text, lm_input_speech = self.text_cache_dict[uuid][:, :self.mix_ratio[0]], self.prompt_speech_token_emb_dict[uuid][:, :self.mix_ratio[1]] ++ index += 1 ++ logging.info('append {} text token {} speech token'.format(lm_input_text.size(1), lm_input_speech.size(1))) ++ self.lm_input_dict[uuid] = torch.concat([self.lm_input_dict[uuid], lm_input_text, lm_input_speech], dim=1) ++ self.text_cache_dict[uuid], self.prompt_speech_token_emb_dict[uuid] = self.text_cache_dict[uuid][:, self.mix_ratio[0]:], self.prompt_speech_token_emb_dict[uuid][:, self.mix_ratio[1]:] ++ else: ++ break ++ ++ if self.prompt_speech_token_emb_dict[uuid].size(1) == 0: # 文本token数量多于音频token,混合完以后,剩余文本token,开始解码 ++ # 若上一次解码的 token 是 fill_token,说明 LLM 想要更多 text token ++ # 或者首次预测时,还没开始解码,out_tokens_dict 为空 ++ if ((len(self.out_tokens_dict[uuid]) != 0 and self.out_tokens_dict[uuid][-1] == self.speech_token_size + 2) ++ or (len(self.out_tokens_dict[uuid]) == 0 and self.lm_input_dict[uuid].size(1) == 1)): ++ # token数量够了 ++ if self.text_cache_dict[uuid].size(1) >= self.mix_ratio[0]: ++ lm_input_text = self.text_cache_dict[uuid][:, :self.mix_ratio[0]] # 抽出5个token ++ if len(self.out_tokens_dict[uuid]) != 0 and self.out_tokens_dict[uuid][-1] == self.speech_token_size + 2: # 预测出filling token,前面cache已经缓存,当前直接输入即可 ++ self.lm_input_dict[uuid] = lm_input_text ++ else: # sft刚开始预测,需要和sos token拼接在一起 ++ self.lm_input_dict[uuid] = torch.concat([self.lm_input_dict[uuid], lm_input_text], dim=1) ++ self.text_cache_dict[uuid] = self.text_cache_dict[uuid][:, self.mix_ratio[0]:] ++ else: ++ continue ++ ++ while True: ++ self.prompt_length[uuid] += self.lm_input_dict[uuid].shape[1] ++ seq_len = self.prompt_length[uuid] ++ if self.lm_input_dict[uuid].shape[1] > 1: ++ masks = build_causal_mask(self.lm_input_dict[uuid].shape[1], seq_len, ++ self.lm_input_dict[uuid].device) ++ else: ++ masks = None ++ y_pred, self.cache_dict[uuid] = self.llm.forward_one_step(self.lm_input_dict[uuid], ++ masks=masks, ++ prompt_length=seq_len, ++ cache=self.cache_dict[uuid]) ++ logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1) ++ # 判断是否生成 filling_token: ++ if self.next_fill_index[uuid] != -1 and len(self.out_tokens_dict[uuid]) == self.next_fill_index[uuid]: ++ top_ids = self.speech_token_size + 2 # 该预测filling token了 ++ self.next_fill_index[uuid] += (self.mix_ratio[1] + 1) # 找到下一个filling token的位置 ++ else: ++ top_ids = self.sampling_ids(logp.squeeze(dim=0), self.out_tokens_dict[uuid], sampling, ignore_eos=True).item() ++ # 特殊 token 处理, fill_token → 中断预测、等待新文本 token。 ++ if top_ids == self.speech_token_size + 2: ++ self.next_fill_index[uuid] = 
len(self.out_tokens_dict[uuid]) + self.mix_ratio[1] + 1 # -1 > 30 ++ self.out_tokens_dict[uuid].append(top_ids) ++ if top_ids >= self.speech_token_size: ++ if top_ids == self.speech_token_size + 2: # 预测到了filling token, break掉迎接新的文本token ++ break ++ else: ++ raise ValueError('should not get token {}'.format(top_ids)) ++ yield top_ids ++ self.lm_input_dict[uuid] = self.speech_embedding.weight[top_ids].reshape(1, 1, -1).detach().clone() ++ ++ if input_end: ++ # 3. final decode 文本全部送完,进行最后的解码。 ++ task_id_emb = self.llm_embedding.weight[self.task_id].reshape(1, 1, -1) ++ self.lm_input_dict[uuid] = torch.concat([self.lm_input_dict[uuid], self.text_cache_dict[uuid], task_id_emb, self.prompt_speech_token_emb_dict[uuid]], dim=1) ++ logging.info('no more text token, decode until met eos') ++ while True: ++ self.prompt_length[uuid] += self.lm_input_dict[uuid].shape[1] ++ seq_len = self.prompt_length[uuid] ++ if self.lm_input_dict[uuid].shape[1] > 1: ++ masks = build_causal_mask(self.lm_input_dict[uuid].shape[1], seq_len, self.lm_input_dict[uuid].device) ++ else: ++ masks = None ++ y_pred, self.cache_dict[uuid] = self.llm.forward_one_step(self.lm_input_dict[uuid], ++ masks=masks, ++ prompt_length=seq_len, ++ cache=self.cache_dict[uuid]) ++ logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1) ++ top_ids = self.sampling_ids(logp.squeeze(dim=0), self.out_tokens_dict[uuid], sampling, ignore_eos=False).item() ++ self.out_tokens_dict[uuid].append(top_ids) ++ if top_ids >= self.speech_token_size: ++ if top_ids == self.speech_token_size: ++ break ++ else: ++ raise ValueError('should not get token {}'.format(top_ids)) ++ # in stream mode, yield token one by one ++ yield top_ids ++ self.lm_input_dict[uuid] = self.speech_embedding.weight[top_ids].reshape(1, 1, -1).detach().clone() ++ ++ # this user is done ++ self.prompt_speech_token_emb_dict.pop(uuid) ++ self.lm_input_dict.pop(uuid) ++ self.out_tokens_dict.pop(uuid) ++ self.cache_dict.pop(uuid) ++ self.text_cache_dict.pop(uuid) ++ self.next_fill_index.pop(uuid) +\ No newline at end of file +diff --git a/cosyvoice/utils/common.py b/cosyvoice/utils/common.py +index 3e61a8c..d316b92 100644 +--- a/cosyvoice/utils/common.py ++++ b/cosyvoice/utils/common.py +@@ -107,12 +107,33 @@ def init_weights(m, mean=0.0, std=0.01): + + # Repetition Aware Sampling in VALL-E 2 + def ras_sampling(weighted_scores, decoded_tokens, sampling, top_p=0.8, top_k=25, win_size=10, tau_r=0.1): +- top_ids = nucleus_sampling(weighted_scores, top_p=top_p, top_k=top_k) ++ top_ids = dst_sampling(weighted_scores, top_p=top_p, top_k=top_k) + rep_num = (torch.tensor(decoded_tokens[-win_size:]).to(weighted_scores.device) == top_ids).sum().item() + if rep_num >= win_size * tau_r: + top_ids = random_sampling(weighted_scores, decoded_tokens, sampling) + return top_ids + ++def dst_sampling(weighted_scores, top_p=0.8, top_k=25): ++ ++ sorted_value, sorted_idx = weighted_scores.softmax(dim=0).sort(descending=True, stable=True) ++ ++ cum_sum = torch.cumsum(sorted_value, dim=0) ++ n = sorted_value.size(0) ++ device = cum_sum.device ++ pre_cum_sum = torch.cat([torch.zeros(1, device=device), cum_sum[:-1]]) ++ ++ indices = torch.arange(n ,device=device) ++ condition = (pre_cum_sum < top_p) & (indices < top_k) ++ ++ max_i_tensor = torch.where(condition, indices, torch.tensor(-1, device=device)) ++ n_selected = max_i_tensor.max() + 1 ++ ++ selected_prob = sorted_value[:n_selected] ++ selected_indices = sorted_idx[:n_selected] ++ ++ top_ids = selected_indices[selected_prob.multinomial(1, replacement=True)] 
++ ++ return top_ids + + def nucleus_sampling(weighted_scores, top_p=0.8, top_k=25): + prob, indices = [], [] diff --git a/ACL_PyTorch/built-in/audio/CosyVoice2/README.md b/ACL_PyTorch/built-in/audio/CosyVoice2/README.md index 3c0baf4bda..b80bdef91b 100644 --- a/ACL_PyTorch/built-in/audio/CosyVoice2/README.md +++ b/ACL_PyTorch/built-in/audio/CosyVoice2/README.md @@ -43,7 +43,7 @@ ## 获取本仓源码 ``` git clone https://gitee.com/ascend/ModelZoo-PyTorch.git -cd ModelZoo-PyTorch/ACL_PyTorch/built-in/audio/CosyVoice/CosyVoice2 +cd ModelZoo-PyTorch/ACL_PyTorch/built-in/audio/CosyVoice2 ``` ## 获取源码 @@ -55,6 +55,7 @@ cd ModelZoo-PyTorch/ACL_PyTorch/built-in/audio/CosyVoice/CosyVoice2 cd CosyVoice git reset --hard fd45708 git submodule update --init --recursive + # 根据当前使用机型,叠加patch。如果当前使用机型为313T 800T A2,则使用../800I/diff_CosyVoice_800T.patch git apply ../${platform}/diff_CosyVoice_${platform}.patch # 将infer.py复制到CosyVoice中 cp ../infer.py ./ @@ -63,8 +64,8 @@ cd ModelZoo-PyTorch/ACL_PyTorch/built-in/audio/CosyVoice/CosyVoice2 cd transformers git checkout v4.37.0 cd .. - # 将modeling_qwen模型文件替换到transformers仓内 - mv ../${platform}/modeling_qwen2.py ./transformers/src/transformers/models/qwen2 + # 将modeling_qwen模型文件替换到transformers仓内。800T A2和800I A2共用modeling_qwen2.py。 + cp ../${platform}/modeling_qwen2.py ./transformers/src/transformers/models/qwen2 ``` 文件目录结构大致如下: @@ -75,6 +76,7 @@ cd ModelZoo-PyTorch/ACL_PyTorch/built-in/audio/CosyVoice/CosyVoice2 |── 📄 modeling_qwen2.py |── 📁 800I |── 📄 diff_CosyVoice_800I.patch + |── 📄 diff_CosyVoice_800T.patch |── 📄 modeling_qwen2.py |── 📁 CosyVoice |── 📁 cosyVoice源码文件 # cosyVoice的源码文件,此处不一一列举 @@ -90,7 +92,7 @@ cd ModelZoo-PyTorch/ACL_PyTorch/built-in/audio/CosyVoice/CosyVoice2 pip3 install -r ../requirements.txt apt-get install sox # centos版本 yum install sox ``` - 注:如果遇到无法安装WeTextProcessing的场景,可以参考以下方法手动安装编译 + 注:如果遇到无法安装WeTextProcessing的场景,例如提示安装pyinit报错,可以参考以下方法手动安装编译 ```bash # 下载安装包并解压 wget https://www.openfst.org/twiki/pub/FST/FstDownload/openfst-1.8.3.tar.gz -- Gitee From c0b962a1677a1806e2edd905596f0e72954e7b39 Mon Sep 17 00:00:00 2001 From: pu-zhe Date: Sat, 30 Aug 2025 15:10:34 +0800 Subject: [PATCH 3/6] update --- .../CosyVoice2/800I/diff_CosyVoice_800I.patch | 71 +- .../CosyVoice2/800I/diff_CosyVoice_800T.patch | 674 ------------------ .../built-in/audio/CosyVoice2/README.md | 3 +- 3 files changed, 40 insertions(+), 708 deletions(-) delete mode 100644 ACL_PyTorch/built-in/audio/CosyVoice2/800I/diff_CosyVoice_800T.patch diff --git a/ACL_PyTorch/built-in/audio/CosyVoice2/800I/diff_CosyVoice_800I.patch b/ACL_PyTorch/built-in/audio/CosyVoice2/800I/diff_CosyVoice_800I.patch index 6bec7233c6..c7ed9dcbf8 100644 --- a/ACL_PyTorch/built-in/audio/CosyVoice2/800I/diff_CosyVoice_800I.patch +++ b/ACL_PyTorch/built-in/audio/CosyVoice2/800I/diff_CosyVoice_800I.patch @@ -1,8 +1,8 @@ diff --git a/cosyvoice/cli/cosyvoice.py b/cosyvoice/cli/cosyvoice.py -index e2d62e2..dccea41 100644 +index e2d62e2..95da570 100644 --- a/cosyvoice/cli/cosyvoice.py +++ b/cosyvoice/cli/cosyvoice.py -@@ -13,11 +13,14 @@ +@@ -13,11 +13,15 @@ # limitations under the License. 
import os import time @@ -13,37 +13,44 @@ index e2d62e2..dccea41 100644 from hyperpyyaml import load_hyperpyyaml from modelscope import snapshot_download import torch ++import acl +from ais_bench.infer.interface import InferSession from cosyvoice.cli.frontend import CosyVoiceFrontEnd from cosyvoice.cli.model import CosyVoiceModel, CosyVoice2Model from cosyvoice.utils.file_utils import logging -@@ -126,7 +129,7 @@ class CosyVoice: - +@@ -126,7 +130,7 @@ class CosyVoice: + class CosyVoice2(CosyVoice): - + - def __init__(self, model_dir, load_jit=False, load_trt=False, fp16=False): + def __init__(self, model_dir, load_jit=False, load_trt=False, fp16=False, load_om=False): self.instruct = True if '-Instruct' in model_dir else False self.model_dir = model_dir self.fp16 = fp16 -@@ -155,6 +158,16 @@ class CosyVoice2(CosyVoice): +@@ -155,6 +159,22 @@ class CosyVoice2(CosyVoice): self.model.load_trt('{}/flow.decoder.estimator.{}.mygpu.plan'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'), '{}/flow.decoder.estimator.fp32.onnx'.format(model_dir), self.fp16) + if load_om: ++ soc_version = acl.get_soc_name() ++ context = None ++ if '910B3' in soc_version: ++ context, _ = acl.rt.get_context() + arch = platform.machine() + system = platform.system().lower() + flow_om = InferSession(0, '{}/flow_{}_{}.om'.format(model_dir, system ,arch)) + flow_om_static = InferSession(0, '{}/flow_static.om'.format(model_dir)) + speech_om = InferSession(0, '{}/speech_{}_{}.om'.format(model_dir, system ,arch)) ++ if '910B3' in soc_version: ++ _ = acl.rt.set_context(context) + self.frontend.speech_om = speech_om + self.frontend.flow_om = flow_om + self.model.flow.decoder.flow_om_static = flow_om_static + self.model.flow.decoder.flow_om = flow_om del configs - + def inference_instruct(self, *args, **kwargs): -@@ -171,3 +184,19 @@ class CosyVoice2(CosyVoice): +@@ -171,3 +191,19 @@ class CosyVoice2(CosyVoice): logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len)) yield model_output start_time = time.time() @@ -73,7 +80,7 @@ index 6e10f00..25ad767 100644 self.inflect_parser = inflect.engine() + self.speech_om = None + self.flow_om = None - + def _extract_text_token(self, text): if isinstance(text, Generator): @@ -92,11 +94,16 @@ class CosyVoiceFrontEnd: @@ -104,7 +111,7 @@ index 9ebf8cb..a8775a1 100644 +++ b/cosyvoice/cli/model.py @@ -99,7 +99,7 @@ class CosyVoiceModel: self.flow.decoder.estimator = self.flow.decoder.estimator_engine.create_execution_context() - + def llm_job(self, text, prompt_text, llm_prompt_speech_token, llm_embedding, uuid): - with self.llm_context: + with self.llm_context(): @@ -126,7 +133,7 @@ index 9ebf8cb..a8775a1 100644 self.tts_speech_token_dict = {} self.llm_end_dict = {} self.hift_cache_dict = {} - + + # add for support streaming input + self.first_chunk_size = 20 + self.token_offset_dict = {} @@ -274,15 +281,15 @@ index 6a60f6d..fbe7545 100644 import torch.nn.functional as F +import numpy as np from matcha.models.components.flow_matching import BASECFM - - + + @@ -32,6 +33,8 @@ class ConditionalCFM(BASECFM): # Just change the architecture of the estimator here self.estimator = estimator self.lock = threading.Lock() + self.flow_om = None + self.flow_om_static = None - + @torch.inference_mode() def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None, prompt_len=0, flow_cache=torch.zeros(1, 80, 0, 2)): @@ -105,12 +108,26 @@ class ConditionalCFM(BASECFM): @@ -329,17 +336,17 @@ index c47bf05..7f3e4ae 100644 +from 
torch.nn.utils.parametrize import remove_parametrizations from torch.nn.utils.parametrizations import weight_norm from torch.distributions.uniform import Uniform - + @@ -99,8 +100,8 @@ class ResBlock(torch.nn.Module): - + def remove_weight_norm(self): for idx in range(len(self.convs1)): - remove_weight_norm(self.convs1[idx]) - remove_weight_norm(self.convs2[idx]) + remove_parametrizations(self.convs1[idx], "weight") + remove_parametrizations(self.convs2[idx], "weight") - - + + class SineGen(torch.nn.Module): @@ -319,14 +320,11 @@ class HiFTGenerator(nn.Module): def remove_weight_norm(self): @@ -358,41 +365,41 @@ index c47bf05..7f3e4ae 100644 + remove_parametrizations(self.conv_post, 'weight') for l in self.source_resblocks: l.remove_weight_norm() - + @@ -346,9 +344,7 @@ class HiFTGenerator(nn.Module): self.istft_params["n_fft"], window=self.stft_window.to(magnitude.device)) return inverse_transform - + - def decode(self, x: torch.Tensor, s: torch.Tensor = torch.zeros(1, 1, 0)) -> torch.Tensor: - s_stft_real, s_stft_imag = self._stft(s.squeeze(1)) - s_stft = torch.cat([s_stft_real, s_stft_imag], dim=1) + def decode(self, x: torch.Tensor, s_stft: torch.Tensor, index: torch.int) -> torch.Tensor: - + x = self.conv_pre(x) for i in range(self.num_upsamples): @@ -356,7 +352,7 @@ class HiFTGenerator(nn.Module): x = self.ups[i](x) - + if i == self.num_upsamples - 1: - x = self.reflection_pad(x) + x = torch.cat((x, x[:,:,-2:-1]), -1) - + # fusion si = self.source_downs[i](s_stft) @@ -373,12 +369,10 @@ class HiFTGenerator(nn.Module): - + x = F.leaky_relu(x) x = self.conv_post(x) - magnitude = torch.exp(x[:, :self.istft_params["n_fft"] // 2 + 1, :]) - phase = torch.sin(x[:, self.istft_params["n_fft"] // 2 + 1:, :]) # actually, sin is redundancy + magnitude = torch.exp(x[:, :index, :]) + phase = torch.sin(x[:, index:, :]) # actually, sin is redundancy - + - x = self._istft(magnitude, phase) - x = torch.clamp(x, -self.audio_limit, self.audio_limit) - return x + return magnitude, phase - + def forward( self, @@ -407,5 +401,12 @@ class HiFTGenerator(nn.Module): @@ -416,7 +423,7 @@ index bbd3305..7eb32ad 100644 @@ -229,16 +229,17 @@ class Qwen2Encoder(torch.nn.Module): super().__init__() self.model = Qwen2ForCausalLM.from_pretrained(pretrain_path) - + - def forward_one_step(self, xs, masks, cache=None): - input_masks = masks[:, -1, :] - outs = self.model( @@ -444,7 +451,7 @@ index bbd3305..7eb32ad 100644 @@ -283,6 +284,15 @@ class Qwen2LM(TransformerLM): self.sampling = sampling self.mix_ratio = mix_ratio - + + # 5. 
added for support streaming input + self.prompt_speech_token_emb_dict = {} + self.lm_input_dict = {} @@ -481,7 +488,7 @@ index bbd3305..7eb32ad 100644 out_tokens.append(top_ids) - lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1) + lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1).detach().clone() - + @torch.inference_mode() def inference_bistream( @@ -432,3 +449,144 @@ class Qwen2LM(TransformerLM): @@ -635,7 +642,7 @@ index 3e61a8c..d316b92 100644 --- a/cosyvoice/utils/common.py +++ b/cosyvoice/utils/common.py @@ -107,12 +107,33 @@ def init_weights(m, mean=0.0, std=0.01): - + # Repetition Aware Sampling in VALL-E 2 def ras_sampling(weighted_scores, decoded_tokens, sampling, top_p=0.8, top_k=25, win_size=10, tau_r=0.1): - top_ids = nucleus_sampling(weighted_scores, top_p=top_p, top_k=top_k) @@ -644,7 +651,7 @@ index 3e61a8c..d316b92 100644 if rep_num >= win_size * tau_r: top_ids = random_sampling(weighted_scores, decoded_tokens, sampling) return top_ids - + +def dst_sampling(weighted_scores, top_p=0.8, top_k=25): + + sorted_value, sorted_idx = weighted_scores.softmax(dim=0).sort(descending=True, stable=True) @@ -666,6 +673,6 @@ index 3e61a8c..d316b92 100644 + top_ids = selected_indices[selected_prob.multinomial(1, replacement=True)] + + return top_ids - + def nucleus_sampling(weighted_scores, top_p=0.8, top_k=25): prob, indices = [], [] diff --git a/ACL_PyTorch/built-in/audio/CosyVoice2/800I/diff_CosyVoice_800T.patch b/ACL_PyTorch/built-in/audio/CosyVoice2/800I/diff_CosyVoice_800T.patch deleted file mode 100644 index d53dbf3121..0000000000 --- a/ACL_PyTorch/built-in/audio/CosyVoice2/800I/diff_CosyVoice_800T.patch +++ /dev/null @@ -1,674 +0,0 @@ -diff --git a/cosyvoice/cli/cosyvoice.py b/cosyvoice/cli/cosyvoice.py -index e2d62e2..0af241c 100644 ---- a/cosyvoice/cli/cosyvoice.py -+++ b/cosyvoice/cli/cosyvoice.py -@@ -13,11 +13,15 @@ - # limitations under the License. 
- import os - import time -+import platform -+import datetime - from typing import Generator - from tqdm import tqdm - from hyperpyyaml import load_hyperpyyaml - from modelscope import snapshot_download - import torch -+import acl -+from ais_bench.infer.interface import InferSession - from cosyvoice.cli.frontend import CosyVoiceFrontEnd - from cosyvoice.cli.model import CosyVoiceModel, CosyVoice2Model - from cosyvoice.utils.file_utils import logging -@@ -126,7 +130,7 @@ class CosyVoice: - - class CosyVoice2(CosyVoice): - -- def __init__(self, model_dir, load_jit=False, load_trt=False, fp16=False): -+ def __init__(self, model_dir, load_jit=False, load_trt=False, fp16=False, load_om=False): - self.instruct = True if '-Instruct' in model_dir else False - self.model_dir = model_dir - self.fp16 = fp16 -@@ -155,6 +159,18 @@ class CosyVoice2(CosyVoice): - self.model.load_trt('{}/flow.decoder.estimator.{}.mygpu.plan'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'), - '{}/flow.decoder.estimator.fp32.onnx'.format(model_dir), - self.fp16) -+ if load_om: -+ arch = platform.machine() -+ system = platform.system().lower() -+ context, _ = acl.rt.get_context() -+ flow_om = InferSession(0, '{}/flow_{}_{}.om'.format(model_dir, system ,arch)) -+ flow_om_static = InferSession(0, '{}/flow_static.om'.format(model_dir)) -+ speech_om = InferSession(0, '{}/speech_{}_{}.om'.format(model_dir, system ,arch)) -+ _ = acl.rt.set_context(context) -+ self.frontend.speech_om = speech_om -+ self.frontend.flow_om = flow_om -+ self.model.flow.decoder.flow_om_static = flow_om_static -+ self.model.flow.decoder.flow_om = flow_om - del configs - - def inference_instruct(self, *args, **kwargs): -@@ -171,3 +187,19 @@ class CosyVoice2(CosyVoice): - logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len)) - yield model_output - start_time = time.time() -+ -+ def inference_sft_streaming_input(self, tts_text, char_idx, spk_id, user_id, input_end, stream=False, speed=1.0, text_frontend=True): -+ for i in [tts_text]: -+ model_input = self.frontend.frontend_sft(i, spk_id) -+ model_input["user_id"] = user_id -+ model_input["input_end"] = input_end -+ model_input['char_idx'] = char_idx -+ -+ start_time = time.time() -+ # print('synthesis text {}'.format(i)) -+ for model_output in self.model.tts_streaming_input(**model_input, stream=stream, speed=speed): -+ speech_len = model_output['tts_speech'].shape[1] / self.sample_rate -+ print("finish 1 chunk inference ", datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')) -+ logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len)) -+ yield model_output -+ start_time = time.time() -diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py -index 6e10f00..25ad767 100644 ---- a/cosyvoice/cli/frontend.py -+++ b/cosyvoice/cli/frontend.py -@@ -71,6 +71,8 @@ class CosyVoiceFrontEnd: - self.zh_tn_model = ZhNormalizer(remove_erhua=False, full_to_half=False, overwrite_cache=True) - self.en_tn_model = EnNormalizer() - self.inflect_parser = inflect.engine() -+ self.speech_om = None -+ self.flow_om = None - - def _extract_text_token(self, text): - if isinstance(text, Generator): -@@ -92,11 +94,16 @@ class CosyVoiceFrontEnd: - def _extract_speech_token(self, speech): - assert speech.shape[1] / 16000 <= 30, 'do not support extract speech token for audio longer than 30s' - feat = whisper.log_mel_spectrogram(speech, n_mels=128) -- speech_token = self.speech_tokenizer_session.run(None, -- 
{self.speech_tokenizer_session.get_inputs()[0].name: -- feat.detach().cpu().numpy(), -- self.speech_tokenizer_session.get_inputs()[1].name: -- np.array([feat.shape[2]], dtype=np.int32)})[0].flatten().tolist() -+ if torch.npu.is_available() and self.speech_om: -+ feed = [feat.detach().cpu().numpy(), np.array([feat.shape[2]], dtype=np.int32)] -+ speech_token = self.speech_om.infer(feed, mode='dymshape', custom_sizes=[100000000])[0].flatten().tolist() -+ self.flow_om.set_context() -+ else: -+ speech_token = self.speech_tokenizer_session.run(None, -+ {self.speech_tokenizer_session.get_inputs()[0].name: -+ feat.detach().cpu().numpy(), -+ self.speech_tokenizer_session.get_inputs()[1].name: -+ np.array([feat.shape[2]], dtype=np.int32)})[0].flatten().tolist() - speech_token = torch.tensor([speech_token], dtype=torch.int32).to(self.device) - speech_token_len = torch.tensor([speech_token.shape[1]], dtype=torch.int32).to(self.device) - return speech_token, speech_token_len -diff --git a/cosyvoice/cli/model.py b/cosyvoice/cli/model.py -index 9ebf8cb..a8775a1 100644 ---- a/cosyvoice/cli/model.py -+++ b/cosyvoice/cli/model.py -@@ -99,7 +99,7 @@ class CosyVoiceModel: - self.flow.decoder.estimator = self.flow.decoder.estimator_engine.create_execution_context() - - def llm_job(self, text, prompt_text, llm_prompt_speech_token, llm_embedding, uuid): -- with self.llm_context: -+ with self.llm_context(): - if isinstance(text, Generator): - assert isinstance(self, CosyVoice2Model), 'streaming input text is only implemented for CosyVoice2!' - for i in self.llm.inference_bistream(text=text, -@@ -307,13 +307,25 @@ class CosyVoice2Model(CosyVoiceModel): - self.speech_window = np.hamming(2 * self.source_cache_len) - # rtf and decoding related - self.stream_scale_factor = 1 -- self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext() -+ if torch.cuda.is_available(): -+ stream = torch.cuda.Stream(device=self.device) -+ self.llm_context = lambda: torch.cuda.stream(stream) -+ else: -+ self.llm_context = lambda: contextlib.nullcontext() - self.lock = threading.Lock() - # dict used to store session related variable - self.tts_speech_token_dict = {} - self.llm_end_dict = {} - self.hift_cache_dict = {} - -+ # add for support streaming input -+ self.first_chunk_size = 20 -+ self.token_offset_dict = {} -+ self.prompt_text_dict = {} -+ self.prompt_speech_token_dict = {} -+ self.speech_feat_dict = {} -+ self.embedding_dict = {} -+ - def load_jit(self, flow_encoder_model): - flow_encoder = torch.jit.load(flow_encoder_model, map_location=self.device) - self.flow.encoder = flow_encoder -@@ -362,12 +374,17 @@ class CosyVoice2Model(CosyVoiceModel): - with self.lock: - self.tts_speech_token_dict[this_uuid], self.llm_end_dict[this_uuid] = [], False - self.hift_cache_dict[this_uuid] = None -- p = threading.Thread(target=self.llm_job, args=(text, prompt_text, llm_prompt_speech_token, llm_embedding, this_uuid)) -- p.start() - if stream is True: - token_offset = 0 -- while True: -- time.sleep(0.1) -+ # 删除线程操作,串行执行推理,加速首包时延 -+ for i in self.llm.inference(text=text.to(self.device), -+ text_len=torch.tensor([text.shape[1]], dtype=torch.int32).to(self.device), -+ prompt_text=prompt_text.to(self.device), -+ prompt_text_len=torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(self.device), -+ prompt_speech_token=llm_prompt_speech_token.to(self.device), -+ prompt_speech_token_len=torch.tensor([llm_prompt_speech_token.shape[1]], dtype=torch.int32).to(self.device), -+ 
embedding=llm_embedding.to(self.device)): -+ self.tts_speech_token_dict[this_uuid].append(i) - if len(self.tts_speech_token_dict[this_uuid]) - token_offset >= self.token_hop_len + self.flow.pre_lookahead_len: - this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:token_offset + self.token_hop_len + self.flow.pre_lookahead_len]).unsqueeze(dim=0) - this_tts_speech = self.token2wav(token=this_tts_speech_token, -@@ -379,10 +396,6 @@ class CosyVoice2Model(CosyVoiceModel): - finalize=False) - token_offset += self.token_hop_len - yield {'tts_speech': this_tts_speech.cpu()} -- if self.llm_end_dict[this_uuid] is True and len(self.tts_speech_token_dict[this_uuid]) - token_offset < self.token_hop_len + self.flow.pre_lookahead_len: -- break -- p.join() -- # deal with remain tokens, make sure inference remain token len equals token_hop_len when cache_speech is not None - this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0) - this_tts_speech = self.token2wav(token=this_tts_speech_token, - prompt_token=flow_prompt_speech_token, -@@ -393,6 +406,8 @@ class CosyVoice2Model(CosyVoiceModel): - finalize=True) - yield {'tts_speech': this_tts_speech.cpu()} - else: -+ p = threading.Thread(target=self.llm_job, args=(text, prompt_text, llm_prompt_speech_token, llm_embedding, this_uuid)) -+ p.start() - # deal with all tokens - p.join() - this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0) -@@ -409,3 +424,83 @@ class CosyVoice2Model(CosyVoiceModel): - self.tts_speech_token_dict.pop(this_uuid) - self.llm_end_dict.pop(this_uuid) - torch.cuda.empty_cache() -+ -+ def tts_streaming_input(self, text, char_idx, flow_embedding, llm_embedding=torch.zeros(0, 192), -+ prompt_text=torch.zeros(1, 0, dtype=torch.int32), -+ llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), -+ flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), -+ prompt_speech_feat=torch.zeros(1, 0, 80), stream=False, speed=1.0, **kwargs): -+ this_uuid = kwargs.get("user_id", "AscendDefaultUser") -+ if this_uuid not in self.tts_speech_token_dict: -+ self.tts_speech_token_dict[this_uuid], self.llm_end_dict[this_uuid] = [], False -+ self.hift_cache_dict[this_uuid] = None -+ self.token_offset_dict[this_uuid] = 0 -+ -+ self.prompt_text_dict[this_uuid] = prompt_text -+ self.prompt_speech_token_dict[this_uuid] = flow_prompt_speech_token -+ self.speech_feat_dict[this_uuid] = prompt_speech_feat -+ self.embedding_dict[this_uuid] = flow_embedding -+ else: -+ prompt_text = self.prompt_text_dict[this_uuid] -+ llm_prompt_speech_token = self.prompt_speech_token_dict[this_uuid] -+ flow_prompt_speech_token = self.prompt_speech_token_dict[this_uuid] -+ flow_embedding = self.embedding_dict[this_uuid] -+ llm_embedding = self.embedding_dict[this_uuid] -+ prompt_speech_feat = self.speech_feat_dict[this_uuid] -+ -+ for i in self.llm.inference_bistream_streaming_input(text=text, -+ char_idx=torch.tensor([char_idx]).to(self.device), -+ prompt_text=prompt_text.to(self.device), -+ prompt_text_len=torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(self.device), -+ prompt_speech_token=llm_prompt_speech_token.to(self.device), -+ prompt_speech_token_len=torch.tensor([llm_prompt_speech_token.shape[1]], dtype=torch.int32).to(self.device), -+ embedding=llm_embedding.to(self.device), -+ uuid=this_uuid, input_end=kwargs['input_end']): -+ self.tts_speech_token_dict[this_uuid].append(i) -+ -+ assert stream is True, "output must be streaming" -+ -+ while True: -+ 
is_first_chunk_ready = (self.token_offset_dict[this_uuid] == 0 and len(self.tts_speech_token_dict[this_uuid]) >= self.first_chunk_size + self.flow.pre_lookahead_len) -+ is_next_chunk_ready = (self.token_offset_dict[this_uuid] > 0 and len(self.tts_speech_token_dict[this_uuid]) - self.token_offset_dict[this_uuid] >= self.token_hop_len + self.flow.pre_lookahead_len) -+ if is_first_chunk_ready or is_next_chunk_ready: -+ if self.token_offset_dict[this_uuid] == 0: -+ this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:self.first_chunk_size + self.flow.pre_lookahead_len]).unsqueeze(dim=0) -+ else: -+ this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:self.token_offset_dict[this_uuid] + self.token_hop_len + self.flow.pre_lookahead_len]).unsqueeze(dim=0) # 0-53, 0-103, 0-153... -+ this_tts_speech = self.token2wav(token=this_tts_speech_token, -+ prompt_token=flow_prompt_speech_token, -+ prompt_feat=prompt_speech_feat, -+ embedding=flow_embedding, -+ uuid=this_uuid, -+ token_offset=self.token_offset_dict[this_uuid], -+ finalize=False) -+ if self.token_offset_dict[this_uuid] == 0: -+ self.token_offset_dict[this_uuid] += self.first_chunk_size -+ else: -+ self.token_offset_dict[this_uuid] += self.token_hop_len -+ yield {'tts_speech': this_tts_speech.cpu()} -+ # 是否需要退出循环(token 不够下一次推理) -+ if len(self.tts_speech_token_dict[this_uuid]) - self.token_offset_dict[this_uuid] < self.token_hop_len + self.flow.pre_lookahead_len: -+ break -+ -+ if kwargs['input_end'] is True: -+ this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0) -+ this_tts_speech = self.token2wav(token=this_tts_speech_token, -+ prompt_token=flow_prompt_speech_token, -+ prompt_feat=prompt_speech_feat, -+ embedding=flow_embedding, -+ uuid=this_uuid, -+ token_offset=self.token_offset_dict[this_uuid], -+ finalize=True) -+ yield {'tts_speech': this_tts_speech.cpu()} -+ -+ self.tts_speech_token_dict.pop(this_uuid) -+ self.llm_end_dict.pop(this_uuid) -+ self.hift_cache_dict.pop(this_uuid) -+ -+ self.token_offset_dict.pop(this_uuid) -+ self.prompt_text_dict.pop(this_uuid) -+ self.prompt_speech_token_dict.pop(this_uuid) -+ self.speech_feat_dict.pop(this_uuid) -+ self.embedding_dict.pop(this_uuid) -diff --git a/cosyvoice/flow/flow_matching.py b/cosyvoice/flow/flow_matching.py -index 6a60f6d..fbe7545 100644 ---- a/cosyvoice/flow/flow_matching.py -+++ b/cosyvoice/flow/flow_matching.py -@@ -14,6 +14,7 @@ - import threading - import torch - import torch.nn.functional as F -+import numpy as np - from matcha.models.components.flow_matching import BASECFM - - -@@ -32,6 +33,8 @@ class ConditionalCFM(BASECFM): - # Just change the architecture of the estimator here - self.estimator = estimator - self.lock = threading.Lock() -+ self.flow_om = None -+ self.flow_om_static = None - - @torch.inference_mode() - def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None, prompt_len=0, flow_cache=torch.zeros(1, 80, 0, 2)): -@@ -105,12 +108,26 @@ class ConditionalCFM(BASECFM): - t_in[:] = t.unsqueeze(0) - spks_in[0] = spks - cond_in[0] = cond -- dphi_dt = self.forward_estimator( -- x_in, mask_in, -- mu_in, t_in, -- spks_in, -- cond_in -- ) -+ # 动态分档推理, 在流式输出中,每次输出的token数目固定,可以采取动态分档模型执行推理 -+ if torch.npu.is_available() and self.flow_om_static and x.size(2)%100==0 and x.size(2)<800: -+ feed_list = [x_in, mask_in, mu_in, t_in, spks_in, cond_in] -+ feed = [i.cpu().detach().numpy().astype(np.float32) for i in feed_list] -+ dphi_dt = self.flow_om_static.infer(feed, 
mode="dymdims") -+ self.flow_om.set_context() -+ dphi_dt = torch.from_numpy(dphi_dt[0]).npu() -+ # 输出的token数目不固定场景采用动态模型推理 -+ elif torch.npu.is_available() and self.flow_om: -+ feed_list = [x_in, mask_in, mu_in, t_in, spks_in, cond_in] -+ feed = [i.cpu().detach().numpy().astype(np.float32) for i in feed_list] -+ dphi_dt = self.flow_om.infer(feed, mode="dymshape", custom_sizes=10000000) -+ dphi_dt = torch.from_numpy(dphi_dt[0]).npu() -+ else: -+ dphi_dt = self.forward_estimator( -+ x_in, mask_in, -+ mu_in, t_in, -+ spks_in, -+ cond_in -+ ) - dphi_dt, cfg_dphi_dt = torch.split(dphi_dt, [x.size(0), x.size(0)], dim=0) - dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt - self.inference_cfg_rate * cfg_dphi_dt) - x = x + dt * dphi_dt -diff --git a/cosyvoice/hifigan/generator.py b/cosyvoice/hifigan/generator.py -index c47bf05..7f3e4ae 100644 ---- a/cosyvoice/hifigan/generator.py -+++ b/cosyvoice/hifigan/generator.py -@@ -23,6 +23,7 @@ import torch.nn.functional as F - from torch.nn import Conv1d - from torch.nn import ConvTranspose1d - from torch.nn.utils import remove_weight_norm -+from torch.nn.utils.parametrize import remove_parametrizations - from torch.nn.utils.parametrizations import weight_norm - from torch.distributions.uniform import Uniform - -@@ -99,8 +100,8 @@ class ResBlock(torch.nn.Module): - - def remove_weight_norm(self): - for idx in range(len(self.convs1)): -- remove_weight_norm(self.convs1[idx]) -- remove_weight_norm(self.convs2[idx]) -+ remove_parametrizations(self.convs1[idx], "weight") -+ remove_parametrizations(self.convs2[idx], "weight") - - - class SineGen(torch.nn.Module): -@@ -319,14 +320,11 @@ class HiFTGenerator(nn.Module): - def remove_weight_norm(self): - print('Removing weight norm...') - for l in self.ups: -- remove_weight_norm(l) -+ remove_parametrizations(l, 'weight') - for l in self.resblocks: - l.remove_weight_norm() -- remove_weight_norm(self.conv_pre) -- remove_weight_norm(self.conv_post) -- self.m_source.remove_weight_norm() -- for l in self.source_downs: -- remove_weight_norm(l) -+ remove_parametrizations(self.conv_pre, 'weight') -+ remove_parametrizations(self.conv_post, 'weight') - for l in self.source_resblocks: - l.remove_weight_norm() - -@@ -346,9 +344,7 @@ class HiFTGenerator(nn.Module): - self.istft_params["n_fft"], window=self.stft_window.to(magnitude.device)) - return inverse_transform - -- def decode(self, x: torch.Tensor, s: torch.Tensor = torch.zeros(1, 1, 0)) -> torch.Tensor: -- s_stft_real, s_stft_imag = self._stft(s.squeeze(1)) -- s_stft = torch.cat([s_stft_real, s_stft_imag], dim=1) -+ def decode(self, x: torch.Tensor, s_stft: torch.Tensor, index: torch.int) -> torch.Tensor: - - x = self.conv_pre(x) - for i in range(self.num_upsamples): -@@ -356,7 +352,7 @@ class HiFTGenerator(nn.Module): - x = self.ups[i](x) - - if i == self.num_upsamples - 1: -- x = self.reflection_pad(x) -+ x = torch.cat((x, x[:,:,-2:-1]), -1) - - # fusion - si = self.source_downs[i](s_stft) -@@ -373,12 +369,10 @@ class HiFTGenerator(nn.Module): - - x = F.leaky_relu(x) - x = self.conv_post(x) -- magnitude = torch.exp(x[:, :self.istft_params["n_fft"] // 2 + 1, :]) -- phase = torch.sin(x[:, self.istft_params["n_fft"] // 2 + 1:, :]) # actually, sin is redundancy -+ magnitude = torch.exp(x[:, :index, :]) -+ phase = torch.sin(x[:, index:, :]) # actually, sin is redundancy - -- x = self._istft(magnitude, phase) -- x = torch.clamp(x, -self.audio_limit, self.audio_limit) -- return x -+ return magnitude, phase - - def forward( - self, -@@ -407,5 +401,12 @@ class 
HiFTGenerator(nn.Module): - # use cache_source to avoid glitch - if cache_source.shape[2] != 0: - s[:, :, :cache_source.shape[2]] = cache_source -- generated_speech = self.decode(x=speech_feat, s=s) -+ # torchair编译,对decode函数做部分适配 -+ s_stft_real, s_stft_imag = self._stft(s.squeeze(1)) -+ s_stft = torch.cat([s_stft_real, s_stft_imag], dim=1) -+ # 字典取值操作无法被dynamo编译,把decode内部的index拿到外面计算 -+ index = self.istft_params["n_fft"] // 2 + 1 -+ magnitude, phase = self.decode(x=speech_feat, s_stft=s_stft, index=index) -+ x = self._istft(magnitude, phase) -+ generated_speech = torch.clamp(x, -self.audio_limit, self.audio_limit) - return generated_speech, s -diff --git a/cosyvoice/llm/llm.py b/cosyvoice/llm/llm.py -index bbd3305..7eb32ad 100644 ---- a/cosyvoice/llm/llm.py -+++ b/cosyvoice/llm/llm.py -@@ -229,16 +229,17 @@ class Qwen2Encoder(torch.nn.Module): - super().__init__() - self.model = Qwen2ForCausalLM.from_pretrained(pretrain_path) - -- def forward_one_step(self, xs, masks, cache=None): -- input_masks = masks[:, -1, :] -- outs = self.model( -- inputs_embeds=xs, -- attention_mask=input_masks, -- output_hidden_states=True, -- return_dict=True, -- use_cache=True, -- past_key_values=cache, -- ) -+ def forward_one_step(self, xs, masks, prompt_length, cache=None): -+ with torch.no_grad(): -+ outs = self.model( -+ inputs_embeds=xs, -+ attention_mask=masks, -+ prompt_length=prompt_length, -+ output_hidden_states=True, -+ return_dict=True, -+ use_cache=True, -+ past_key_values=cache, -+ ) - xs = outs.hidden_states[-1] - new_cache = outs.past_key_values - return xs, new_cache -@@ -283,6 +284,15 @@ class Qwen2LM(TransformerLM): - self.sampling = sampling - self.mix_ratio = mix_ratio - -+ # 5. added for support streaming input -+ self.prompt_speech_token_emb_dict = {} -+ self.lm_input_dict = {} -+ self.out_tokens_dict = {} -+ self.cache_dict = {} -+ self.text_cache_dict = {} -+ self.next_fill_index = {} -+ self.prompt_length = {} -+ - @torch.inference_mode() - def inference( - self, -@@ -318,9 +328,16 @@ class Qwen2LM(TransformerLM): - # 5. 
step by step decode - out_tokens = [] - cache = None -+ input_length = lm_input.shape[1] - for i in range(max_len): -+ prompt_length = input_length + i -+ if i == 0: -+ masks = torch.tril(torch.ones((1, lm_input.shape[1], lm_input.shape[1]), device=lm_input.device)).to(torch.bool).logical_not() -+ else: -+ masks = None - y_pred, cache = self.llm.forward_one_step(lm_input, -- masks=torch.tril(torch.ones((1, lm_input.shape[1], lm_input.shape[1]), device=lm_input.device)).to(torch.bool), -+ masks=masks, -+ prompt_length=prompt_length, - cache=cache) - logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1) - top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True if i < min_len else False).item() -@@ -331,7 +348,7 @@ class Qwen2LM(TransformerLM): - # in stream mode, yield token one by one - yield top_ids - out_tokens.append(top_ids) -- lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1) -+ lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1).detach().clone() - - @torch.inference_mode() - def inference_bistream( -@@ -432,3 +449,144 @@ class Qwen2LM(TransformerLM): - # in stream mode, yield token one by one - yield top_ids - lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1) -+ -+ @torch.inference_mode() -+ def inference_bistream_streaming_input( -+ self, -+ text: torch.Tensor, -+ char_idx: torch.Tensor, -+ prompt_text: torch.Tensor, -+ prompt_text_len: torch.Tensor, -+ prompt_speech_token: torch.Tensor, -+ prompt_speech_token_len: torch.Tensor, -+ embedding: torch.Tensor, -+ uuid: str, -+ input_end: bool, -+ sampling: int = 25, -+ max_token_text_ratio: float = 20, -+ min_token_text_ratio: float = 2, -+ ) -> Generator[torch.Tensor, None, None]: -+ -+ def build_causal_mask(query_len, key_len, devices): -+ num_past = key_len - query_len -+ assert num_past >= 0 -+ causal_mask = torch.triu(torch.ones((query_len, query_len), device=devices), diagonal=1).to(torch.bool) -+ left_padding = torch.zeros((query_len, num_past), dtype=torch.bool, device=devices) -+ full_masks = torch.cat([left_padding, causal_mask], dim=-1) -+ return full_masks.unsqueeze(0) -+ -+ device = prompt_text.device -+ -+ if uuid not in self.cache_dict: -+ sos_eos_emb = self.llm_embedding.weight[self.sos_eos].reshape(1, 1, -1) -+ if prompt_speech_token_len != 0: -+ self.prompt_speech_token_emb_dict[uuid] = self.speech_embedding(prompt_speech_token) -+ else: -+ self.prompt_speech_token_emb_dict[uuid] = torch.zeros(1, 0, self.llm_input_size, dtype=prompt_text.dtype).to(device) -+ -+ self.lm_input_dict[uuid] = torch.concat([sos_eos_emb], dim=1) # [1,1,896] -+ -+ self.out_tokens_dict[uuid] = [] -+ self.cache_dict[uuid] = None -+ -+ self.text_cache_dict[uuid] = self.llm.model.model.embed_tokens(prompt_text) # [1, prompt_text, 896] -+ self.next_fill_index[uuid] = -1 -+ self.prompt_length[uuid] = 0 -+ -+ text_emb = self.llm.model.model.embed_tokens(text) -+ -+ for i in range(text_emb.size(1)): -+ self.text_cache_dict[uuid] = torch.concat([self.text_cache_dict[uuid], text_emb[:, i].unsqueeze(1)], dim=1) -+ index = 0 -+ while self.prompt_speech_token_emb_dict[uuid].size(1) != 0: -+ if self.text_cache_dict[uuid].size(1) >= self.mix_ratio[0]: -+ lm_input_text, lm_input_speech = self.text_cache_dict[uuid][:, :self.mix_ratio[0]], self.prompt_speech_token_emb_dict[uuid][:, :self.mix_ratio[1]] -+ index += 1 -+ logging.info('append {} text token {} speech token'.format(lm_input_text.size(1), lm_input_speech.size(1))) -+ self.lm_input_dict[uuid] = 
torch.concat([self.lm_input_dict[uuid], lm_input_text, lm_input_speech], dim=1) -+ self.text_cache_dict[uuid], self.prompt_speech_token_emb_dict[uuid] = self.text_cache_dict[uuid][:, self.mix_ratio[0]:], self.prompt_speech_token_emb_dict[uuid][:, self.mix_ratio[1]:] -+ else: -+ break -+ -+ if self.prompt_speech_token_emb_dict[uuid].size(1) == 0: # 文本token数量多于音频token,混合完以后,剩余文本token,开始解码 -+ # 若上一次解码的 token 是 fill_token,说明 LLM 想要更多 text token -+ # 或者首次预测时,还没开始解码,out_tokens_dict 为空 -+ if ((len(self.out_tokens_dict[uuid]) != 0 and self.out_tokens_dict[uuid][-1] == self.speech_token_size + 2) -+ or (len(self.out_tokens_dict[uuid]) == 0 and self.lm_input_dict[uuid].size(1) == 1)): -+ # token数量够了 -+ if self.text_cache_dict[uuid].size(1) >= self.mix_ratio[0]: -+ lm_input_text = self.text_cache_dict[uuid][:, :self.mix_ratio[0]] # 抽出5个token -+ if len(self.out_tokens_dict[uuid]) != 0 and self.out_tokens_dict[uuid][-1] == self.speech_token_size + 2: # 预测出filling token,前面cache已经缓存,当前直接输入即可 -+ self.lm_input_dict[uuid] = lm_input_text -+ else: # sft刚开始预测,需要和sos token拼接在一起 -+ self.lm_input_dict[uuid] = torch.concat([self.lm_input_dict[uuid], lm_input_text], dim=1) -+ self.text_cache_dict[uuid] = self.text_cache_dict[uuid][:, self.mix_ratio[0]:] -+ else: -+ continue -+ -+ while True: -+ self.prompt_length[uuid] += self.lm_input_dict[uuid].shape[1] -+ seq_len = self.prompt_length[uuid] -+ if self.lm_input_dict[uuid].shape[1] > 1: -+ masks = build_causal_mask(self.lm_input_dict[uuid].shape[1], seq_len, -+ self.lm_input_dict[uuid].device) -+ else: -+ masks = None -+ y_pred, self.cache_dict[uuid] = self.llm.forward_one_step(self.lm_input_dict[uuid], -+ masks=masks, -+ prompt_length=seq_len, -+ cache=self.cache_dict[uuid]) -+ logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1) -+ # 判断是否生成 filling_token: -+ if self.next_fill_index[uuid] != -1 and len(self.out_tokens_dict[uuid]) == self.next_fill_index[uuid]: -+ top_ids = self.speech_token_size + 2 # 该预测filling token了 -+ self.next_fill_index[uuid] += (self.mix_ratio[1] + 1) # 找到下一个filling token的位置 -+ else: -+ top_ids = self.sampling_ids(logp.squeeze(dim=0), self.out_tokens_dict[uuid], sampling, ignore_eos=True).item() -+ # 特殊 token 处理, fill_token → 中断预测、等待新文本 token。 -+ if top_ids == self.speech_token_size + 2: -+ self.next_fill_index[uuid] = len(self.out_tokens_dict[uuid]) + self.mix_ratio[1] + 1 # -1 > 30 -+ self.out_tokens_dict[uuid].append(top_ids) -+ if top_ids >= self.speech_token_size: -+ if top_ids == self.speech_token_size + 2: # 预测到了filling token, break掉迎接新的文本token -+ break -+ else: -+ raise ValueError('should not get token {}'.format(top_ids)) -+ yield top_ids -+ self.lm_input_dict[uuid] = self.speech_embedding.weight[top_ids].reshape(1, 1, -1).detach().clone() -+ -+ if input_end: -+ # 3. 
final decode 文本全部送完,进行最后的解码。 -+ task_id_emb = self.llm_embedding.weight[self.task_id].reshape(1, 1, -1) -+ self.lm_input_dict[uuid] = torch.concat([self.lm_input_dict[uuid], self.text_cache_dict[uuid], task_id_emb, self.prompt_speech_token_emb_dict[uuid]], dim=1) -+ logging.info('no more text token, decode until met eos') -+ while True: -+ self.prompt_length[uuid] += self.lm_input_dict[uuid].shape[1] -+ seq_len = self.prompt_length[uuid] -+ if self.lm_input_dict[uuid].shape[1] > 1: -+ masks = build_causal_mask(self.lm_input_dict[uuid].shape[1], seq_len, self.lm_input_dict[uuid].device) -+ else: -+ masks = None -+ y_pred, self.cache_dict[uuid] = self.llm.forward_one_step(self.lm_input_dict[uuid], -+ masks=masks, -+ prompt_length=seq_len, -+ cache=self.cache_dict[uuid]) -+ logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1) -+ top_ids = self.sampling_ids(logp.squeeze(dim=0), self.out_tokens_dict[uuid], sampling, ignore_eos=False).item() -+ self.out_tokens_dict[uuid].append(top_ids) -+ if top_ids >= self.speech_token_size: -+ if top_ids == self.speech_token_size: -+ break -+ else: -+ raise ValueError('should not get token {}'.format(top_ids)) -+ # in stream mode, yield token one by one -+ yield top_ids -+ self.lm_input_dict[uuid] = self.speech_embedding.weight[top_ids].reshape(1, 1, -1).detach().clone() -+ -+ # this user is done -+ self.prompt_speech_token_emb_dict.pop(uuid) -+ self.lm_input_dict.pop(uuid) -+ self.out_tokens_dict.pop(uuid) -+ self.cache_dict.pop(uuid) -+ self.text_cache_dict.pop(uuid) -+ self.next_fill_index.pop(uuid) -\ No newline at end of file -diff --git a/cosyvoice/utils/common.py b/cosyvoice/utils/common.py -index 3e61a8c..d316b92 100644 ---- a/cosyvoice/utils/common.py -+++ b/cosyvoice/utils/common.py -@@ -107,12 +107,33 @@ def init_weights(m, mean=0.0, std=0.01): - - # Repetition Aware Sampling in VALL-E 2 - def ras_sampling(weighted_scores, decoded_tokens, sampling, top_p=0.8, top_k=25, win_size=10, tau_r=0.1): -- top_ids = nucleus_sampling(weighted_scores, top_p=top_p, top_k=top_k) -+ top_ids = dst_sampling(weighted_scores, top_p=top_p, top_k=top_k) - rep_num = (torch.tensor(decoded_tokens[-win_size:]).to(weighted_scores.device) == top_ids).sum().item() - if rep_num >= win_size * tau_r: - top_ids = random_sampling(weighted_scores, decoded_tokens, sampling) - return top_ids - -+def dst_sampling(weighted_scores, top_p=0.8, top_k=25): -+ -+ sorted_value, sorted_idx = weighted_scores.softmax(dim=0).sort(descending=True, stable=True) -+ -+ cum_sum = torch.cumsum(sorted_value, dim=0) -+ n = sorted_value.size(0) -+ device = cum_sum.device -+ pre_cum_sum = torch.cat([torch.zeros(1, device=device), cum_sum[:-1]]) -+ -+ indices = torch.arange(n ,device=device) -+ condition = (pre_cum_sum < top_p) & (indices < top_k) -+ -+ max_i_tensor = torch.where(condition, indices, torch.tensor(-1, device=device)) -+ n_selected = max_i_tensor.max() + 1 -+ -+ selected_prob = sorted_value[:n_selected] -+ selected_indices = sorted_idx[:n_selected] -+ -+ top_ids = selected_indices[selected_prob.multinomial(1, replacement=True)] -+ -+ return top_ids - - def nucleus_sampling(weighted_scores, top_p=0.8, top_k=25): - prob, indices = [], [] diff --git a/ACL_PyTorch/built-in/audio/CosyVoice2/README.md b/ACL_PyTorch/built-in/audio/CosyVoice2/README.md index b80bdef91b..3fa14c4683 100644 --- a/ACL_PyTorch/built-in/audio/CosyVoice2/README.md +++ b/ACL_PyTorch/built-in/audio/CosyVoice2/README.md @@ -55,7 +55,7 @@ cd ModelZoo-PyTorch/ACL_PyTorch/built-in/audio/CosyVoice2 cd CosyVoice git reset 
--hard fd45708 git submodule update --init --recursive - # 根据当前使用机型,叠加patch。如果当前使用机型为313T 800T A2,则使用../800I/diff_CosyVoice_800T.patch + # 根据当前使用机型,叠加patch。如果当前使用机型为313T 800T A2,和800I共用patch文件 git apply ../${platform}/diff_CosyVoice_${platform}.patch # 将infer.py复制到CosyVoice中 cp ../infer.py ./ @@ -76,7 +76,6 @@ cd ModelZoo-PyTorch/ACL_PyTorch/built-in/audio/CosyVoice2 |── 📄 modeling_qwen2.py |── 📁 800I |── 📄 diff_CosyVoice_800I.patch - |── 📄 diff_CosyVoice_800T.patch |── 📄 modeling_qwen2.py |── 📁 CosyVoice |── 📁 cosyVoice源码文件 # cosyVoice的源码文件,此处不一一列举 -- Gitee From ec17d7a9f9fdab757fa354c801c1825648622c48 Mon Sep 17 00:00:00 2001 From: pu-zhe Date: Mon, 1 Sep 2025 08:27:57 +0800 Subject: [PATCH 4/6] update Readme --- ACL_PyTorch/built-in/audio/CosyVoice2/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ACL_PyTorch/built-in/audio/CosyVoice2/README.md b/ACL_PyTorch/built-in/audio/CosyVoice2/README.md index 3fa14c4683..804dadcfe5 100644 --- a/ACL_PyTorch/built-in/audio/CosyVoice2/README.md +++ b/ACL_PyTorch/built-in/audio/CosyVoice2/README.md @@ -110,7 +110,8 @@ cd ModelZoo-PyTorch/ACL_PyTorch/built-in/audio/CosyVoice2 3. 安装msit工具 - 参考[msit](https://gitee.com/ascend/msit)安装工具中的benchmark和surgeon组件。(未安装会提示 ais_bench 导入失败报错) + 参考[msit](https://gitee.com/ascend/msit/blob/master/msit/docs/install/README.md)安装工具中的benchmark和surgeon组件。(未安装会提示 ais_bench 导入失败报错) + 推荐使用git clone源码方式安装msit组件,否则推理过程中易出现报错The stream is not in the current context. 4. 获取权重数据 -- Gitee From 0ff2c8db49c311fbe324e4a2439ffafdc786f1b9 Mon Sep 17 00:00:00 2001 From: pu-zhe Date: Mon, 1 Sep 2025 11:58:35 +0800 Subject: [PATCH 5/6] clean --- .../CosyVoice2/800I/diff_CosyVoice_800I.patch | 14 +++++++++----- ACL_PyTorch/built-in/audio/CosyVoice2/CosyVoice | 1 + 2 files changed, 10 insertions(+), 5 deletions(-) create mode 160000 ACL_PyTorch/built-in/audio/CosyVoice2/CosyVoice diff --git a/ACL_PyTorch/built-in/audio/CosyVoice2/800I/diff_CosyVoice_800I.patch b/ACL_PyTorch/built-in/audio/CosyVoice2/800I/diff_CosyVoice_800I.patch index c7ed9dcbf8..d5021894c1 100644 --- a/ACL_PyTorch/built-in/audio/CosyVoice2/800I/diff_CosyVoice_800I.patch +++ b/ACL_PyTorch/built-in/audio/CosyVoice2/800I/diff_CosyVoice_800I.patch @@ -1,5 +1,5 @@ diff --git a/cosyvoice/cli/cosyvoice.py b/cosyvoice/cli/cosyvoice.py -index e2d62e2..95da570 100644 +index e2d62e2..a0512a4 100644 --- a/cosyvoice/cli/cosyvoice.py +++ b/cosyvoice/cli/cosyvoice.py @@ -13,11 +13,15 @@ @@ -27,7 +27,7 @@ index e2d62e2..95da570 100644 self.instruct = True if '-Instruct' in model_dir else False self.model_dir = model_dir self.fp16 = fp16 -@@ -155,6 +159,22 @@ class CosyVoice2(CosyVoice): +@@ -155,6 +159,26 @@ class CosyVoice2(CosyVoice): self.model.load_trt('{}/flow.decoder.estimator.{}.mygpu.plan'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'), '{}/flow.decoder.estimator.fp32.onnx'.format(model_dir), self.fp16) @@ -35,14 +35,18 @@ index e2d62e2..95da570 100644 + soc_version = acl.get_soc_name() + context = None + if '910B3' in soc_version: -+ context, _ = acl.rt.get_context() ++ context, ret = acl.rt.get_context() ++ if ret: ++ raise RuntimeError(f"Get context failed, retcode is {ret}.") + arch = platform.machine() + system = platform.system().lower() + flow_om = InferSession(0, '{}/flow_{}_{}.om'.format(model_dir, system ,arch)) + flow_om_static = InferSession(0, '{}/flow_static.om'.format(model_dir)) + speech_om = InferSession(0, '{}/speech_{}_{}.om'.format(model_dir, system ,arch)) + if '910B3' in soc_version: -+ _ = acl.rt.set_context(context) 
++ ret = acl.rt.set_context(context) ++ if ret: ++ raise RuntimeError(f"Set context failed, retcode is {ret}.") + self.frontend.speech_om = speech_om + self.frontend.flow_om = flow_om + self.model.flow.decoder.flow_om_static = flow_om_static @@ -50,7 +54,7 @@ index e2d62e2..95da570 100644 del configs def inference_instruct(self, *args, **kwargs): -@@ -171,3 +191,19 @@ class CosyVoice2(CosyVoice): +@@ -171,3 +195,19 @@ class CosyVoice2(CosyVoice): logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len)) yield model_output start_time = time.time() diff --git a/ACL_PyTorch/built-in/audio/CosyVoice2/CosyVoice b/ACL_PyTorch/built-in/audio/CosyVoice2/CosyVoice new file mode 160000 index 0000000000..526aae30a7 --- /dev/null +++ b/ACL_PyTorch/built-in/audio/CosyVoice2/CosyVoice @@ -0,0 +1 @@ +Subproject commit 526aae30a71b136600fad7698b591e4c71e2666a -- Gitee From d5550baec21cfd91ae76beccb2009ad152ee9533 Mon Sep 17 00:00:00 2001 From: pu-zhe Date: Mon, 1 Sep 2025 11:59:20 +0800 Subject: [PATCH 6/6] clean --- ACL_PyTorch/built-in/audio/CosyVoice2/CosyVoice | 1 - 1 file changed, 1 deletion(-) delete mode 160000 ACL_PyTorch/built-in/audio/CosyVoice2/CosyVoice diff --git a/ACL_PyTorch/built-in/audio/CosyVoice2/CosyVoice b/ACL_PyTorch/built-in/audio/CosyVoice2/CosyVoice deleted file mode 160000 index 526aae30a7..0000000000 --- a/ACL_PyTorch/built-in/audio/CosyVoice2/CosyVoice +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 526aae30a71b136600fad7698b591e4c71e2666a -- Gitee
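
A minimal usage sketch of the patched entry point follows. It is illustrative only and not part of the patch series: the model directory, prompt wav, example sentences and the zero-shot call are assumptions that follow the upstream CosyVoice2 examples at the pinned commit, while load_om=True is the switch these patches add to route the flow and speech models through ais_bench OM sessions.

```python
# Illustrative sketch, not shipped by these patches. Paths, prompt audio and
# the zero-shot API are assumptions based on the upstream CosyVoice2 examples.
import torchaudio
from cosyvoice.cli.cosyvoice import CosyVoice2
from cosyvoice.utils.file_utils import load_wav

# load_om=True enables the ais_bench InferSession paths introduced above
cosyvoice = CosyVoice2('CosyVoice-0.5B', load_jit=False, load_trt=False,
                       fp16=False, load_om=True)
prompt_speech_16k = load_wav('prompt.wav', 16000)  # assumed 16 kHz reference clip

# stream=True yields audio chunk by chunk as speech tokens become available
for i, out in enumerate(cosyvoice.inference_zero_shot(
        '收到好友从远方寄来的生日礼物。', '希望你以后能够做的比我还好呦。',
        prompt_speech_16k, stream=True)):
    torchaudio.save('zero_shot_{}.wav'.format(i),
                    out['tts_speech'], cosyvoice.sample_rate)
```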