197 Star 1.3K Fork 1.2K

GVPAscend/MindSpeed-LLM

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
indexed_dataset.py 2.04 KB
一键复制 编辑 原始数据 按行查看 历史
shenjiarun 提交于 8个月前 . !2030CodeChcek整改-master
# Copyright (c) 2024, HUAWEI CORPORATION. All rights reserved.
from typing import List, Union
from functools import wraps
import numpy
import torch
class BufferWriter:
    """
    Collect sequence elements in memory and write them out in chunks
    instead of issuing one write per sequence.

    Attributes:
        data_file: Object whose ``write`` method receives the raw bytes.
        dtype: Element type handed to ``numpy.array`` when serializing.
        buffer_threshold: Number of pending elements that triggers a flush.
        buffer: Elements that have been added but not yet written.
    """

    def __init__(self, data_file, dtype, buffer_chunk_size=10 ** 5):
        self.data_file = data_file
        self.dtype = dtype
        self.buffer_threshold = buffer_chunk_size
        self.buffer = []

    def reset_buffer(self):
        """Discard pending elements by rebinding ``buffer`` to a new empty list."""
        self.buffer = []

    def write(self):
        """Serialize the pending elements to ``data_file`` and empty the buffer.

        Does nothing when no elements are pending.
        """
        if not self.buffer:
            return
        pending = numpy.array(self.buffer, dtype=self.dtype)
        # C order keeps the elements in the order they were added.
        self.data_file.write(pending.tobytes(order="C"))
        self.reset_buffer()

    def add(self, lst: List):
        """Append the elements of ``lst`` and flush once the threshold is reached.

        Args:
            lst (list): Elements to append to the pending buffer.
        """
        self.buffer.extend(lst)
        if len(self.buffer) < self.buffer_threshold:
            return
        self.write()
def add_item_from_list(self, lst: List, mode: int = 0) -> None:
    """Add one list item to the dataset through the builder's buffered writer.

    The item's elements go to ``self.buffer_writer`` and its length is
    appended to ``self.sequence_lengths``. The mode is recorded in
    ``self.sequence_modes`` only when ``self.multimodal`` is truthy.

    Args:
        self (IndexedDatasetBuilder): The builder object
        lst (list): The item to add to the data file
        mode (int, optional): The mode for the item. Defaults to 0.
    """
    self.buffer_writer.add(lst)

    item_length = len(lst)
    self.sequence_lengths.append(item_length)

    if not self.multimodal:
        return
    self.sequence_modes.append(mode)
def indexed_dataset_builder_init_wrapper(init_func):
    """Decorate a builder ``__init__`` so the instance also gets a ``buffer_writer``.

    Args:
        init_func: The initializer to wrap. It must set ``self.data_file``
            and ``self.dtype``, which are read right after it returns.

    Returns:
        The wrapped initializer.
    """

    @wraps(init_func)
    def patched_init(self, *args, **kwargs):
        init_func(self, *args, **kwargs)
        # Built after init_func because it reads attributes that init_func sets.
        writer = BufferWriter(data_file=self.data_file, dtype=self.dtype)
        self.buffer_writer = writer

    return patched_init
def add_item_wrapper(fn):
    """Decorate an ``add_item`` method so list inputs take the buffered path.

    Args:
        fn: The original ``add_item`` callable, invoked as
            ``fn(self, sequence, mode)`` for any non-list sequence.

    Returns:
        The wrapped callable, which forwards the result of whichever
        path handled the sequence.
    """

    @wraps(fn)
    def dispatch(self, sequence: Union[List, torch.Tensor], mode: int = 0) -> None:
        # Anything that is not a plain list goes to the original implementation.
        if not isinstance(sequence, list):
            return fn(self, sequence, mode)
        return add_item_from_list(self, sequence, mode)

    return dispatch
def finalize_wrapper(fn):
    """Decorate a ``finalize`` method so buffered elements are flushed first.

    Args:
        fn: The original finalize callable, invoked as
            ``fn(self, *args, **kwargs)`` after the flush.

    Returns:
        The wrapped callable. It returns whatever ``fn`` returns, so
        decorating a method does not hide its result from callers.
    """

    @wraps(fn)
    def wrapper(self, *args, **kwargs):
        # Flush first so no pending elements are left unwritten when fn runs.
        self.buffer_writer.write()
        return fn(self, *args, **kwargs)

    return wrapper
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/ascend/MindSpeed-LLM.git
git@gitee.com:ascend/MindSpeed-LLM.git
ascend
MindSpeed-LLM
MindSpeed-LLM
2.1.0

搜索帮助