From 4e3fb10d5f7606504943c773a933cf287fd25db4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=A3=AE=E9=95=87?=
Date: Thu, 20 Jun 2024 15:28:22 +0800
Subject: [PATCH] =?UTF-8?q?=E5=A4=9A=E8=BF=9B=E7=A8=8B=E5=88=9B=E5=BB=BA?=
 =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=E5=A4=B9=E5=87=BD=E6=95=B0?=
 =?UTF-8?q?=E4=BC=98=E5=8C=96?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Reviewer notes. This text sits between the "---" line and the first
"diff --git" line, so it is not part of the commit message.

* Purpose of the patch: replace the busy-wait loops in remake_folder and
  remove_folder with jittered sleeps, add log lines, and add jitter to the
  polling delay in wait_get_resume_ckpt.
* wait_get_resume_ckpt: the jittered delay is clamped at zero.
  gap_time + random.uniform(-1, 1) is negative for gap_time < 1, and
  time.sleep() raises ValueError for a negative argument. The "index" line
  of resume_ckpt.py still carries the blob id of the unclamped version.
* remake_folder / remove_folder: the polling loops have no timeout, so the
  waiting ranks loop forever if the marker file never appears or the folder
  is never removed. NOTE(review): left unchanged here; consider a limit_time
  like the one wait_get_resume_ckpt already has.
* Line breaks and indentation were rebuilt from the hunk line counts and the
  Python block structure. The context indentation in the
  wait_get_resume_ckpt hunk is assumed; confirm against the tree, or apply
  with whitespace-insensitive matching.

 mindformers/tools/resume_ckpt.py |  3 ++-
 mindformers/tools/utils.py       | 26 +++++++++++++++-----------
 2 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/mindformers/tools/resume_ckpt.py b/mindformers/tools/resume_ckpt.py
index 5d4ec7f1ea..5f7ca6a7c9 100644
--- a/mindformers/tools/resume_ckpt.py
+++ b/mindformers/tools/resume_ckpt.py
@@ -17,6 +17,7 @@ import os
 import time
 import json
 import shutil
+import random
 
 from mindformers.tools.logger import logger
 from mindformers.tools.utils import (
@@ -162,7 +163,7 @@ def wait_get_resume_ckpt(resume_record_dir, gap_time=5, limit_time=3600):
             return resume_ckpt
         if time.time() - start_time > limit_time:
             raise RuntimeError("Wait rank_0 get resume checkpoint timeout!")
-        time.sleep(gap_time)
+        time.sleep(max(0.0, gap_time + random.uniform(-1, 1)))
 
 
 def get_minimum_epoch_step_and_ckpt(checkpoint_dir):
diff --git a/mindformers/tools/utils.py b/mindformers/tools/utils.py
index 004948a5c4..9b83c7201f 100644
--- a/mindformers/tools/utils.py
+++ b/mindformers/tools/utils.py
@@ -16,7 +16,9 @@
 import json
 import os
 import re
+import time
 import shutil
+import random
 import tempfile
 from multiprocessing import Process
 from typing import Dict, List, Tuple, Union
@@ -493,34 +495,36 @@ def is_publicly_accessible_path(path):
 
 def remake_folder(folder_path, permissions):
     """make folder"""
+    from .logger import logger
     remaked_txt = os.path.join(folder_path, "remaked.txt")
     if is_main_rank():
         if os.path.exists(folder_path) and os.listdir(folder_path):
             shutil.rmtree(folder_path)
         os.makedirs(folder_path, exist_ok=True)
         os.chmod(folder_path, permissions)
-        f = open(remaked_txt, "w")
-        f.close()
-    while True:
-        if os.path.exists(remaked_txt):
-            break
+        # pylint: disable=W0612
+        with open(remaked_txt, "w") as f:
+            pass
+    while not os.path.exists(remaked_txt):
+        time.sleep(0.1 + random.uniform(0, 0.1))
+    logger.info(f"Folder {folder_path} remake with permissions {oct(permissions)}")
 
 
 def remove_folder(folder_path):
     """delete folder"""
+    from .logger import logger
     if check_in_modelarts():
         import moxing as mox
         if mox.file.exists(folder_path) and not get_real_rank():
             mox.file.remove(folder_path, recursive=True)
-        while True:
-            if not mox.file.exists(folder_path):
-                break
+        while mox.file.exists(folder_path):
+            time.sleep(0.1 + random.uniform(0, 0.1))
     else:
         if os.path.exists(folder_path) and not get_real_rank():
             shutil.rmtree(folder_path)
-        while True:
-            if not os.path.exists(folder_path):
-                break
+        while os.path.exists(folder_path):
+            time.sleep(0.1 + random.uniform(0, 0.1))
+    logger.info(f"Folder {folder_path} removed successfully")
 
 
 def get_epoch_and_step_from_ckpt_name(ckpt_file):
-- 
Gitee