diff --git a/ACL_PyTorch/built-in/cv/GroundingDINO/mmdetection_diff.patch b/ACL_PyTorch/built-in/cv/GroundingDINO/mmdetection_diff.patch index df7b55bbe8328c0437ea4d3799719a7906e0c2bf..b2467d316ef80a5b40af23248cfe0ba526793049 100644 --- a/ACL_PyTorch/built-in/cv/GroundingDINO/mmdetection_diff.patch +++ b/ACL_PyTorch/built-in/cv/GroundingDINO/mmdetection_diff.patch @@ -56,6 +56,40 @@ index 0eb5cd2f..a9cf3ffc 100644 level_start_index=level_start_index, valid_ratios=valid_ratios) decoder_inputs_dict = dict( +diff --git a/mmdet/models/detectors/glip.py b/mmdet/models/detectors/glip.py +index 45cfe7d3..abc9004b 100644 +--- a/mmdet/models/detectors/glip.py ++++ b/mmdet/models/detectors/glip.py +@@ -25,13 +25,25 @@ def find_noun_phrases(caption: str) -> list: + >>> caption = 'There is two cat and a remote in the picture' + >>> find_noun_phrases(caption) # ['cat', 'a remote', 'the picture'] + """ ++ # try: ++ # import nltk ++ # nltk.download('punkt', download_dir='~/nltk_data') ++ # nltk.download('averaged_perceptron_tagger', download_dir='~/nltk_data') ++ # except ImportError: ++ # raise RuntimeError('nltk is not installed, please install it by: ' ++ # 'pip install nltk.') ++ ++ import nltk + try: +- import nltk ++ nltk.data.find("tokenizers/punkt") ++ except LookupError: + nltk.download('punkt', download_dir='~/nltk_data') ++ ++ try: ++ nltk.data.find("taggers/averaged_perceptron_tagger") ++ except LookupError: + nltk.download('averaged_perceptron_tagger', download_dir='~/nltk_data') +- except ImportError: +- raise RuntimeError('nltk is not installed, please install it by: ' +- 'pip install nltk.') ++ + + caption = caption.lower() + tokens = nltk.word_tokenize(caption) diff --git a/mmdet/models/detectors/grounding_dino.py b/mmdet/models/detectors/grounding_dino.py index b1ab7c2d..d4a15e98 100644 --- a/mmdet/models/detectors/grounding_dino.py diff --git a/ACL_PyTorch/built-in/cv/SAM/README.md b/ACL_PyTorch/built-in/cv/SAM/README.md index 7e0577be265a77f3f4f40aed9f7e61e7cfeac751..50bd2cd7f1e42ffc614ea437c30fc7b3a126df1a 100644 --- a/ACL_PyTorch/built-in/cv/SAM/README.md +++ b/ACL_PyTorch/built-in/cv/SAM/README.md @@ -1,5 +1,6 @@ -# SAM 推理指导 +# SAM(ONNX)-推理指导 +## 概述 Segment Anything Model (SAM) 是由 Meta 开源的图像分割大模型,在计算机视觉领域(CV)取得了新的突破。SAM 可在不需要任何标注的情况下,对任何图像中的任何物体进行分割,SAM 的开源引起了业界的广泛反响,被称为计算机视觉领域的 GPT。 - 论文: @@ -17,7 +18,47 @@ Segment Anything Model (SAM) 是由 Meta 开源的图像分割大模型,在计 model_name=sam_vit_b_01ec64 ``` -## 1. 输入输出数据 +## 推理环境准备 + +- 该模型需要以下插件与驱动 + +**表 1** 版本配套表 + + | 配套 | 版本 | 环境准备指导 | + | ---- | ---- | ---- | + | 固件与驱动 | 25.2.RC1 | [Pytorch框架推理环境准备](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/pies) | + | CANN | 8.2.RC1 | - | + | Python | 3.11.10 | - | + | PyTorch | 2.1.0 | - | + | 说明:仅支持Atlas 300I Duo 推理卡,请以CANN版本选择实际固件与驱动版本。 | \ | \ | + +## 快速上手 + +### 1. 获取源码 + +```bash +git clone https://gitee.com/ascend/ModelZoo-PyTorch.git +cd ModelZoo-PyTorch/ACL_PyTorch/built-in/cv/SAM +git clone https://github.com/facebookresearch/segment-anything.git +cd segment-anything +git reset --hard 6fdee8f2727f4506cfbbe553e23b895e27956588 +git apply ../segment_anything_diff.patch +pip3 install -e . +cd .. +``` + +### 2. 安装依赖。 + +- 安装基础环境。 + +```bash +pip3 install -r requirements.txt +``` +说明:如果某些库通过此方式安装失败,可使用 pip3 install 单独进行安装。 + +- 安装 [msit](https://gitee.com/ascend/msit/tree/master/msit/) 的 surgeon 组件和 benchmark 组件。 + +### 3. 
输入输出数据描述 SAM 首先会自动分割图像中的所有内容,但是如果你需要分割某一个目标物体,则需要你输入一个目标物体上的坐标,比如一张图片你想让SAM分割Cat或Dog这个目标的提示坐标,SAM会自动在照片中猫或狗进行分割,在离线推理时,会转成encoder模型和decoder模型,其输入输出详情如下: @@ -53,52 +94,9 @@ SAM 首先会自动分割图像中的所有内容,但是如果你需要分割 | low_res_masks | FLOAT32 | -1 x 1 x -1 x -1 | ND | -## 2. 推理环境准备 - -- 该模型需要以下插件与驱动 - - **表 1** 版本配套表 - -| 配套 | 版本 | 环境准备指导 | -| ---- | ---- | ---- | -| 固件与驱动 | 25.2.RC1 | [Pytorch框架推理环境准备](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/pies) | -| CANN | 8.2.RC1 | - | -| MindIE | 2.1.RC1 | - | -| Python | 3.11.10 | - | -| PyTorch | 2.1.0 | - | -| 说明:Atlas 300I Duo 推理卡请以CANN版本选择实际固件与驱动版本。 | \ | \ | - -## 3. 快速上手 - -### 3.1 获取源码 - -``` -git clone https://gitee.com/ascend/ModelZoo-PyTorch.git -cd ModelZoo-PyTorch/ACL_PyTorch/built-in/cv/SAM -git clone https://github.com/facebookresearch/segment-anything.git -cd segment-anything -git reset --hard 6fdee8f2727f4506cfbbe553e23b895e27956588 -git apply ../segment_anything_diff.patch -pip3 install -e . -cd .. -``` - -### 3.2 安装依赖。 - -1. 安装基础环境。 - - ```bash - pip3 install -r requirements.txt - ``` - - 说明:如果某些库通过此方式安装失败,可使用 pip3 install 单独进行安装。 - -2. 安装 [msit](https://gitee.com/ascend/msit/tree/master/msit/) 的 surgeon 组件和 benchmark 组件。 - -### 3.3 准备数据集 - -GitHub 仓库没有提供精度和性能的测试手段,这里取仓库里的 demo 图片进行测试。 +### 4. 准备数据集 +- 取仓库里的 demo 图片进行端到端测试。 ```bash mkdir data cd data @@ -106,9 +104,23 @@ wget -O demo.jpg https://raw.githubusercontent.com/facebookresearch/segment-anyt cd .. ``` -### 3.4 模型转换 +- 下载coco2017数据集进行精度测试。 + +下载并解压COCO-2017数据集的[图片](https://gitee.com/link?target=http%3A%2F%2Fimages.cocodataset.org%2Fzips%2Fval2017.zip)与[标注](https://gitee.com/link?target=http%3A%2F%2Fimages.cocodataset.org%2Fannotations%2Fannotations_trainval2017.zip),放置coco2017目录下 + + ``` + coco2017 + ├── annotations/ + │ └── instances_val2017.json + └── val2017/ + ├── 000000000139.jpg + ├── 000000000139.jpg + └── ... + ``` + +### 5. 模型转换 -#### 3.4.1 获取权重文件 +#### 5.1 获取权重文件 GitHub 仓库提供了三种大小的权重文件:vit_h、vit_l、vit_b。这里以 vit_b 为例。 @@ -119,7 +131,7 @@ wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth cd .. ``` -#### 3.4.2 导出 ONNX 模型 +#### 5.2 导出 ONNX 模型 ```bash python3 segment-anything/scripts/export_onnx_model.py \ @@ -140,7 +152,7 @@ python3 segment-anything/scripts/export_onnx_model.py \ - decoder-output:保存decoder模型的输出ONNX模型的文件路径。 - return-single-mask:设置最优mask模式。 -#### 3.4.3 使用 onnxsim 简化 ONNX 模型 +#### 5.3 使用 onnxsim 简化 ONNX 模型 这里以 batchsize=1 为例。 @@ -156,7 +168,7 @@ onnxsim models/decoder.onnx models/decoder_sim.onnx - 第二个参数:简化后的 ONNX 保存路径。 - overwrite-input-shape:指定输入的维度。 -#### 3.4.4 运行改图脚本,修改 ONNX 模型以适配昇腾芯片 +#### 5.4 运行改图脚本,修改 ONNX 模型以适配昇腾芯片 ```bash python3 encoder_onnx_modify.py \ @@ -169,9 +181,9 @@ python3 encoder_onnx_modify.py \ - 第一个参数:原 ONNX 路径。 - 第二个参数:适配后的 ONNX 保存路径。 -#### 3.4.5 使用 ATC 工具将 ONNX 模型转为 OM 模型 +#### 5.5 使用 ATC 工具将 ONNX 模型转为 OM 模型 -1. 配置环境变量。 +- 配置环境变量。 ```bash source /usr/local/Ascend/ascend-toolkit/set_env.sh @@ -180,7 +192,7 @@ python3 encoder_onnx_modify.py \ > **说明:** 该脚本中环境变量仅供参考,请以实际安装环境配置环境变量。详细介绍请参见《[CANN 开发辅助工具指南 \(推理\)](https://support.huawei.com/enterprise/zh/ascend-computing/cann-pid-251168373?category=developer-documents&subcategory=auxiliary-development-tools)》。 -2. 执行命令查看芯片名称($\{chip\_name\})。 +- 执行命令查看芯片名称($\{chip\_name\})。 ```bash npu-smi info @@ -198,7 +210,7 @@ python3 encoder_onnx_modify.py \ +===================+=================+======================================================+ ``` -3. 
执行 atc 命令。 +- 执行 atc 命令。 ```bash atc \ @@ -234,9 +246,9 @@ python3 encoder_onnx_modify.py \ 更多参数说明请参考 [ATC 参数概览](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha002/devaids/auxiliarydevtool/atlasatc_16_0039.html)(如果链接失效,请从 [CANN 社区版文档](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition) 查找目录「应用开发 > ATC 模型转换 > 参数说明 > 参数概览」) -### 3.5 推理验证 +### 6 推理验证 -1. 端到端推理。成功执行下述命令后会在save-path参数指定的目录生成离线推理的结果。 +6.1 端到端推理。成功执行下述命令后会在save-path参数指定的目录生成离线推理的结果。 ```bash python3 sam_end2end_infer.py \ @@ -271,7 +283,7 @@ python3 encoder_onnx_modify.py \ ![](./assets/om_truck_result.JPG) -2. 性能验证。 +6.2 性能验证。 1. encoder 纯推理性能验证。 @@ -305,9 +317,38 @@ python3 encoder_onnx_modify.py \ - loop: 循环次数 - batchsize: 模型batch size -## 4. 模型推理性能 & 精度 +6.3 精度验证。 + +SAM 官方未提供精度评测手段,这里提供对应脚本,基于 COCO 验证集标注框作为输入提示,使用 SAM 预测分割掩码,并与 COCO 标注掩码逐实例进行 IoU 计算,最后对所有实例的 IoU 结果取平均,得到整体的平均交并比(mIoU)。 + + ```bash + python sam_coco_metric.py \ + --dataset-path coco2017 \ + --save-path outputs \ + --encoder-model-path models/encoder_sim.om \ + --decoder-model-path models/decoder_sim.om \ + --device-id 0 \ + --max-instances 0 + ``` + +参数说明: + +- dataset-path: coco数据集目录 +- save-path: SAM预测掩码结果存储路径 +- encoder-model-path:encoder的OM模型路径 +- decoder-model-path:decoder的OM模型路径 +- device-id: 指定推理的NPU设备ID +- max-instances: 评测的最大实例数量,默认为0表示测评完整验证集 +## 4. 模型推理性能 & 精度 +性能结果: | 芯片型号 | 模型 | Batch Size | 性能 | | ---- | ---- | ---- | ---- | | 300I Pro | encoder | 1 | 4.43 fps | | 300I Pro | decoder | 1 | 679.77 fps | + +精度结果: +| 芯片型号 | 模型 | Batch Size | 精度(mIoU) | +| ---- | ---- | ---- | ---- | +| 300I Pro | SAM | 1 | 0.7654 | + diff --git a/ACL_PyTorch/built-in/cv/SAM/sam_coco_metric.py b/ACL_PyTorch/built-in/cv/SAM/sam_coco_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..b8e416deb51e782316d7cd770e6d1dd350ab512b --- /dev/null +++ b/ACL_PyTorch/built-in/cv/SAM/sam_coco_metric.py @@ -0,0 +1,191 @@ +# Copyright 2025 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import os +import argparse + +import cv2 +import numpy as np +from tqdm import tqdm +from pycocotools.coco import COCO +from pycocotools import mask as maskUtils + +from ais_bench.infer.interface import InferSession + +from sam_preprocessing_pytorch import encoder_preprocessing, decoder_preprocessing +from sam_postprocessing_pytorch import sam_postprocessing + + +def rle_to_mask(rle, h, w): + """COCO segmentation → binary mask (h,w) uint8.""" + if isinstance(rle, list): + rles = maskUtils.frPyObjects(rle, h, w) + rle = maskUtils.merge(rles) + elif isinstance(rle, dict) and isinstance(rle.get("counts"), list): + rle = maskUtils.frPyObjects(rle, h, w) + return maskUtils.decode(rle).astype(np.uint8) + + +def compute_iou(pred_mask, gt_mask): + pred = (pred_mask > 0).astype(np.uint8) + gt = (gt_mask > 0).astype(np.uint8) + inter = (pred & gt).sum() + union = (pred | gt).sum() + return float(inter) / float(union) if union > 0 else 0.0 + + +def coco_bbox_to_xyxy(bbox_xywh): + x, y, w, h = bbox_xywh + return [x, y, x + w, y + h] + + +def encoder_infer(session_encoder, x): + encoder_outputs = session_encoder.infer([x]) + image_embedding = encoder_outputs[0] + return image_embedding + + +def decoder_infer(session_decoder, decoder_inputs): + decoder_outputs = session_decoder.infer(decoder_inputs, mode="dymdims", custom_sizes=[1000, 1000000]) + low_res_masks = decoder_outputs[1] + return low_res_masks + + +def save_mask_overlay(masks, image, save_dir, image_name): + overlay = image.copy() + alpha = 0.5 + + for mask in masks: + if mask.sum() == 0: + continue + color = np.random.randint(0, 255, (3,), dtype=np.uint8) # 每个实例随机颜色 + overlay[mask > 0] = (overlay[mask > 0] * (1 - alpha) + color * alpha).astype(np.uint8) + + base, ext = os.path.splitext(image_name) + save_path = os.path.join(save_dir, f"{base}_sam_pre{ext}") + cv2.imwrite(save_path, overlay) + + +def evaluate_sam_on_coco(coco_root, save_path, encoder, decoder, max_instances=0): + ann_file = os.path.join(coco_root, "annotations", "instances_val2017.json") + img_root = os.path.join(coco_root, "val2017") + if not os.path.isfile(ann_file): + raise FileNotFoundError(f"COCO annotations not found: {ann_file}") + if not os.path.isdir(img_root): + raise FileNotFoundError(f"COCO val2017 images not found: {img_root}") + + coco = COCO(ann_file) + img_ids = coco.getImgIds() + + session_encoder = encoder + session_decoder = decoder + + ious = [] + counted = 0 + + for img_id in tqdm(img_ids, desc="Evaluating"): + img_info = coco.loadImgs(img_id)[0] + img_path = os.path.join(img_root, img_info["file_name"]) + image = cv2.imread(img_path) + + H, W = image.shape[:2] + + x = encoder_preprocessing(image) + image_embedding = encoder_infer(session_encoder, x) + + ann_ids = coco.getAnnIds(imgIds=img_id, iscrowd=False) + anns = coco.loadAnns(ann_ids) + + mask_list = [] + for ann in anns: + + if max_instances > 0 and counted >= max_instances: + break + + box_xyxy = coco_bbox_to_xyxy(ann["bbox"]) + + decoder_inputs = decoder_preprocessing(image_embedding, box=box_xyxy, image=image) + low_res_masks = decoder_infer(session_decoder, decoder_inputs) + masks = sam_postprocessing(low_res_masks, image) + + pred2d = masks[0][0].astype(np.uint8) + mask_list.append(pred2d) + pred_bin = pred2d.astype(np.uint8) + + gt_mask = rle_to_mask(ann["segmentation"], H, W) + iou = compute_iou(pred_bin, gt_mask) + ious.append(iou) + counted += 1 + + if save_path is not None and len(mask_list) > 0: + save_mask_overlay(mask_list, image, save_path, img_info["file_name"]) + + if 
max_instances > 0 and counted >= max_instances: + break + + miou = float(np.mean(ious)) if counted > 0 else 0.0 + print("\n=========== COCO Evaluation (Box Prompt) ===========") + print(f"Instances Evaluated : {counted}") + print(f"Mean IoU (mIoU) : {miou:.4f}") + print("====================================================\n") + + +def check_device_range_valid(value): + # if contain , split to int list + min_value = 0 + max_value = 255 + if ',' in value: + ilist = [int(v) for v in value.split(',')] + for ivalue in ilist[:2]: + if ivalue < min_value or ivalue > max_value: + raise argparse.ArgumentTypeError("{} of device:{} is invalid. valid value range is [{}, {}]".format( + ivalue, value, min_value, max_value)) + return ilist[:2] + else: + # default as single int value + ivalue = int(value) + if ivalue < min_value or ivalue > max_value: + raise argparse.ArgumentTypeError("device:{} is invalid. valid value range is [{}, {}]".format( + ivalue, min_value, max_value)) + return ivalue + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--dataset-path', type=str, default='./datasets/', help='input path to coco dataset') + parser.add_argument('--save-path', type=str, default=None, help='output path to image') + parser.add_argument('--encoder-model-path', type=str, default='./models/encoder_sim.om', help='path to encoder model') + parser.add_argument('--decoder-model-path', type=str, default='./models/decoder_sim.om', help='path to decoder model') + parser.add_argument('--device-id', type=check_device_range_valid, default=0, help='NPU device id.') + parser.add_argument('--max-instances', type=int, default=0, help='Maximum number of instances to evaluate (0 = all).') + args = parser.parse_args() + + if args.save_path and not os.path.exists(args.save_path): + os.makedirs(os.path.realpath(args.save_path), mode=0o744) + + session_encoder = InferSession(args.device_id, args.encoder_model_path) + session_decoder = InferSession(args.device_id, args.decoder_model_path) + + evaluate_sam_on_coco( + args.dataset_path, + args.save_path, + session_encoder, + session_decoder, + max_instances=args.max_instances + ) + +if __name__ == "__main__": + main() + diff --git a/ACL_PyTorch/built-in/cv/SAM/sam_end2end_infer.py b/ACL_PyTorch/built-in/cv/SAM/sam_end2end_infer.py index 25db4ffd006b5603580ff1206c7326dfdeb7f797..952c95520dc5c0ec4829c6ce8dfce3064569db87 100644 --- a/ACL_PyTorch/built-in/cv/SAM/sam_end2end_infer.py +++ b/ACL_PyTorch/built-in/cv/SAM/sam_end2end_infer.py @@ -69,11 +69,11 @@ def decoder_infer(session_decoder, decoder_inputs): return low_res_masks -def sam_infer(src_path, session_encoder, session_decoder, input_point, save_path): +def sam_infer(src_path, session_encoder, session_decoder, input_point=None, box=None, save_path="./"): image = cv2.imread(src_path) x = encoder_preprocessing(image) image_embedding = encoder_infer(session_encoder, x) - decoder_inputs = decoder_preprocessing(image_embedding, input_point, image) + decoder_inputs = decoder_preprocessing(image_embedding, input_point=input_point, box=box, image=image) low_res_masks = decoder_infer(session_decoder, decoder_inputs) masks = sam_postprocessing(low_res_masks, image) save_mask(masks, image, src_path, save_path, random_color=True) @@ -95,8 +95,7 @@ def main(): session_encoder = InferSession(args.device_id, args.encoder_model_path) session_decoder = InferSession(args.device_id, args.decoder_model_path) - sam_infer(args.src_path, session_encoder, session_decoder, args.input_point, args.save_path) - + 
sam_infer(args.src_path, session_encoder, session_decoder, input_point=args.input_point, save_path=args.save_path) if __name__ == '__main__': main() diff --git a/ACL_PyTorch/built-in/cv/SAM/sam_preprocessing_pytorch.py b/ACL_PyTorch/built-in/cv/SAM/sam_preprocessing_pytorch.py index d8b5a1474207a1fc4f554beac8d5f5bbbcbd2979..3afa5e11fed50da8f2ffb0849db295cf3df1ba7e 100644 --- a/ACL_PyTorch/built-in/cv/SAM/sam_preprocessing_pytorch.py +++ b/ACL_PyTorch/built-in/cv/SAM/sam_preprocessing_pytorch.py @@ -35,12 +35,27 @@ def encoder_preprocessing(image): return image -def decoder_preprocessing(image_embedding, input_point, image): - input_point = np.array(input_point) - input_label = [1] * len(input_point) - input_label = np.array(input_label) - onnx_coord = np.concatenate([input_point, np.array([[0.0, 0.0]])], axis=0)[None, :, :] - onnx_label = np.concatenate([input_label, np.array([-1])], axis=0)[None, :].astype(np.float32) +def decoder_preprocessing(image_embedding, input_point=None, box=None, image=None): + coords_list = [] + labels_list = [] + + if input_point is not None and len(input_point) > 0: + input_point = np.array(input_point, dtype=np.float32) + input_label = np.ones(len(input_point), dtype=np.float32) + coords_list.append(input_point) + labels_list.append(input_label) + + coords_list.append(np.array([[0.0, 0.0]], dtype=np.float32)) + labels_list.append(np.array([-1], dtype=np.float32)) + + if box is not None: + box = np.array(box, dtype=np.float32).reshape(2, 2) + coords_list.append(box) + labels_list.append(np.array([2, 3], dtype=np.float32)) + + onnx_coord = np.concatenate(coords_list, axis=0)[None, :, :] + onnx_label = np.concatenate(labels_list, axis=0)[None, :].astype(np.float32) + transform = ResizeLongestSide(IMAGE_SIZE) onnx_coord = transform.apply_coords(onnx_coord, image.shape[: 2]).astype(np.float32) onnx_mask_input = np.zeros((1, 1, 256, 256), dtype=np.float32)
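
A note on the prompt encoding that the updated `decoder_preprocessing` and the new `sam_coco_metric.py` rely on: the exported SAM decoder takes point prompts with label 1 (padded with a `(0, 0)` point labelled -1), while a box prompt is encoded as its two corners with labels 2 and 3, following the convention from the segment-anything ONNX example. The sketch below is a minimal, self-contained illustration of that packing for a COCO-style `xywh` box, plus the per-instance IoU that the evaluation script averages into mIoU. It assumes only NumPy; the helper names `build_box_prompt` and `mask_iou` are illustrative and not part of the repository, and the real pipeline additionally rescales the coordinates with `ResizeLongestSide` before inference, which is omitted here.

```python
# Minimal sketch (NumPy only): encode a COCO [x, y, w, h] box as SAM ONNX
# decoder point/label inputs, and score a predicted mask against a ground-truth
# mask with IoU. Helper names here are illustrative, not from the repo.
import numpy as np


def build_box_prompt(bbox_xywh):
    """Encode a COCO [x, y, w, h] box as decoder coordinate/label tensors."""
    x, y, w, h = bbox_xywh
    # Box corners use labels 2 (top-left) and 3 (bottom-right).
    coords = np.array([[x, y], [x + w, y + h]], dtype=np.float32)
    labels = np.array([2, 3], dtype=np.float32)
    # Shape to (1, N, 2) and (1, N), matching the exported decoder's inputs.
    return coords[None, :, :], labels[None, :]


def mask_iou(pred_mask, gt_mask):
    """IoU between two binary masks of identical HxW shape."""
    pred = pred_mask > 0
    gt = gt_mask > 0
    union = np.logical_or(pred, gt).sum()
    if union == 0:
        return 0.0
    return float(np.logical_and(pred, gt).sum()) / float(union)


if __name__ == "__main__":
    onnx_coord, onnx_label = build_box_prompt([10, 20, 30, 40])
    print(onnx_coord.shape, onnx_label)   # (1, 2, 2) [[2. 3.]]

    a = np.zeros((8, 8), dtype=np.uint8)
    a[2:6, 2:6] = 1
    b = np.zeros((8, 8), dtype=np.uint8)
    b[3:7, 3:7] = 1
    print(round(mask_iou(a, b), 4))       # intersection 9 / union 23 ≈ 0.3913
```

The reported mIoU is simply the mean of these per-instance IoU values over every evaluated COCO annotation, as implemented in `evaluate_sam_on_coco` above.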