From 5df852635afeb30a4ae55ec8349c1f1beaa31a2b Mon Sep 17 00:00:00 2001 From: mahaoliang Date: Sun, 27 Jul 2025 17:48:07 +0800 Subject: [PATCH 1/6] Add SDF generation functionality and related components - Implement SDFGenerator class for generating Slice Definition Files (SDF). - Create DependencyAnalyzer for analyzing binary dependencies. - Add EnvironmentManager for managing package installation and cleanup. - Introduce SDFWriter for building and writing SDF data to YAML files. - Add classifier for categorizing files into slices. - Implement command-line interface for SDF generation. - Update README with new system dependencies. - Add unit tests for SDFWriter functionality. --- README.md | 2 +- tests/python/writer_test.py | 78 ++++++++++ tools/cmd/gen.py | 45 ++++++ tools/generator/__init__.py | 0 tools/generator/classifier.py | 102 ++++++++++++ tools/generator/dependency_analyzer.py | 208 +++++++++++++++++++++++++ tools/generator/environment.py | 96 ++++++++++++ tools/generator/sdfgenerator.py | 86 ++++++++++ tools/generator/writer.py | 124 +++++++++++++++ tools/main.py | 2 + 10 files changed, 742 insertions(+), 1 deletion(-) create mode 100644 tests/python/writer_test.py create mode 100644 tools/cmd/gen.py create mode 100644 tools/generator/__init__.py create mode 100644 tools/generator/classifier.py create mode 100644 tools/generator/dependency_analyzer.py create mode 100644 tools/generator/environment.py create mode 100644 tools/generator/sdfgenerator.py create mode 100644 tools/generator/writer.py diff --git a/README.md b/README.md index 849f423..acc00db 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ splitter处于开发阶段,当前仅支持在openEuler上部署(建议使用 1. 安装系统依赖 ``` -dnf install python3-dnf git python3-pip cpio +dnf install python3-dnf git python3-pip cpio binutils ``` 2. 
克隆源码仓库 diff --git a/tests/python/writer_test.py b/tests/python/writer_test.py new file mode 100644 index 0000000..ffe28e8 --- /dev/null +++ b/tests/python/writer_test.py @@ -0,0 +1,78 @@ +import unittest +from tools.generator.writer import SDFWriter + + +class TestCompressPaths(unittest.TestCase): + def setUp(self): + self.writer = SDFWriter( + output="test_output", + package_name="test_package", + classified_slices={}, + slice_deps={}, + ) + + def test_empty_set(self): + """Test an empty set of files""" + result = self.writer._compress_paths(set()) + self.assertEqual(result, set()) + + def test_no_so_files(self): + """Test the case without .so files""" + files = {"file1.txt", "dir/file2.py"} + result = self.writer._compress_paths(files) + self.assertEqual(result, files) + + def test_single_so_file(self): + """Test a single .so file""" + files = {"libtest.so"} + result = self.writer._compress_paths(files) + self.assertEqual(result, files) + + def test_multiple_unrelated_so_files(self): + """Test multiple unrelated .so files""" + files = {"libA.so", "libB.so", "libC.so"} + result = self.writer._compress_paths(files) + self.assertEqual(result, files) + + def test_versioned_libs(self): + """Test compression of versioned library files""" + files = { + "libtest.so.1", + "libtest.so.1.2", + "libtest.so.1.2.3", + "other.so", + } + expected = {"libtest.so.1*", "other.so"} + result = self.writer._compress_paths(files) + self.assertEqual(result, expected) + + def test_mixed_files(self): + """Test mixed file types (.so and other files)""" + files = { + "libtest.so.1", + "libtest.so.1.2", + "file.txt", + "libother.so", + "script.py", + } + expected = {"libtest.so.1*", "libother.so", "file.txt", "script.py"} + result = self.writer._compress_paths(files) + self.assertEqual(result, expected) + + def test_multiple_version_groups(self): + """Test multiple groups of versioned library files""" + files = {"libA.so.1", "libA.so.1.2", "libB.so.1", "libB.so.1.2", "libC.so"} + expected = {"libA.so.1*", "libB.so.1*", "libC.so"} + result = self.writer._compress_paths(files) + self.assertEqual(result, expected) + + def test_partial_matches(self): + """Test partial but not exact matches""" + files = {"libtest.so.1", "libtestX.so.1.2", "libtest.so.1.2"} + expected = {"libtest.so.1*", "libtestX.so.1.2"} + result = self.writer._compress_paths(files) + self.assertEqual(result, expected) + + +if __name__ == "__main__": + unittest.main() diff --git a/tools/cmd/gen.py b/tools/cmd/gen.py new file mode 100644 index 0000000..8a62c54 --- /dev/null +++ b/tools/cmd/gen.py @@ -0,0 +1,45 @@ +import click +import platform + +from tools.generator.sdfgenerator import SDFGenerator + + +@click.command( + name="gen", + help="Automatically generate a Slice Definition File (SDF) for an openEuler package.", +) +@click.option( + "-r", + "--release", + required=True, + help="This decides which openEuler release you will use, such as `openEuler-24.03-LTS-SP1`.", +) +@click.option( + "-a", + "--arch", + default=None, + help="The architecture. If not provided, it will be auto-detected from the host machine.", +) +@click.option( + "-o", "--output", default=".", help="The directory to save the generated SDF file." +) +@click.option( + "-p", + "--package", + required=True, + help="The name of the RPM package to generate an SDF for (e.g., 'attr').", +) +def gen(release, arch, output, package): + """ + CLI command to orchestrate SDF generation. 
+ """ + if not arch: + arch = platform.machine() + click.echo(f"Architecture not specified, auto-detected: {arch}") + + click.echo( + f"Starting SDF generation for '{package}' on openEuler-{release} ({arch})..." + ) + + generator = SDFGenerator(release=release, arch=arch, output=output, package=package) + generator.gen() diff --git a/tools/generator/__init__.py b/tools/generator/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tools/generator/classifier.py b/tools/generator/classifier.py new file mode 100644 index 0000000..717c882 --- /dev/null +++ b/tools/generator/classifier.py @@ -0,0 +1,102 @@ +from collections import defaultdict +import os +from pathlib import Path + +# Slice Type Definitions +SLICE_TYPE_COPYRIGHT = "_copyright" +SLICE_TYPE_CONFIG = "_config" +SLICE_TYPE_BINS = "_bins" +SLICE_TYPE_LIBS = "_libs" +SLICE_TYPE_FILES = "_files" + +# Rule Definition +CLASSIFICATION_RULES = [ + (SLICE_TYPE_COPYRIGHT, ("is_copyright", True)), + (SLICE_TYPE_CONFIG, ("prefix", "/etc/")), + ( + SLICE_TYPE_BINS, + ("prefix", "/usr/bin/"), + ("prefix", "/usr/sbin/"), + ("prefix", "/usr/libexec/"), + ), + (SLICE_TYPE_LIBS, ("is_library", True)), + (SLICE_TYPE_FILES, ("is_other", True)), +] + +MATCH_FUNCTIONS = { + "prefix": str.startswith, + "is_copyright": lambda path, _: _is_copyright(path), + "is_library": lambda path, _: _is_library(path), + "is_other": lambda path, _: _is_other(path), +} + +IGNORE_PREFIXES = [ + "/etc/ima/", +] + + +def _is_copyright(filepath: str) -> bool: + """ + Checks if a file is a copyright/license file. + This includes standard paths and special cases found in doc directories. + """ + # Standard prefix-based check + if filepath.startswith("/usr/share/licenses/"): + return True + + # Case-insensitive check for common license filenames + filename = Path(filepath).name.lower() + if filename in ("license", "copying", "copyright", "notice"): + # If the filename itself suggests it's a license, it's a strong indicator. + # This will catch cases like '/usr/share/doc/c-ares/LICENSE.md' + return True + + return False + + +def _is_library(filepath: str) -> bool: + """Checks if a file is a shared library.""" + return ".so" in filepath and ( + filepath.startswith("/usr/lib") or filepath.startswith("/lib") + ) + + +def _is_other(filepath: str) -> bool: + """Default rule that always matches.""" + return True + + +def classify_files(package_name: str, files: list[str]) -> dict[str, set[str]]: + """ + Classifies a list of files into slices based on a defined set of rules. + + Args: + package_name: The name of the package. + files: A list of file paths from the RPM. + + Returns: + A dictionary mapping slice names to a set of file paths. 
+ """ + classified_slices = defaultdict(set) + for filepath in files: + # Skip files that start with ignored prefixes + if any(filepath.startswith(prefix) for prefix in IGNORE_PREFIXES): + continue + + slice_type_suffix = SLICE_TYPE_FILES + for rule_suffix, *conditions in CLASSIFICATION_RULES: + is_match = False + for match_type, pattern in conditions: + checker_func = MATCH_FUNCTIONS.get(match_type) + if checker_func and checker_func(filepath, pattern): + is_match = True + break + + if is_match: + slice_type_suffix = rule_suffix + break + + slice_name = f"{package_name}{slice_type_suffix}" + classified_slices[slice_name].add(os.path.normpath(filepath)) + + return classified_slices diff --git a/tools/generator/dependency_analyzer.py b/tools/generator/dependency_analyzer.py new file mode 100644 index 0000000..37a2587 --- /dev/null +++ b/tools/generator/dependency_analyzer.py @@ -0,0 +1,208 @@ +import subprocess +import re +import os +from collections import defaultdict +from tools.logger import logger +from tools.generator.classifier import ( + SLICE_TYPE_BINS, + SLICE_TYPE_LIBS, + SLICE_TYPE_CONFIG, +) + + +class DependencyAnalyzer: + """ + Analyzes binary dependencies for a given package. + """ + + def __init__( + self, + package_name: str, + pkg_extract_path: str, + classified_slices: dict[str, set[str]], + ): + self.package_name = package_name + self.pkg_extract_path = pkg_extract_path + self.classified_slices = classified_slices + + # Caches to improve performance + self._file_to_pkg_cache = {} + self._ldconfig_cache = None + + def analyze(self) -> tuple[dict[str, set[str]], set[str]]: + """ + Main entry point for the analysis. + Orchestrates the process of analyzing dependencies for all relevant slices. + + Returns: + A tuple containing: + - A dictionary mapping slice names to sets of dependent slices. + - A set of invalid files that could not be processed. + """ + # Step 1: Analyze external binary dependencies (existing logic) + slice_deps, invalid_files = self._analyze_external_dependencies() + + # Step 2: Inject internal dependencies (new logic) + self._inject_internal_dependencies(slice_deps) + + return slice_deps, invalid_files + + def _analyze_external_dependencies(self) -> tuple[dict[str, set[str]], set[str]]: + """ + Analyzes dependencies on external packages using readelf. + This method checks for shared libraries used by binaries and maps them to their owning packages. + Returns: + A tuple containing: + - A dictionary mapping slice names to sets of dependent slices. + - A set of invalid files that could not be processed. + """ + + slice_deps = defaultdict(set) + invalid_files = set() + + # 1. Pre-load the ldconfig cache once for the entire analysis + self._load_ldconfig_cache() + + for slice_name, file_set in self.classified_slices.items(): + if not ( + slice_name.endswith(SLICE_TYPE_BINS) + or slice_name.endswith(SLICE_TYPE_LIBS) + ): + continue + + for file in file_set: + full_file_path = os.path.join(self.pkg_extract_path, file.lstrip("/")) + + if not os.path.isfile(full_file_path): + continue + + # 2. Get dependencies for a single binary file + needed_libs, is_valid_elf = self._get_needed_libraries(full_file_path) + if not is_valid_elf: + logger.info( + f"'{os.path.basename(file)}' is not a valid ELF file. It will be excluded from the SDF." + ) + invalid_files.add(file) + continue + + for lib_name in needed_libs: + # 3. 
Resolve the library name to a package owner + owner_pkg = self._resolve_library_to_package(lib_name) + + if owner_pkg and owner_pkg != self.package_name: + dep_slice = f"{owner_pkg}_libs" + logger.debug( + f"'{slice_name}' dependency found: {file} -> {lib_name} ({dep_slice})" + ) + slice_deps[slice_name].add(dep_slice) + + return slice_deps, invalid_files + + def _inject_internal_dependencies(self, slice_deps: dict[str, set[str]]): + """ + Adds dependencies between slices of the same package (e.g., bins -> config). + """ + logger.debug("Injecting internal dependencies...") + + # Construct the names of the potential slices within this package + config_slice_name = f"{self.package_name}{SLICE_TYPE_CONFIG}" + bins_slice_name = f"{self.package_name}{SLICE_TYPE_BINS}" + libs_slice_name = f"{self.package_name}{SLICE_TYPE_LIBS}" + + # Check if a config slice exists for this package + if config_slice_name in self.classified_slices: + # If bins slice exists, make it depend on the config slice + if bins_slice_name in self.classified_slices: + logger.info( + f"Adding internal dependency: {bins_slice_name} -> {config_slice_name}" + ) + slice_deps[bins_slice_name].add(config_slice_name) + + # If libs slice exists, make it depend on the config slice + if libs_slice_name in self.classified_slices: + logger.info( + f"Adding internal dependency: {libs_slice_name} -> {config_slice_name}" + ) + slice_deps[libs_slice_name].add(config_slice_name) + + def _load_ldconfig_cache(self): + """ + Executes `ldconfig -p` once and caches its output. + """ + logger.debug("Loading ldconfig cache...") + if self._ldconfig_cache is None: + try: + ldconfig_proc = subprocess.run( + ["ldconfig", "-p"], capture_output=True, text=True, check=True + ) + self._ldconfig_cache = ldconfig_proc.stdout + except (subprocess.CalledProcessError, FileNotFoundError): + logger.error( + "`ldconfig -p` failed. Dependency analysis will be severely impacted." + ) + self._ldconfig_cache = "" # Set to empty string to avoid re-running + + def _get_pkg_owner(self, file_path: str) -> str: + """Finds the package that owns a file using an instance cache.""" + if file_path in self._file_to_pkg_cache: + return self._file_to_pkg_cache[file_path] + try: + rpm_qf_proc = subprocess.run( + ["rpm", "-qf", file_path], capture_output=True, text=True, check=True + ) + pkg_full_name = rpm_qf_proc.stdout.strip() + rpm_q_qf_proc = subprocess.run( + ["rpm", "-q", "--qf", "%{NAME}", pkg_full_name], + capture_output=True, + text=True, + check=True, + ) + owner_pkg = rpm_q_qf_proc.stdout.strip() + self._file_to_pkg_cache[file_path] = owner_pkg + return owner_pkg + except (subprocess.CalledProcessError, FileNotFoundError): + self._file_to_pkg_cache[file_path] = "" + return "" + + def _get_needed_libraries(self, binary_path: str) -> tuple[list[str], bool]: + """ + Runs `readelf -d` on a binary and returns a list of its needed libraries. + """ + needed = [] + try: + readelf_result = subprocess.run( + ["readelf", "-d", binary_path], + capture_output=True, + text=True, + check=True, + encoding="utf-8", + errors="ignore", + ) + for line in readelf_result.stdout.strip().split("\n"): + if "(NEEDED)" in line: + match = re.search(r"\[(.*)\]", line) + if match: + needed.append(match.group(1)) + return needed, True + except subprocess.CalledProcessError as e: + return [], False + + def _resolve_library_to_package(self, lib_name: str) -> str: + """ + Resolves a library name (e.g., 'libc.so.6') to its owning package name (e.g., 'glibc'). 
+ Uses the ldconfig cache to find the library path. + """ + if not self._ldconfig_cache: + return "" + + lib_path_match = re.search( + rf"\s+{re.escape(lib_name)}\s*.*=>\s*(/.*)", self._ldconfig_cache + ) + if not lib_path_match: + logger.warning( + f"Could not find path for library '{lib_name}' in ldconfig cache." + ) + return "" + + lib_path = lib_path_match.group(1) + return self._get_pkg_owner(lib_path) diff --git a/tools/generator/environment.py b/tools/generator/environment.py new file mode 100644 index 0000000..c7c8b48 --- /dev/null +++ b/tools/generator/environment.py @@ -0,0 +1,96 @@ +import subprocess +from tools.logger import logger + + +class EnvironmentManager: + """ + A context manager to prepare and clean up the system environment + for SDF analysis. + """ + + def __init__(self, package_to_install: str): + self.package_to_install = package_to_install + self.newly_installed_packages = [] + + def __enter__(self): + """ + The 'prepare' stage. This is executed when entering the 'with' block. + It installs dependencies and records what was installed. + """ + logger.info("Preparing environment by installing dependencies...") + + try: + # Get the list of packages BEFORE installation + before_install_proc = subprocess.run( + ["rpm", "-qa", "--qf", "%{NAME}\n"], + capture_output=True, + text=True, + check=True, + ) + packages_before = set(before_install_proc.stdout.strip().split("\n")) + + # Install the target package and its dependencies + subprocess.run( + ["dnf", "install", "-y", self.package_to_install], check=True + ) + + # Get the list of packages AFTER installation + after_install_proc = subprocess.run( + ["rpm", "-qa", "--qf", "%{NAME}\n"], + capture_output=True, + text=True, + check=True, + ) + packages_after = set(after_install_proc.stdout.strip().split("\n")) + + # Find the difference to know what to clean up + self.newly_installed_packages = sorted( + list(packages_after - packages_before) + ) + + if self.newly_installed_packages: + logger.info( + f"The following packages were newly installed: {self.newly_installed_packages}" + ) + else: + logger.info( + "No new packages were installed (all dependencies were already met)." + ) + + logger.info("Environment prepared successfully.") + + except subprocess.CalledProcessError as e: + logger.error(f"Failed to prepare environment. Command '{e.cmd}' failed.") + logger.error(f"Stderr: {e.stderr}") + # Re-raise to stop the process if preparation fails + raise + + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """ + The 'cleanup' stage. This is executed when exiting the 'with' block, + regardless of whether an exception occurred. + """ + if self.newly_installed_packages: + logger.info( + f"Cleaning up environment by removing {len(self.newly_installed_packages)} package(s)..." + ) + try: + cleanup_command = [ + "dnf", + "remove", + "-y", + ] + self.newly_installed_packages + subprocess.run(cleanup_command, check=True) + logger.info("Environment cleaned up successfully.") + except subprocess.CalledProcessError as e: + logger.error("CRITICAL: Environment cleanup failed!") + logger.error(f"Command '{e.cmd}' failed. 
Stderr: {e.stderr}") + logger.error( + f"The following packages may have been left on the system: {self.newly_installed_packages}" + ) + else: + logger.info("No new packages to clean up.") + + return False diff --git a/tools/generator/sdfgenerator.py b/tools/generator/sdfgenerator.py new file mode 100644 index 0000000..8c4a27b --- /dev/null +++ b/tools/generator/sdfgenerator.py @@ -0,0 +1,86 @@ +import os +import tempfile +from tools.download import rpm +from tools.parse import parse +from tools.splitter.splitter import _architecture_check, _clone_slices +from tools.logger import logger +from tools import SLICE_PATH + +from tools.generator import classifier +from tools.generator.dependency_analyzer import DependencyAnalyzer +from tools.generator.writer import SDFWriter +from tools.generator.environment import EnvironmentManager + + +class SDFGenerator: + """Class to generate SDF files for a given package.""" + + # Class attributes for the generator + package: str + release: str + arch: str + output: str + + def __init__(self, release: str, arch: str, output: str, package: str): + self.release = f"openEuler-{release.upper()}" + self.arch = _architecture_check(arch) + self.output = os.path.abspath(output) + self.package = package + + def gen(self): + """ + Main entry point for generating SDF files. + """ + logger.info(f"===== Starting SDF Generation for: {self.package} =====") + + # The 'with' statement automatically handles setup and teardown + with EnvironmentManager(self.package): + + _clone_slices(self.release, SLICE_PATH) + + # Initialize DNF client for downloading + logger.info(f"Downloading package: {self.package}...") + dnf_client = rpm.init_dnf_client(self.arch, self.release, self.output) + local_pkg_path = rpm.download(dnf_client, self.package) + if not local_pkg_path: + logger.error(f"Failed to download package {self.package}.") + rpm.clear(dnf_client) + return + logger.info(f"Package downloaded to: {local_pkg_path}") + + # extracting RPM files + pkg_dir = tempfile.TemporaryDirectory() + logger.info(f"Extracting {local_pkg_path} to {pkg_dir} for analysis...") + parse.extract_files(local_pkg_path, pkg_dir.name, ["/*"]) + pkg_files = get_pkg_files(pkg_dir.name) + + # Classify files into slices + classified_slices = classifier.classify_files(self.package, pkg_files) + for slice_name, files in classified_slices.items(): + logger.info(f"Slice '{slice_name}' contains {files} ") + + # Analyze dependencies + analyzer = DependencyAnalyzer(self.package, pkg_dir.name, classified_slices) + slice_deps, invalid_files = analyzer.analyze() + for slice_name, deps in slice_deps.items(): + logger.info(f"Slice '{slice_name}' depends on: {deps}") + + # Write the SDF file + writer = SDFWriter( + self.output, self.package, classified_slices, slice_deps, invalid_files + ) + writer.write() + + rpm.clear(dnf_client) + + logger.info(f"===== Finished SDF Generation for: {self.package} =====") + + +def get_pkg_files(path): + files = [] + for root, _, filenames in os.walk(path): + for filename in filenames: + full_path = os.path.join(root, filename) + rel_path = os.path.relpath(full_path, path) + files.append(f"/{rel_path}") + return files diff --git a/tools/generator/writer.py b/tools/generator/writer.py new file mode 100644 index 0000000..a1f14ea --- /dev/null +++ b/tools/generator/writer.py @@ -0,0 +1,124 @@ +import yaml +from pathlib import Path +from tools.logger import logger + +from tools.generator.classifier import SLICE_TYPE_FILES + + +class SDFWriter: + """ + Builds the SDF data structure and writes 
it to a YAML file. + """ + + def __init__( + self, + output: str, + package_name: str, + classified_slices: dict[str, set[str]], + slice_deps: dict[str, set[str]], + invalid_files: set[str] = None, + ): + self.output_path = Path(output) / f"{package_name}.yaml" + self.package_name = package_name + self.classified_slices = classified_slices + self.slice_deps = slice_deps + self.invalid_files = invalid_files or set() + + def write(self): + """ + Main entry point to build the data and write the file. + """ + sdf_data = self._build_sdf_structure() + + self.output_path.parent.mkdir(parents=True, exist_ok=True) + with open(self.output_path, "w", encoding="utf-8") as f: + yaml.dump(sdf_data, f, indent=2, sort_keys=False, default_flow_style=False) + + logger.info(f"SDF file written to: {self.output_path}") + + def _build_sdf_structure(self) -> dict: + """ + Assembles the final SDF data dictionary. + """ + copyright_slice_name = f"{self.package_name}_copyright" + sdf_data = { + "package": self.package_name, + "deps": ( + [copyright_slice_name] + if copyright_slice_name in self.classified_slices + else [] + ), + "slices": {}, + } + + files_slice_name = f"{self.package_name}{SLICE_TYPE_FILES}" + for slice_name, files in sorted(self.classified_slices.items()): + if slice_name == files_slice_name: + logger.debug(f"Ignoring '{slice_name}' slice from the final output.") + continue + + # Remove any files that were identified as invalid + valid_files = files - self.invalid_files + if not valid_files: + logger.info( + f"Slice '{slice_name}' is empty after filtering invalid files, excluding from SDF." + ) + continue + + short_slice_name = slice_name.replace(f"{self.package_name}_", "", 1) + slice_content = {} + + if self.slice_deps.get(slice_name): + slice_content["deps"] = sorted(list(self.slice_deps[slice_name])) + + # Apply path compression before adding to the structure + compressed_files = self._compress_paths(valid_files) + slice_content["contents"] = {"common": sorted(list(compressed_files))} + + sdf_data["slices"][short_slice_name] = slice_content + + return sdf_data + + def _compress_paths(self, file_set: set[str]) -> set[str]: + """ + Performs a robust path compression for versioned shared libraries. + This version uses a direct prefix matching approach. + """ + # We only attempt to compress files that look like libraries + libs = sorted([f for f in file_set if ".so" in f], key=len) + other_files = {f for f in file_set if ".so" not in f} + + if not libs: + return other_files + + # The core idea: iterate through the sorted libs. If a lib is a prefix + # of subsequent libs, it becomes a candidate for a wildcard. + compressed_libs = set() + + # Use a boolean array to mark which libraries have been consumed + # by a wildcard prefix. + consumed = [False] * len(libs) + + for i in range(len(libs)): + if consumed[i]: + continue + + # The current library is a potential prefix + prefix = libs[i] + is_prefix_for_others = False + + for j in range(i + 1, len(libs)): + if libs[j].startswith(prefix): + # If we find at least one longer file that starts with our prefix, + # it confirms this is a valid compression case. 
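+ # For example (see test_versioned_libs above): "libtest.so.1" consumes
+ # "libtest.so.1.2" and "libtest.so.1.2.3", and "libtest.so.1*" is emitted.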
+ is_prefix_for_others = True + consumed[j] = True # Mark the longer path as consumed + + if is_prefix_for_others: + # Add the prefix with a wildcard + compressed_libs.add(f"{prefix}*") + else: + # If it wasn't a prefix for any other lib, add it as is + compressed_libs.add(prefix) + + return compressed_libs.union(other_files) diff --git a/tools/main.py b/tools/main.py index fa535a4..678e0e4 100644 --- a/tools/main.py +++ b/tools/main.py @@ -1,5 +1,6 @@ import click from tools.cmd.cut import cut +from tools.cmd.gen import gen @click.group(help=""" @@ -12,6 +13,7 @@ def entrance(): def _add_commands(): # Unified interface for extension. entrance.add_command(cut) + entrance.add_command(gen) def main(): _add_commands() -- Gitee From 1ca11de8811bb8fe5b8173f6028270f39e50e48c Mon Sep 17 00:00:00 2001 From: mahaoliang Date: Tue, 29 Jul 2025 12:19:25 +0800 Subject: [PATCH 2/6] fix(tools): enhance ELF file validation in dependency analysis - Pre-check files with `file` command to ensure they are ELF binaries - Skip dependency analysis for non-ELF files - Improve error handling for `file` and `readelf` commands - Provide clearer logging and error messages --- tools/generator/dependency_analyzer.py | 52 +++++++++++++++++++++++--- 1 file changed, 47 insertions(+), 5 deletions(-) diff --git a/tools/generator/dependency_analyzer.py b/tools/generator/dependency_analyzer.py index 37a2587..43fc755 100644 --- a/tools/generator/dependency_analyzer.py +++ b/tools/generator/dependency_analyzer.py @@ -79,9 +79,6 @@ class DependencyAnalyzer: # 2. Get dependencies for a single binary file needed_libs, is_valid_elf = self._get_needed_libraries(full_file_path) if not is_valid_elf: - logger.info( - f"'{os.path.basename(file)}' is not a valid ELF file. It will be excluded from the SDF." - ) invalid_files.add(file) continue @@ -166,8 +163,44 @@ class DependencyAnalyzer: def _get_needed_libraries(self, binary_path: str) -> tuple[list[str], bool]: """ - Runs `readelf -d` on a binary and returns a list of its needed libraries. + Runs `readelf -d` on a binary after pre-checking with `file` command. + Returns: + A tuple containing: + - A list of needed libraries. + - A boolean indicating if the binary is a valid ELF file. """ + # Step 1: Pre-check the file type using the `file` command. + try: + file_proc = subprocess.run( + ["file", "-L", binary_path], + capture_output=True, + text=True, + check=True, + encoding="utf-8", + errors="ignore", + ) + file_type_desc = file_proc.stdout.lower() + + # If the file is not an ELF binary, skip dependency analysis. + if "elf" not in file_type_desc: + logger.info( + f"Skipping dependency analysis for non-ELF file: " + f"{os.path.basename(binary_path)} ({file_type_desc.strip()})" + ) + return [], False + + except FileNotFoundError as e: + raise RuntimeError( + "FATAL: `file` command not found. Please ensure the `file` " + "package is installed." + ) from e + except subprocess.CalledProcessError as e: + raise RuntimeError( + f"`file` command failed on {os.path.basename(binary_path)}: " + f"{e.stderr.strip()}. Halting analysis." + ) from e + + # Step 2: If it's an ELF binary, run `readelf -d` to get needed libraries. needed = [] try: readelf_result = subprocess.run( @@ -184,8 +217,17 @@ class DependencyAnalyzer: if match: needed.append(match.group(1)) return needed, True + except FileNotFoundError as e: + raise RuntimeError( + "FATAL: `readelf` command not found. Please ensure the `binutils` " + "package is installed." 
+ ) from e except subprocess.CalledProcessError as e: - return [], False + raise RuntimeError( + f"readelf failed on '{os.path.basename(binary_path)}' " + f"which was identified as an ELF file. The file might be corrupted. " + f"Halting analysis. Error: {e.stderr.strip()}" + ) from e def _resolve_library_to_package(self, lib_name: str) -> str: """ -- Gitee From f153e103061c8f1bd67a464eced8c260b294274c Mon Sep 17 00:00:00 2001 From: mahaoliang Date: Tue, 29 Jul 2025 12:40:07 +0800 Subject: [PATCH 3/6] refactor(tools): improve code readability and structure in DependencyAnalyzer --- tools/generator/dependency_analyzer.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tools/generator/dependency_analyzer.py b/tools/generator/dependency_analyzer.py index 43fc755..5cbdcfc 100644 --- a/tools/generator/dependency_analyzer.py +++ b/tools/generator/dependency_analyzer.py @@ -60,9 +60,10 @@ class DependencyAnalyzer: slice_deps = defaultdict(set) invalid_files = set() - # 1. Pre-load the ldconfig cache once for the entire analysis + # Pre-load the ldconfig cache once for the entire analysis self._load_ldconfig_cache() + # Iterate through classified slices to find binaries and libraries for slice_name, file_set in self.classified_slices.items(): if not ( slice_name.endswith(SLICE_TYPE_BINS) @@ -76,14 +77,14 @@ class DependencyAnalyzer: if not os.path.isfile(full_file_path): continue - # 2. Get dependencies for a single binary file + # Get dependencies for a single binary file needed_libs, is_valid_elf = self._get_needed_libraries(full_file_path) if not is_valid_elf: invalid_files.add(file) continue for lib_name in needed_libs: - # 3. Resolve the library name to a package owner + # Resolve the library name to a package owner owner_pkg = self._resolve_library_to_package(lib_name) if owner_pkg and owner_pkg != self.package_name: -- Gitee From fff2704d3e6b4f88f8409cb05082abb065fab597 Mon Sep 17 00:00:00 2001 From: mahaoliang Date: Fri, 1 Aug 2025 22:59:11 +0800 Subject: [PATCH 4/6] =?UTF-8?q?=E9=87=8D=E6=9E=84=EF=BC=9A=E4=BC=98?= =?UTF-8?q?=E5=8C=96=E6=96=87=E4=BB=B6=E5=88=86=E7=B1=BB=E4=B8=8E=E4=BE=9D?= =?UTF-8?q?=E8=B5=96=E5=88=86=E6=9E=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 在文件分类时,剔除无法归类的文件。 - 对于 bin 和 lib 中的文件,不是简单的使用扩展名 .so 进行判断,而是用 file 判断是否为 ELF 类型。 --- tools/generator/classifier.py | 136 +++++++++++++++---------- tools/generator/dependency_analyzer.py | 120 +++++++--------------- tools/generator/sdfgenerator.py | 27 ++--- tools/generator/writer.py | 18 +--- 4 files changed, 132 insertions(+), 169 deletions(-) diff --git a/tools/generator/classifier.py b/tools/generator/classifier.py index 717c882..ac6bbe9 100644 --- a/tools/generator/classifier.py +++ b/tools/generator/classifier.py @@ -1,41 +1,43 @@ from collections import defaultdict import os from pathlib import Path +import subprocess # Slice Type Definitions SLICE_TYPE_COPYRIGHT = "_copyright" SLICE_TYPE_CONFIG = "_config" SLICE_TYPE_BINS = "_bins" SLICE_TYPE_LIBS = "_libs" -SLICE_TYPE_FILES = "_files" - -# Rule Definition -CLASSIFICATION_RULES = [ - (SLICE_TYPE_COPYRIGHT, ("is_copyright", True)), - (SLICE_TYPE_CONFIG, ("prefix", "/etc/")), - ( - SLICE_TYPE_BINS, - ("prefix", "/usr/bin/"), - ("prefix", "/usr/sbin/"), - ("prefix", "/usr/libexec/"), - ), - (SLICE_TYPE_LIBS, ("is_library", True)), - (SLICE_TYPE_FILES, ("is_other", True)), -] - -MATCH_FUNCTIONS = { - "prefix": str.startswith, - "is_copyright": lambda path, _: _is_copyright(path), - "is_library": 
lambda path, _: _is_library(path), - "is_other": lambda path, _: _is_other(path), -} - -IGNORE_PREFIXES = [ - "/etc/ima/", -] -def _is_copyright(filepath: str) -> bool: +def _get_file_type_desc(filepath: str) -> str: + """ + Uses `file -L` to get the canonical type of a file. + Returns a lower-cased description string. + """ + if not os.path.exists(filepath): + return "" + try: + file_proc = subprocess.run( + ["file", "-L", filepath], + capture_output=True, + text=True, + check=True, + encoding="utf-8", + errors="ignore", + ) + return file_proc.stdout.lower() + except FileNotFoundError as e: + raise RuntimeError( + "FATAL: `file` command not found. Please ensure the `file` package is installed." + ) from e + except subprocess.CalledProcessError as e: + raise RuntimeError( + f"`file` command failed on {filepath}: {e.stderr.strip()}. Halting analysis." + ) from e + + +def _is_copyright(filepath: str, pkg_root_dir: str) -> bool: """ Checks if a file is a copyright/license file. This includes standard paths and special cases found in doc directories. @@ -46,7 +48,7 @@ def _is_copyright(filepath: str) -> bool: # Case-insensitive check for common license filenames filename = Path(filepath).name.lower() - if filename in ("license", "copying", "copyright", "notice"): + if any(keyword in filename for keyword in ("license", "copying", "copyright", "notice")): # If the filename itself suggests it's a license, it's a strong indicator. # This will catch cases like '/usr/share/doc/c-ares/LICENSE.md' return True @@ -54,48 +56,80 @@ def _is_copyright(filepath: str) -> bool: return False -def _is_library(filepath: str) -> bool: +def _is_config(filepath: str, pkg_root_dir: str) -> bool: + if filepath.startswith("/etc/ima/"): + return False + return filepath.startswith("/etc/") + + +def _is_bin(filepath: str, pkg_root_dir: str) -> bool: + """ + Checks if a file is a binary executable. + """ + if not (filepath.startswith(("/usr/bin/", "/usr/sbin/", "/usr/libexec/"))): + return False + + full_path = os.path.join(pkg_root_dir, filepath.lstrip("/")) + desc = _get_file_type_desc(full_path) + return "elf" in desc and "executable" in desc + + +def _is_library(filepath: str, pkg_root_dir: str) -> bool: """Checks if a file is a shared library.""" - return ".so" in filepath and ( - filepath.startswith("/usr/lib") or filepath.startswith("/lib") - ) + if not (filepath.startswith("/usr/lib") or filepath.startswith("/lib")): + return False + + full_path = os.path.join(pkg_root_dir, filepath.lstrip("/")) + desc = _get_file_type_desc(full_path) + return "elf" in desc and "shared object" in desc + +def _get_pkg_files(pkg_root_dir: str) -> list[str]: + files = [] + for root, _, filenames in os.walk(pkg_root_dir): + for filename in filenames: + full_path = os.path.join(root, filename) + rel_path = os.path.relpath(full_path, pkg_root_dir) + files.append(f"/{rel_path}") + return files -def _is_other(filepath: str) -> bool: - """Default rule that always matches.""" - return True +# Rule Definition +CLASSIFICATION_RULES = [ + (SLICE_TYPE_COPYRIGHT, _is_copyright), + (SLICE_TYPE_CONFIG, _is_config), + (SLICE_TYPE_BINS, _is_bin), + (SLICE_TYPE_LIBS, _is_library), +] -def classify_files(package_name: str, files: list[str]) -> dict[str, set[str]]: + +def classify_files(package_name: str, pkg_root_dir: str) -> dict[str, set[str]]: """ Classifies a list of files into slices based on a defined set of rules. Args: package_name: The name of the package. - files: A list of file paths from the RPM. 
+ pkg_root_dir: The root path where files were extracted. Returns: A dictionary mapping slice names to a set of file paths. """ + files = _get_pkg_files(pkg_root_dir) classified_slices = defaultdict(set) + for filepath in files: - # Skip files that start with ignored prefixes - if any(filepath.startswith(prefix) for prefix in IGNORE_PREFIXES): - continue - slice_type_suffix = SLICE_TYPE_FILES - for rule_suffix, *conditions in CLASSIFICATION_RULES: - is_match = False - for match_type, pattern in conditions: - checker_func = MATCH_FUNCTIONS.get(match_type) - if checker_func and checker_func(filepath, pattern): - is_match = True - break - - if is_match: - slice_type_suffix = rule_suffix + slice_type_suffix = None + + for suffix, checker_func in CLASSIFICATION_RULES: + if checker_func(filepath, pkg_root_dir): + slice_type_suffix = suffix break + # If no rule matched, skip the file + if slice_type_suffix is None: + continue + slice_name = f"{package_name}{slice_type_suffix}" classified_slices[slice_name].add(os.path.normpath(filepath)) diff --git a/tools/generator/dependency_analyzer.py b/tools/generator/dependency_analyzer.py index 5cbdcfc..fb87477 100644 --- a/tools/generator/dependency_analyzer.py +++ b/tools/generator/dependency_analyzer.py @@ -29,46 +29,39 @@ class DependencyAnalyzer: self._file_to_pkg_cache = {} self._ldconfig_cache = None - def analyze(self) -> tuple[dict[str, set[str]], set[str]]: + def analyze(self) -> dict[str, set[str]]: """ Main entry point for the analysis. Orchestrates the process of analyzing dependencies for all relevant slices. Returns: - A tuple containing: - - A dictionary mapping slice names to sets of dependent slices. - - A set of invalid files that could not be processed. + - A dictionary mapping slice names to sets of dependent slices. """ # Step 1: Analyze external binary dependencies (existing logic) - slice_deps, invalid_files = self._analyze_external_dependencies() + slice_deps = self._analyze_external_dependencies() # Step 2: Inject internal dependencies (new logic) self._inject_internal_dependencies(slice_deps) - return slice_deps, invalid_files + return slice_deps - def _analyze_external_dependencies(self) -> tuple[dict[str, set[str]], set[str]]: + def _analyze_external_dependencies(self) -> dict[str, set[str]]: """ Analyzes dependencies on external packages using readelf. - This method checks for shared libraries used by binaries and maps them to their owning packages. + This method checks for shared libraries used by binaries + and maps them to their owning packages. Returns: - A tuple containing: - - A dictionary mapping slice names to sets of dependent slices. - - A set of invalid files that could not be processed. + - A dictionary mapping slice names to sets of dependent slices. 
""" slice_deps = defaultdict(set) - invalid_files = set() # Pre-load the ldconfig cache once for the entire analysis self._load_ldconfig_cache() # Iterate through classified slices to find binaries and libraries for slice_name, file_set in self.classified_slices.items(): - if not ( - slice_name.endswith(SLICE_TYPE_BINS) - or slice_name.endswith(SLICE_TYPE_LIBS) - ): + if not (slice_name.endswith(SLICE_TYPE_BINS) or slice_name.endswith(SLICE_TYPE_LIBS)): continue for file in file_set: @@ -78,10 +71,7 @@ class DependencyAnalyzer: continue # Get dependencies for a single binary file - needed_libs, is_valid_elf = self._get_needed_libraries(full_file_path) - if not is_valid_elf: - invalid_files.add(file) - continue + needed_libs = self._get_needed_libraries(full_file_path) for lib_name in needed_libs: # Resolve the library name to a package owner @@ -94,7 +84,7 @@ class DependencyAnalyzer: ) slice_deps[slice_name].add(dep_slice) - return slice_deps, invalid_files + return slice_deps def _inject_internal_dependencies(self, slice_deps: dict[str, set[str]]): """ @@ -111,16 +101,12 @@ class DependencyAnalyzer: if config_slice_name in self.classified_slices: # If bins slice exists, make it depend on the config slice if bins_slice_name in self.classified_slices: - logger.info( - f"Adding internal dependency: {bins_slice_name} -> {config_slice_name}" - ) + logger.info(f"Adding internal dependency: {bins_slice_name} -> {config_slice_name}") slice_deps[bins_slice_name].add(config_slice_name) # If libs slice exists, make it depend on the config slice if libs_slice_name in self.classified_slices: - logger.info( - f"Adding internal dependency: {libs_slice_name} -> {config_slice_name}" - ) + logger.info(f"Adding internal dependency: {libs_slice_name} -> {config_slice_name}") slice_deps[libs_slice_name].add(config_slice_name) def _load_ldconfig_cache(self): @@ -128,17 +114,19 @@ class DependencyAnalyzer: Executes `ldconfig -p` once and caches its output. """ logger.debug("Loading ldconfig cache...") - if self._ldconfig_cache is None: - try: - ldconfig_proc = subprocess.run( - ["ldconfig", "-p"], capture_output=True, text=True, check=True - ) - self._ldconfig_cache = ldconfig_proc.stdout - except (subprocess.CalledProcessError, FileNotFoundError): - logger.error( - "`ldconfig -p` failed. Dependency analysis will be severely impacted." - ) - self._ldconfig_cache = "" # Set to empty string to avoid re-running + try: + ldconfig_proc = subprocess.run( + ["ldconfig", "-p"], capture_output=True, text=True, check=True + ) + self._ldconfig_cache = ldconfig_proc.stdout + except FileNotFoundError as e: + raise RuntimeError( + "`ldconfig` command not found. Please ensure the `glibc` package is installed." + ) from e + except subprocess.CalledProcessError as e: + raise RuntimeError( + f"`ldconfig` command failed: {e.stderr.strip()}. Halting analysis." + ) from e def _get_pkg_owner(self, file_path: str) -> str: """Finds the package that owns a file using an instance cache.""" @@ -158,50 +146,22 @@ class DependencyAnalyzer: owner_pkg = rpm_q_qf_proc.stdout.strip() self._file_to_pkg_cache[file_path] = owner_pkg return owner_pkg - except (subprocess.CalledProcessError, FileNotFoundError): + except FileNotFoundError as e: + raise RuntimeError( + "FATAL: `rpm` command not found. Please ensure the `rpm` package is installed." 
+ ) from e + except subprocess.CalledProcessError: self._file_to_pkg_cache[file_path] = "" return "" - def _get_needed_libraries(self, binary_path: str) -> tuple[list[str], bool]: + def _get_needed_libraries(self, binary_path: str) -> list[str]: """ - Runs `readelf -d` on a binary after pre-checking with `file` command. + Runs `readelf -d` on a binary. Assumes the file is a valid ELF + as it has been pre-filtered by the classifier. + Returns: - A tuple containing: - - A list of needed libraries. - - A boolean indicating if the binary is a valid ELF file. + A list of needed libraries. """ - # Step 1: Pre-check the file type using the `file` command. - try: - file_proc = subprocess.run( - ["file", "-L", binary_path], - capture_output=True, - text=True, - check=True, - encoding="utf-8", - errors="ignore", - ) - file_type_desc = file_proc.stdout.lower() - - # If the file is not an ELF binary, skip dependency analysis. - if "elf" not in file_type_desc: - logger.info( - f"Skipping dependency analysis for non-ELF file: " - f"{os.path.basename(binary_path)} ({file_type_desc.strip()})" - ) - return [], False - - except FileNotFoundError as e: - raise RuntimeError( - "FATAL: `file` command not found. Please ensure the `file` " - "package is installed." - ) from e - except subprocess.CalledProcessError as e: - raise RuntimeError( - f"`file` command failed on {os.path.basename(binary_path)}: " - f"{e.stderr.strip()}. Halting analysis." - ) from e - - # Step 2: If it's an ELF binary, run `readelf -d` to get needed libraries. needed = [] try: readelf_result = subprocess.run( @@ -217,7 +177,7 @@ class DependencyAnalyzer: match = re.search(r"\[(.*)\]", line) if match: needed.append(match.group(1)) - return needed, True + return needed except FileNotFoundError as e: raise RuntimeError( "FATAL: `readelf` command not found. Please ensure the `binutils` " @@ -225,7 +185,7 @@ class DependencyAnalyzer: ) from e except subprocess.CalledProcessError as e: raise RuntimeError( - f"readelf failed on '{os.path.basename(binary_path)}' " + f"readelf failed on '{binary_path}' " f"which was identified as an ELF file. The file might be corrupted. " f"Halting analysis. Error: {e.stderr.strip()}" ) from e @@ -242,9 +202,7 @@ class DependencyAnalyzer: rf"\s+{re.escape(lib_name)}\s*.*=>\s*(/.*)", self._ldconfig_cache ) if not lib_path_match: - logger.warning( - f"Could not find path for library '{lib_name}' in ldconfig cache." 
- ) + logger.warning(f"Could not find path for library '{lib_name}' in ldconfig cache.") return "" lib_path = lib_path_match.group(1) diff --git a/tools/generator/sdfgenerator.py b/tools/generator/sdfgenerator.py index 8c4a27b..99fdf72 100644 --- a/tools/generator/sdfgenerator.py +++ b/tools/generator/sdfgenerator.py @@ -49,38 +49,25 @@ class SDFGenerator: logger.info(f"Package downloaded to: {local_pkg_path}") # extracting RPM files - pkg_dir = tempfile.TemporaryDirectory() - logger.info(f"Extracting {local_pkg_path} to {pkg_dir} for analysis...") - parse.extract_files(local_pkg_path, pkg_dir.name, ["/*"]) - pkg_files = get_pkg_files(pkg_dir.name) + pkg_root_dir = tempfile.TemporaryDirectory() + logger.info(f"Extracting {local_pkg_path} to {pkg_root_dir} for analysis...") + parse.extract_files(local_pkg_path, pkg_root_dir.name, ["/*"]) # Classify files into slices - classified_slices = classifier.classify_files(self.package, pkg_files) + classified_slices = classifier.classify_files(self.package, pkg_root_dir.name) for slice_name, files in classified_slices.items(): logger.info(f"Slice '{slice_name}' contains {files} ") # Analyze dependencies - analyzer = DependencyAnalyzer(self.package, pkg_dir.name, classified_slices) - slice_deps, invalid_files = analyzer.analyze() + analyzer = DependencyAnalyzer(self.package, pkg_root_dir.name, classified_slices) + slice_deps = analyzer.analyze() for slice_name, deps in slice_deps.items(): logger.info(f"Slice '{slice_name}' depends on: {deps}") # Write the SDF file - writer = SDFWriter( - self.output, self.package, classified_slices, slice_deps, invalid_files - ) + writer = SDFWriter(self.output, self.package, classified_slices, slice_deps) writer.write() rpm.clear(dnf_client) logger.info(f"===== Finished SDF Generation for: {self.package} =====") - - -def get_pkg_files(path): - files = [] - for root, _, filenames in os.walk(path): - for filename in filenames: - full_path = os.path.join(root, filename) - rel_path = os.path.relpath(full_path, path) - files.append(f"/{rel_path}") - return files diff --git a/tools/generator/writer.py b/tools/generator/writer.py index a1f14ea..41f6c7a 100644 --- a/tools/generator/writer.py +++ b/tools/generator/writer.py @@ -2,8 +2,6 @@ import yaml from pathlib import Path from tools.logger import logger -from tools.generator.classifier import SLICE_TYPE_FILES - class SDFWriter: """ @@ -16,13 +14,11 @@ class SDFWriter: package_name: str, classified_slices: dict[str, set[str]], slice_deps: dict[str, set[str]], - invalid_files: set[str] = None, ): self.output_path = Path(output) / f"{package_name}.yaml" self.package_name = package_name self.classified_slices = classified_slices self.slice_deps = slice_deps - self.invalid_files = invalid_files or set() def write(self): """ @@ -51,19 +47,7 @@ class SDFWriter: "slices": {}, } - files_slice_name = f"{self.package_name}{SLICE_TYPE_FILES}" for slice_name, files in sorted(self.classified_slices.items()): - if slice_name == files_slice_name: - logger.debug(f"Ignoring '{slice_name}' slice from the final output.") - continue - - # Remove any files that were identified as invalid - valid_files = files - self.invalid_files - if not valid_files: - logger.info( - f"Slice '{slice_name}' is empty after filtering invalid files, excluding from SDF." 
- ) - continue short_slice_name = slice_name.replace(f"{self.package_name}_", "", 1) slice_content = {} @@ -72,7 +56,7 @@ class SDFWriter: slice_content["deps"] = sorted(list(self.slice_deps[slice_name])) # Apply path compression before adding to the structure - compressed_files = self._compress_paths(valid_files) + compressed_files = self._compress_paths(files) slice_content["contents"] = {"common": sorted(list(compressed_files))} sdf_data["slices"][short_slice_name] = slice_content -- Gitee From d40052fce340d2f08bce4761b632466e2e6808f8 Mon Sep 17 00:00:00 2001 From: mahaoliang Date: Fri, 8 Aug 2025 11:01:33 +0800 Subject: [PATCH 5/6] =?UTF-8?q?=E5=AE=9E=E7=8E=B0=E5=9F=BA=E4=BA=8EDocker?= =?UTF-8?q?=E7=9A=84SDF=E7=94=9F=E6=88=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 为SDF生成器环境添加Dockerfile - 创建gen-sdf-docker.sh脚本,用于在Docker中运行SDF生成 - 移除EnvironmentManager类,并更新SDFGenerator以适配Docker - 重构SDF生成流程,使其在Docker容器内运行 --- bin/Dockerfile | 26 +++++++++ bin/gen-sdf-docker.sh | 100 ++++++++++++++++++++++++++++++++ tools/generator/environment.py | 96 ------------------------------ tools/generator/sdfgenerator.py | 58 +++++++++--------- 4 files changed, 153 insertions(+), 127 deletions(-) create mode 100644 bin/Dockerfile create mode 100755 bin/gen-sdf-docker.sh delete mode 100644 tools/generator/environment.py diff --git a/bin/Dockerfile b/bin/Dockerfile new file mode 100644 index 0000000..1bf3f67 --- /dev/null +++ b/bin/Dockerfile @@ -0,0 +1,26 @@ +# Dockerfile for the SDF Generator environment + +# Start from a specific openEuler release. +ARG RELEASE_TAG=24.03-lts +FROM hub.oepkgs.net/openeuler/openeuler:${RELEASE_TAG} + +# Install all the dependencies for the splitter tool. +RUN dnf install -y \ + python3-dnf \ + git \ + python3-pip \ + cpio \ + binutils \ + file && \ + dnf clean all + +# Copy the splitter source code into the image. +COPY . /splitter + +# Install the splitter tool itself and its dependencies. +RUN cd /splitter && \ + pip3 install -i https://repo.huaweicloud.com/repository/pypi/simple . + +WORKDIR /splitter + +CMD ["/bin/bash"] \ No newline at end of file diff --git a/bin/gen-sdf-docker.sh b/bin/gen-sdf-docker.sh new file mode 100755 index 0000000..e97fae5 --- /dev/null +++ b/bin/gen-sdf-docker.sh @@ -0,0 +1,100 @@ +#!/usr/bin/env bash +set -eu + +# base directory +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "${SCRIPT_DIR}/.." || exit 1 +BASE_DIR=$(pwd) + +# Help Function +usage() { + echo "Usage: $0 -p -r -o " + echo " -p, --package Required. The name of the RPM package to generate an SDF for (e.g., 'brotli')." + echo " -r, --release Required. The openEuler release (e.g., '24.03-LTS')." + echo " -o, --output Required. The directory to save the generated SDF file." + echo " -h, --help Show this help message." + exit 1 +} + +# Defaults +ARCH=$(uname -m) +RELEASE="" +PACKAGE_NAME="" +OUTPUT_DIR="" + +# Argument Parsing with getopts +while getopts ":p:r:o:h" opt; do + case ${opt} in + p ) + PACKAGE_NAME=$OPTARG + ;; + r ) + RELEASE=$OPTARG + ;; + o ) + OUTPUT_DIR=$OPTARG + ;; + h ) + usage + ;; + \? ) + echo "Invalid Option: -$OPTARG" 1>&2 + usage + ;; + : ) + echo "Invalid Option: -$OPTARG requires an argument" 1>&2 + usage + ;; + esac +done + +# Input Validation +if [[ -z "$PACKAGE_NAME" || -z "$RELEASE" || -z "$OUTPUT_DIR" ]]; then + echo "Error: Missing required arguments." 
+ usage +fi + +# Absolute path for the output directory for Docker mount +mkdir -p "${OUTPUT_DIR}" +ABS_OUTPUT_DIR="$(cd "$OUTPUT_DIR" && pwd)" + +RELEASE_TAG="${RELEASE,,}" # Convert to lowercase for consistency +CUSTOM_IMAGE_NAME="sdf-generator-base:${RELEASE_TAG}" +CONTAINER_NAME="sdf-generator-${PACKAGE_NAME}-$$" + +echo ">>> Starting SDF Generation for '${PACKAGE_NAME}'" +echo " Release: ${RELEASE}" +echo " Release Tag: ${RELEASE_TAG}" +echo " Arch: ${ARCH}" +echo " Output Dir: ${ABS_OUTPUT_DIR}" +echo " Using Docker Image: ${CUSTOM_IMAGE_NAME}" + +# Docker Image Build +echo ">>> Checking for custom base image: ${CUSTOM_IMAGE_NAME}" +if [[ -z "$(docker images -q "${CUSTOM_IMAGE_NAME}")" ]]; then + echo ">>> Base image not found. Building it now..." + docker build --no-cache --build-arg RELEASE_TAG="${RELEASE_TAG}" \ + -t "${CUSTOM_IMAGE_NAME}" \ + -f "${BASE_DIR}/bin/Dockerfile" \ + "${BASE_DIR}" + echo ">>> Base image built successfully." +else + echo ">>> Base image found." +fi + +echo ">>> Output SDF will be saved to: ${ABS_OUTPUT_DIR}" + +# The command to be executed inside the container. +INSTALL_CMD="dnf install -y ${PACKAGE_NAME}" +GENERATE_CMD="splitter gen -p ${PACKAGE_NAME} -r ${RELEASE} -o /output -a ${ARCH}" +FULL_COMMAND="${INSTALL_CMD} && ${GENERATE_CMD}" + +echo ">>> Starting Docker container from custom image..." +docker run --name "${CONTAINER_NAME}" \ + -v "${ABS_OUTPUT_DIR}:/output" \ + --rm \ + "${CUSTOM_IMAGE_NAME}" \ + /bin/bash -c "${FULL_COMMAND}" + +echo ">>> SDF Generation complete. Docker container has been removed." +echo ">>> Done." \ No newline at end of file diff --git a/tools/generator/environment.py b/tools/generator/environment.py deleted file mode 100644 index c7c8b48..0000000 --- a/tools/generator/environment.py +++ /dev/null @@ -1,96 +0,0 @@ -import subprocess -from tools.logger import logger - - -class EnvironmentManager: - """ - A context manager to prepare and clean up the system environment - for SDF analysis. - """ - - def __init__(self, package_to_install: str): - self.package_to_install = package_to_install - self.newly_installed_packages = [] - - def __enter__(self): - """ - The 'prepare' stage. This is executed when entering the 'with' block. - It installs dependencies and records what was installed. - """ - logger.info("Preparing environment by installing dependencies...") - - try: - # Get the list of packages BEFORE installation - before_install_proc = subprocess.run( - ["rpm", "-qa", "--qf", "%{NAME}\n"], - capture_output=True, - text=True, - check=True, - ) - packages_before = set(before_install_proc.stdout.strip().split("\n")) - - # Install the target package and its dependencies - subprocess.run( - ["dnf", "install", "-y", self.package_to_install], check=True - ) - - # Get the list of packages AFTER installation - after_install_proc = subprocess.run( - ["rpm", "-qa", "--qf", "%{NAME}\n"], - capture_output=True, - text=True, - check=True, - ) - packages_after = set(after_install_proc.stdout.strip().split("\n")) - - # Find the difference to know what to clean up - self.newly_installed_packages = sorted( - list(packages_after - packages_before) - ) - - if self.newly_installed_packages: - logger.info( - f"The following packages were newly installed: {self.newly_installed_packages}" - ) - else: - logger.info( - "No new packages were installed (all dependencies were already met)." - ) - - logger.info("Environment prepared successfully.") - - except subprocess.CalledProcessError as e: - logger.error(f"Failed to prepare environment. 
Command '{e.cmd}' failed.") - logger.error(f"Stderr: {e.stderr}") - # Re-raise to stop the process if preparation fails - raise - - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - """ - The 'cleanup' stage. This is executed when exiting the 'with' block, - regardless of whether an exception occurred. - """ - if self.newly_installed_packages: - logger.info( - f"Cleaning up environment by removing {len(self.newly_installed_packages)} package(s)..." - ) - try: - cleanup_command = [ - "dnf", - "remove", - "-y", - ] + self.newly_installed_packages - subprocess.run(cleanup_command, check=True) - logger.info("Environment cleaned up successfully.") - except subprocess.CalledProcessError as e: - logger.error("CRITICAL: Environment cleanup failed!") - logger.error(f"Command '{e.cmd}' failed. Stderr: {e.stderr}") - logger.error( - f"The following packages may have been left on the system: {self.newly_installed_packages}" - ) - else: - logger.info("No new packages to clean up.") - - return False diff --git a/tools/generator/sdfgenerator.py b/tools/generator/sdfgenerator.py index 99fdf72..8bbafb8 100644 --- a/tools/generator/sdfgenerator.py +++ b/tools/generator/sdfgenerator.py @@ -9,7 +9,6 @@ from tools import SLICE_PATH from tools.generator import classifier from tools.generator.dependency_analyzer import DependencyAnalyzer from tools.generator.writer import SDFWriter -from tools.generator.environment import EnvironmentManager class SDFGenerator: @@ -33,41 +32,38 @@ class SDFGenerator: """ logger.info(f"===== Starting SDF Generation for: {self.package} =====") - # The 'with' statement automatically handles setup and teardown - with EnvironmentManager(self.package): + _clone_slices(self.release, SLICE_PATH) - _clone_slices(self.release, SLICE_PATH) - - # Initialize DNF client for downloading - logger.info(f"Downloading package: {self.package}...") - dnf_client = rpm.init_dnf_client(self.arch, self.release, self.output) - local_pkg_path = rpm.download(dnf_client, self.package) - if not local_pkg_path: - logger.error(f"Failed to download package {self.package}.") - rpm.clear(dnf_client) - return - logger.info(f"Package downloaded to: {local_pkg_path}") + # Initialize DNF client for downloading + logger.info(f"Downloading package: {self.package}...") + dnf_client = rpm.init_dnf_client(self.arch, self.release, self.output) + local_pkg_path = rpm.download(dnf_client, self.package) + if not local_pkg_path: + logger.error(f"Failed to download package {self.package}.") + rpm.clear(dnf_client) + return + logger.info(f"Package downloaded to: {local_pkg_path}") - # extracting RPM files - pkg_root_dir = tempfile.TemporaryDirectory() - logger.info(f"Extracting {local_pkg_path} to {pkg_root_dir} for analysis...") - parse.extract_files(local_pkg_path, pkg_root_dir.name, ["/*"]) + # extracting RPM files + pkg_root_dir = tempfile.TemporaryDirectory() + logger.info(f"Extracting {local_pkg_path} to {pkg_root_dir} for analysis...") + parse.extract_files(local_pkg_path, pkg_root_dir.name, ["/*"]) - # Classify files into slices - classified_slices = classifier.classify_files(self.package, pkg_root_dir.name) - for slice_name, files in classified_slices.items(): - logger.info(f"Slice '{slice_name}' contains {files} ") + # Classify files into slices + classified_slices = classifier.classify_files(self.package, pkg_root_dir.name) + for slice_name, files in classified_slices.items(): + logger.info(f"Slice '{slice_name}' contains {files} ") - # Analyze dependencies - analyzer = DependencyAnalyzer(self.package, 
pkg_root_dir.name, classified_slices) - slice_deps = analyzer.analyze() - for slice_name, deps in slice_deps.items(): - logger.info(f"Slice '{slice_name}' depends on: {deps}") + # Analyze dependencies + analyzer = DependencyAnalyzer(self.package, pkg_root_dir.name, classified_slices) + slice_deps = analyzer.analyze() + for slice_name, deps in slice_deps.items(): + logger.info(f"Slice '{slice_name}' depends on: {deps}") - # Write the SDF file - writer = SDFWriter(self.output, self.package, classified_slices, slice_deps) - writer.write() + # Write the SDF file + writer = SDFWriter(self.output, self.package, classified_slices, slice_deps) + writer.write() - rpm.clear(dnf_client) + rpm.clear(dnf_client) logger.info(f"===== Finished SDF Generation for: {self.package} =====") -- Gitee From 10f57b8ff011760c4fd2b3ee794101a8433baa95 Mon Sep 17 00:00:00 2001 From: mahaoliang Date: Tue, 12 Aug 2025 11:09:45 +0800 Subject: [PATCH 6/6] ensure Docker is installed and running before building --- bin/gen-sdf-docker.sh | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/bin/gen-sdf-docker.sh b/bin/gen-sdf-docker.sh index e97fae5..e986cb2 100755 --- a/bin/gen-sdf-docker.sh +++ b/bin/gen-sdf-docker.sh @@ -6,6 +6,21 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" cd "${SCRIPT_DIR}/.." || exit 1 BASE_DIR=$(pwd) +ensure_docker_ready() { + # Check if Docker command exists + if ! command -v docker &> /dev/null; then + echo ">>> Docker not found. Installing ..." + # Check for root privileges. Exit if not root. + if [[ $EUID -ne 0 ]]; then + echo "This script must be run as root to install Docker." + exit 1 + fi + dnf install -y docker + systemctl restart docker + echo ">>> Docker installed." + fi +} + # Help Function usage() { echo "Usage: $0 -p -r -o " @@ -54,6 +69,8 @@ if [[ -z "$PACKAGE_NAME" || -z "$RELEASE" || -z "$OUTPUT_DIR" ]]; then usage fi +ensure_docker_ready + # Absolute path for the output directory for Docker mount mkdir -p "${OUTPUT_DIR}" ABS_OUTPUT_DIR="$(cd "$OUTPUT_DIR" && pwd)" -- Gitee
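A minimal usage sketch for the entry points added in this series, based on tools/cmd/gen.py and bin/gen-sdf-docker.sh (the package name 'attr' and the output directory are placeholders; the release value follows the docker script's convention, i.e. '24.03-LTS' without the 'openEuler-' prefix, which SDFGenerator prepends itself):

    # Generate an SDF directly on an openEuler host (the code requires binutils and file to be installed)
    splitter gen -p attr -r 24.03-LTS -o ./sdf-output

    # Or run the same generation inside a disposable Docker container
    bash bin/gen-sdf-docker.sh -p attr -r 24.03-LTS -o ./sdf-output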