From 5df852635afeb30a4ae55ec8349c1f1beaa31a2b Mon Sep 17 00:00:00 2001 From: mahaoliang Date: Sun, 27 Jul 2025 17:48:07 +0800 Subject: [PATCH 1/6] Add SDF generation functionality and related components - Implement SDFGenerator class for generating Slice Definition Files (SDF). - Create DependencyAnalyzer for analyzing binary dependencies. - Add EnvironmentManager for managing package installation and cleanup. - Introduce SDFWriter for building and writing SDF data to YAML files. - Add classifier for categorizing files into slices. - Implement command-line interface for SDF generation. - Update README with new system dependencies. - Add unit tests for SDFWriter functionality. --- README.md | 2 +- tests/python/writer_test.py | 78 ++++++++++ tools/cmd/gen.py | 45 ++++++ tools/generator/__init__.py | 0 tools/generator/classifier.py | 102 ++++++++++++ tools/generator/dependency_analyzer.py | 208 +++++++++++++++++++++++++ tools/generator/environment.py | 96 ++++++++++++ tools/generator/sdfgenerator.py | 86 ++++++++++ tools/generator/writer.py | 124 +++++++++++++++ tools/main.py | 2 + 10 files changed, 742 insertions(+), 1 deletion(-) create mode 100644 tests/python/writer_test.py create mode 100644 tools/cmd/gen.py create mode 100644 tools/generator/__init__.py create mode 100644 tools/generator/classifier.py create mode 100644 tools/generator/dependency_analyzer.py create mode 100644 tools/generator/environment.py create mode 100644 tools/generator/sdfgenerator.py create mode 100644 tools/generator/writer.py diff --git a/README.md b/README.md index 849f423..acc00db 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ splitter处于开发阶段,当前仅支持在openEuler上部署(建议使用 1. 安装系统依赖 ``` -dnf install python3-dnf git python3-pip cpio +dnf install python3-dnf git python3-pip cpio binutils ``` 2. 
克隆源码仓库 diff --git a/tests/python/writer_test.py b/tests/python/writer_test.py new file mode 100644 index 0000000..ffe28e8 --- /dev/null +++ b/tests/python/writer_test.py @@ -0,0 +1,78 @@ +import unittest +from tools.generator.writer import SDFWriter + + +class TestCompressPaths(unittest.TestCase): + def setUp(self): + self.writer = SDFWriter( + output="test_output", + package_name="test_package", + classified_slices={}, + slice_deps={}, + ) + + def test_empty_set(self): + """Test an empty set of files""" + result = self.writer._compress_paths(set()) + self.assertEqual(result, set()) + + def test_no_so_files(self): + """Test the case without .so files""" + files = {"file1.txt", "dir/file2.py"} + result = self.writer._compress_paths(files) + self.assertEqual(result, files) + + def test_single_so_file(self): + """Test a single .so file""" + files = {"libtest.so"} + result = self.writer._compress_paths(files) + self.assertEqual(result, files) + + def test_multiple_unrelated_so_files(self): + """Test multiple unrelated .so files""" + files = {"libA.so", "libB.so", "libC.so"} + result = self.writer._compress_paths(files) + self.assertEqual(result, files) + + def test_versioned_libs(self): + """Test compression of versioned library files""" + files = { + "libtest.so.1", + "libtest.so.1.2", + "libtest.so.1.2.3", + "other.so", + } + expected = {"libtest.so.1*", "other.so"} + result = self.writer._compress_paths(files) + self.assertEqual(result, expected) + + def test_mixed_files(self): + """Test mixed file types (.so and other files)""" + files = { + "libtest.so.1", + "libtest.so.1.2", + "file.txt", + "libother.so", + "script.py", + } + expected = {"libtest.so.1*", "libother.so", "file.txt", "script.py"} + result = self.writer._compress_paths(files) + self.assertEqual(result, expected) + + def test_multiple_version_groups(self): + """Test multiple groups of versioned library files""" + files = {"libA.so.1", "libA.so.1.2", "libB.so.1", "libB.so.1.2", "libC.so"} + expected = {"libA.so.1*", "libB.so.1*", "libC.so"} + result = self.writer._compress_paths(files) + self.assertEqual(result, expected) + + def test_partial_matches(self): + """Test partial but not exact matches""" + files = {"libtest.so.1", "libtestX.so.1.2", "libtest.so.1.2"} + expected = {"libtest.so.1*", "libtestX.so.1.2"} + result = self.writer._compress_paths(files) + self.assertEqual(result, expected) + + +if __name__ == "__main__": + unittest.main() diff --git a/tools/cmd/gen.py b/tools/cmd/gen.py new file mode 100644 index 0000000..8a62c54 --- /dev/null +++ b/tools/cmd/gen.py @@ -0,0 +1,45 @@ +import click +import platform + +from tools.generator.sdfgenerator import SDFGenerator + + +@click.command( + name="gen", + help="Automatically generate a Slice Definition File (SDF) for an openEuler package.", +) +@click.option( + "-r", + "--release", + required=True, + help="This decides which openEuler release you will use, such as `openEuler-24.03-LTS-SP1`.", +) +@click.option( + "-a", + "--arch", + default=None, + help="The architecture. If not provided, it will be auto-detected from the host machine.", +) +@click.option( + "-o", "--output", default=".", help="The directory to save the generated SDF file." +) +@click.option( + "-p", + "--package", + required=True, + help="The name of the RPM package to generate an SDF for (e.g., 'attr').", +) +def gen(release, arch, output, package): + """ + CLI command to orchestrate SDF generation. 
+ """ + if not arch: + arch = platform.machine() + click.echo(f"Architecture not specified, auto-detected: {arch}") + + click.echo( + f"Starting SDF generation for '{package}' on openEuler-{release} ({arch})..." + ) + + generator = SDFGenerator(release=release, arch=arch, output=output, package=package) + generator.gen() diff --git a/tools/generator/__init__.py b/tools/generator/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tools/generator/classifier.py b/tools/generator/classifier.py new file mode 100644 index 0000000..717c882 --- /dev/null +++ b/tools/generator/classifier.py @@ -0,0 +1,102 @@ +from collections import defaultdict +import os +from pathlib import Path + +# Slice Type Definitions +SLICE_TYPE_COPYRIGHT = "_copyright" +SLICE_TYPE_CONFIG = "_config" +SLICE_TYPE_BINS = "_bins" +SLICE_TYPE_LIBS = "_libs" +SLICE_TYPE_FILES = "_files" + +# Rule Definition +CLASSIFICATION_RULES = [ + (SLICE_TYPE_COPYRIGHT, ("is_copyright", True)), + (SLICE_TYPE_CONFIG, ("prefix", "/etc/")), + ( + SLICE_TYPE_BINS, + ("prefix", "/usr/bin/"), + ("prefix", "/usr/sbin/"), + ("prefix", "/usr/libexec/"), + ), + (SLICE_TYPE_LIBS, ("is_library", True)), + (SLICE_TYPE_FILES, ("is_other", True)), +] + +MATCH_FUNCTIONS = { + "prefix": str.startswith, + "is_copyright": lambda path, _: _is_copyright(path), + "is_library": lambda path, _: _is_library(path), + "is_other": lambda path, _: _is_other(path), +} + +IGNORE_PREFIXES = [ + "/etc/ima/", +] + + +def _is_copyright(filepath: str) -> bool: + """ + Checks if a file is a copyright/license file. + This includes standard paths and special cases found in doc directories. + """ + # Standard prefix-based check + if filepath.startswith("/usr/share/licenses/"): + return True + + # Case-insensitive check for common license filenames + filename = Path(filepath).name.lower() + if filename in ("license", "copying", "copyright", "notice"): + # If the filename itself suggests it's a license, it's a strong indicator. + # This will catch cases like '/usr/share/doc/c-ares/LICENSE.md' + return True + + return False + + +def _is_library(filepath: str) -> bool: + """Checks if a file is a shared library.""" + return ".so" in filepath and ( + filepath.startswith("/usr/lib") or filepath.startswith("/lib") + ) + + +def _is_other(filepath: str) -> bool: + """Default rule that always matches.""" + return True + + +def classify_files(package_name: str, files: list[str]) -> dict[str, set[str]]: + """ + Classifies a list of files into slices based on a defined set of rules. + + Args: + package_name: The name of the package. + files: A list of file paths from the RPM. + + Returns: + A dictionary mapping slice names to a set of file paths. 
+ """ + classified_slices = defaultdict(set) + for filepath in files: + # Skip files that start with ignored prefixes + if any(filepath.startswith(prefix) for prefix in IGNORE_PREFIXES): + continue + + slice_type_suffix = SLICE_TYPE_FILES + for rule_suffix, *conditions in CLASSIFICATION_RULES: + is_match = False + for match_type, pattern in conditions: + checker_func = MATCH_FUNCTIONS.get(match_type) + if checker_func and checker_func(filepath, pattern): + is_match = True + break + + if is_match: + slice_type_suffix = rule_suffix + break + + slice_name = f"{package_name}{slice_type_suffix}" + classified_slices[slice_name].add(os.path.normpath(filepath)) + + return classified_slices diff --git a/tools/generator/dependency_analyzer.py b/tools/generator/dependency_analyzer.py new file mode 100644 index 0000000..37a2587 --- /dev/null +++ b/tools/generator/dependency_analyzer.py @@ -0,0 +1,208 @@ +import subprocess +import re +import os +from collections import defaultdict +from tools.logger import logger +from tools.generator.classifier import ( + SLICE_TYPE_BINS, + SLICE_TYPE_LIBS, + SLICE_TYPE_CONFIG, +) + + +class DependencyAnalyzer: + """ + Analyzes binary dependencies for a given package. + """ + + def __init__( + self, + package_name: str, + pkg_extract_path: str, + classified_slices: dict[str, set[str]], + ): + self.package_name = package_name + self.pkg_extract_path = pkg_extract_path + self.classified_slices = classified_slices + + # Caches to improve performance + self._file_to_pkg_cache = {} + self._ldconfig_cache = None + + def analyze(self) -> tuple[dict[str, set[str]], set[str]]: + """ + Main entry point for the analysis. + Orchestrates the process of analyzing dependencies for all relevant slices. + + Returns: + A tuple containing: + - A dictionary mapping slice names to sets of dependent slices. + - A set of invalid files that could not be processed. + """ + # Step 1: Analyze external binary dependencies (existing logic) + slice_deps, invalid_files = self._analyze_external_dependencies() + + # Step 2: Inject internal dependencies (new logic) + self._inject_internal_dependencies(slice_deps) + + return slice_deps, invalid_files + + def _analyze_external_dependencies(self) -> tuple[dict[str, set[str]], set[str]]: + """ + Analyzes dependencies on external packages using readelf. + This method checks for shared libraries used by binaries and maps them to their owning packages. + Returns: + A tuple containing: + - A dictionary mapping slice names to sets of dependent slices. + - A set of invalid files that could not be processed. + """ + + slice_deps = defaultdict(set) + invalid_files = set() + + # 1. Pre-load the ldconfig cache once for the entire analysis + self._load_ldconfig_cache() + + for slice_name, file_set in self.classified_slices.items(): + if not ( + slice_name.endswith(SLICE_TYPE_BINS) + or slice_name.endswith(SLICE_TYPE_LIBS) + ): + continue + + for file in file_set: + full_file_path = os.path.join(self.pkg_extract_path, file.lstrip("/")) + + if not os.path.isfile(full_file_path): + continue + + # 2. Get dependencies for a single binary file + needed_libs, is_valid_elf = self._get_needed_libraries(full_file_path) + if not is_valid_elf: + logger.info( + f"'{os.path.basename(file)}' is not a valid ELF file. It will be excluded from the SDF." + ) + invalid_files.add(file) + continue + + for lib_name in needed_libs: + # 3. 
Resolve the library name to a package owner + owner_pkg = self._resolve_library_to_package(lib_name) + + if owner_pkg and owner_pkg != self.package_name: + dep_slice = f"{owner_pkg}_libs" + logger.debug( + f"'{slice_name}' dependency found: {file} -> {lib_name} ({dep_slice})" + ) + slice_deps[slice_name].add(dep_slice) + + return slice_deps, invalid_files + + def _inject_internal_dependencies(self, slice_deps: dict[str, set[str]]): + """ + Adds dependencies between slices of the same package (e.g., bins -> config). + """ + logger.debug("Injecting internal dependencies...") + + # Construct the names of the potential slices within this package + config_slice_name = f"{self.package_name}{SLICE_TYPE_CONFIG}" + bins_slice_name = f"{self.package_name}{SLICE_TYPE_BINS}" + libs_slice_name = f"{self.package_name}{SLICE_TYPE_LIBS}" + + # Check if a config slice exists for this package + if config_slice_name in self.classified_slices: + # If bins slice exists, make it depend on the config slice + if bins_slice_name in self.classified_slices: + logger.info( + f"Adding internal dependency: {bins_slice_name} -> {config_slice_name}" + ) + slice_deps[bins_slice_name].add(config_slice_name) + + # If libs slice exists, make it depend on the config slice + if libs_slice_name in self.classified_slices: + logger.info( + f"Adding internal dependency: {libs_slice_name} -> {config_slice_name}" + ) + slice_deps[libs_slice_name].add(config_slice_name) + + def _load_ldconfig_cache(self): + """ + Executes `ldconfig -p` once and caches its output. + """ + logger.debug("Loading ldconfig cache...") + if self._ldconfig_cache is None: + try: + ldconfig_proc = subprocess.run( + ["ldconfig", "-p"], capture_output=True, text=True, check=True + ) + self._ldconfig_cache = ldconfig_proc.stdout + except (subprocess.CalledProcessError, FileNotFoundError): + logger.error( + "`ldconfig -p` failed. Dependency analysis will be severely impacted." + ) + self._ldconfig_cache = "" # Set to empty string to avoid re-running + + def _get_pkg_owner(self, file_path: str) -> str: + """Finds the package that owns a file using an instance cache.""" + if file_path in self._file_to_pkg_cache: + return self._file_to_pkg_cache[file_path] + try: + rpm_qf_proc = subprocess.run( + ["rpm", "-qf", file_path], capture_output=True, text=True, check=True + ) + pkg_full_name = rpm_qf_proc.stdout.strip() + rpm_q_qf_proc = subprocess.run( + ["rpm", "-q", "--qf", "%{NAME}", pkg_full_name], + capture_output=True, + text=True, + check=True, + ) + owner_pkg = rpm_q_qf_proc.stdout.strip() + self._file_to_pkg_cache[file_path] = owner_pkg + return owner_pkg + except (subprocess.CalledProcessError, FileNotFoundError): + self._file_to_pkg_cache[file_path] = "" + return "" + + def _get_needed_libraries(self, binary_path: str) -> tuple[list[str], bool]: + """ + Runs `readelf -d` on a binary and returns a list of its needed libraries. + """ + needed = [] + try: + readelf_result = subprocess.run( + ["readelf", "-d", binary_path], + capture_output=True, + text=True, + check=True, + encoding="utf-8", + errors="ignore", + ) + for line in readelf_result.stdout.strip().split("\n"): + if "(NEEDED)" in line: + match = re.search(r"\[(.*)\]", line) + if match: + needed.append(match.group(1)) + return needed, True + except subprocess.CalledProcessError as e: + return [], False + + def _resolve_library_to_package(self, lib_name: str) -> str: + """ + Resolves a library name (e.g., 'libc.so.6') to its owning package name (e.g., 'glibc'). 
+ Uses the ldconfig cache to find the library path. + """ + if not self._ldconfig_cache: + return "" + + lib_path_match = re.search( + rf"\s+{re.escape(lib_name)}\s*.*=>\s*(/.*)", self._ldconfig_cache + ) + if not lib_path_match: + logger.warning( + f"Could not find path for library '{lib_name}' in ldconfig cache." + ) + return "" + + lib_path = lib_path_match.group(1) + return self._get_pkg_owner(lib_path) diff --git a/tools/generator/environment.py b/tools/generator/environment.py new file mode 100644 index 0000000..c7c8b48 --- /dev/null +++ b/tools/generator/environment.py @@ -0,0 +1,96 @@ +import subprocess +from tools.logger import logger + + +class EnvironmentManager: + """ + A context manager to prepare and clean up the system environment + for SDF analysis. + """ + + def __init__(self, package_to_install: str): + self.package_to_install = package_to_install + self.newly_installed_packages = [] + + def __enter__(self): + """ + The 'prepare' stage. This is executed when entering the 'with' block. + It installs dependencies and records what was installed. + """ + logger.info("Preparing environment by installing dependencies...") + + try: + # Get the list of packages BEFORE installation + before_install_proc = subprocess.run( + ["rpm", "-qa", "--qf", "%{NAME}\n"], + capture_output=True, + text=True, + check=True, + ) + packages_before = set(before_install_proc.stdout.strip().split("\n")) + + # Install the target package and its dependencies + subprocess.run( + ["dnf", "install", "-y", self.package_to_install], check=True + ) + + # Get the list of packages AFTER installation + after_install_proc = subprocess.run( + ["rpm", "-qa", "--qf", "%{NAME}\n"], + capture_output=True, + text=True, + check=True, + ) + packages_after = set(after_install_proc.stdout.strip().split("\n")) + + # Find the difference to know what to clean up + self.newly_installed_packages = sorted( + list(packages_after - packages_before) + ) + + if self.newly_installed_packages: + logger.info( + f"The following packages were newly installed: {self.newly_installed_packages}" + ) + else: + logger.info( + "No new packages were installed (all dependencies were already met)." + ) + + logger.info("Environment prepared successfully.") + + except subprocess.CalledProcessError as e: + logger.error(f"Failed to prepare environment. Command '{e.cmd}' failed.") + logger.error(f"Stderr: {e.stderr}") + # Re-raise to stop the process if preparation fails + raise + + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """ + The 'cleanup' stage. This is executed when exiting the 'with' block, + regardless of whether an exception occurred. + """ + if self.newly_installed_packages: + logger.info( + f"Cleaning up environment by removing {len(self.newly_installed_packages)} package(s)..." + ) + try: + cleanup_command = [ + "dnf", + "remove", + "-y", + ] + self.newly_installed_packages + subprocess.run(cleanup_command, check=True) + logger.info("Environment cleaned up successfully.") + except subprocess.CalledProcessError as e: + logger.error("CRITICAL: Environment cleanup failed!") + logger.error(f"Command '{e.cmd}' failed. 
Stderr: {e.stderr}") + logger.error( + f"The following packages may have been left on the system: {self.newly_installed_packages}" + ) + else: + logger.info("No new packages to clean up.") + + return False diff --git a/tools/generator/sdfgenerator.py b/tools/generator/sdfgenerator.py new file mode 100644 index 0000000..8c4a27b --- /dev/null +++ b/tools/generator/sdfgenerator.py @@ -0,0 +1,86 @@ +import os +import tempfile +from tools.download import rpm +from tools.parse import parse +from tools.splitter.splitter import _architecture_check, _clone_slices +from tools.logger import logger +from tools import SLICE_PATH + +from tools.generator import classifier +from tools.generator.dependency_analyzer import DependencyAnalyzer +from tools.generator.writer import SDFWriter +from tools.generator.environment import EnvironmentManager + + +class SDFGenerator: + """Class to generate SDF files for a given package.""" + + # Class attributes for the generator + package: str + release: str + arch: str + output: str + + def __init__(self, release: str, arch: str, output: str, package: str): + self.release = f"openEuler-{release.upper()}" + self.arch = _architecture_check(arch) + self.output = os.path.abspath(output) + self.package = package + + def gen(self): + """ + Main entry point for generating SDF files. + """ + logger.info(f"===== Starting SDF Generation for: {self.package} =====") + + # The 'with' statement automatically handles setup and teardown + with EnvironmentManager(self.package): + + _clone_slices(self.release, SLICE_PATH) + + # Initialize DNF client for downloading + logger.info(f"Downloading package: {self.package}...") + dnf_client = rpm.init_dnf_client(self.arch, self.release, self.output) + local_pkg_path = rpm.download(dnf_client, self.package) + if not local_pkg_path: + logger.error(f"Failed to download package {self.package}.") + rpm.clear(dnf_client) + return + logger.info(f"Package downloaded to: {local_pkg_path}") + + # extracting RPM files + pkg_dir = tempfile.TemporaryDirectory() + logger.info(f"Extracting {local_pkg_path} to {pkg_dir} for analysis...") + parse.extract_files(local_pkg_path, pkg_dir.name, ["/*"]) + pkg_files = get_pkg_files(pkg_dir.name) + + # Classify files into slices + classified_slices = classifier.classify_files(self.package, pkg_files) + for slice_name, files in classified_slices.items(): + logger.info(f"Slice '{slice_name}' contains {files} ") + + # Analyze dependencies + analyzer = DependencyAnalyzer(self.package, pkg_dir.name, classified_slices) + slice_deps, invalid_files = analyzer.analyze() + for slice_name, deps in slice_deps.items(): + logger.info(f"Slice '{slice_name}' depends on: {deps}") + + # Write the SDF file + writer = SDFWriter( + self.output, self.package, classified_slices, slice_deps, invalid_files + ) + writer.write() + + rpm.clear(dnf_client) + + logger.info(f"===== Finished SDF Generation for: {self.package} =====") + + +def get_pkg_files(path): + files = [] + for root, _, filenames in os.walk(path): + for filename in filenames: + full_path = os.path.join(root, filename) + rel_path = os.path.relpath(full_path, path) + files.append(f"/{rel_path}") + return files diff --git a/tools/generator/writer.py b/tools/generator/writer.py new file mode 100644 index 0000000..a1f14ea --- /dev/null +++ b/tools/generator/writer.py @@ -0,0 +1,124 @@ +import yaml +from pathlib import Path +from tools.logger import logger + +from tools.generator.classifier import SLICE_TYPE_FILES + + +class SDFWriter: + """ + Builds the SDF data structure and writes 
it to a YAML file. + """ + + def __init__( + self, + output: str, + package_name: str, + classified_slices: dict[str, set[str]], + slice_deps: dict[str, set[str]], + invalid_files: set[str] = None, + ): + self.output_path = Path(output) / f"{package_name}.yaml" + self.package_name = package_name + self.classified_slices = classified_slices + self.slice_deps = slice_deps + self.invalid_files = invalid_files or set() + + def write(self): + """ + Main entry point to build the data and write the file. + """ + sdf_data = self._build_sdf_structure() + + self.output_path.parent.mkdir(parents=True, exist_ok=True) + with open(self.output_path, "w", encoding="utf-8") as f: + yaml.dump(sdf_data, f, indent=2, sort_keys=False, default_flow_style=False) + + logger.info(f"SDF file written to: {self.output_path}") + + def _build_sdf_structure(self) -> dict: + """ + Assembles the final SDF data dictionary. + """ + copyright_slice_name = f"{self.package_name}_copyright" + sdf_data = { + "package": self.package_name, + "deps": ( + [copyright_slice_name] + if copyright_slice_name in self.classified_slices + else [] + ), + "slices": {}, + } + + files_slice_name = f"{self.package_name}{SLICE_TYPE_FILES}" + for slice_name, files in sorted(self.classified_slices.items()): + if slice_name == files_slice_name: + logger.debug(f"Ignoring '{slice_name}' slice from the final output.") + continue + + # Remove any files that were identified as invalid + valid_files = files - self.invalid_files + if not valid_files: + logger.info( + f"Slice '{slice_name}' is empty after filtering invalid files, excluding from SDF." + ) + continue + + short_slice_name = slice_name.replace(f"{self.package_name}_", "", 1) + slice_content = {} + + if self.slice_deps.get(slice_name): + slice_content["deps"] = sorted(list(self.slice_deps[slice_name])) + + # Apply path compression before adding to the structure + compressed_files = self._compress_paths(valid_files) + slice_content["contents"] = {"common": sorted(list(compressed_files))} + + sdf_data["slices"][short_slice_name] = slice_content + + return sdf_data + + def _compress_paths(self, file_set: set[str]) -> set[str]: + """ + Performs a robust path compression for versioned shared libraries. + This version uses a direct prefix matching approach. + """ + # We only attempt to compress files that look like libraries + libs = sorted([f for f in file_set if ".so" in f], key=len) + other_files = {f for f in file_set if ".so" not in f} + + if not libs: + return other_files + + # The core idea: iterate through the sorted libs. If a lib is a prefix + # of subsequent libs, it becomes a candidate for a wildcard. + compressed_libs = set() + + # Use a boolean array to mark which libraries have been consumed + # by a wildcard prefix. + consumed = [False] * len(libs) + + for i in range(len(libs)): + if consumed[i]: + continue + + # The current library is a potential prefix + prefix = libs[i] + is_prefix_for_others = False + + for j in range(i + 1, len(libs)): + if libs[j].startswith(prefix): + # If we find at least one longer file that starts with our prefix, + # it confirms this is a valid compression case. 
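+ # For example (see test_versioned_libs above): "libtest.so.1" consumes
+ # "libtest.so.1.2" and "libtest.so.1.2.3", and "libtest.so.1*" is emitted.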
+ is_prefix_for_others = True + consumed[j] = True # Mark the longer path as consumed + + if is_prefix_for_others: + # Add the prefix with a wildcard + compressed_libs.add(f"{prefix}*") + else: + # If it wasn't a prefix for any other lib, add it as is + compressed_libs.add(prefix) + + return compressed_libs.union(other_files) diff --git a/tools/main.py b/tools/main.py index fa535a4..678e0e4 100644 --- a/tools/main.py +++ b/tools/main.py @@ -1,5 +1,6 @@ import click from tools.cmd.cut import cut +from tools.cmd.gen import gen @click.group(help=""" @@ -12,6 +13,7 @@ def entrance(): def _add_commands(): # Unified interface for extension. entrance.add_command(cut) + entrance.add_command(gen) def main(): _add_commands() -- Gitee From 1ca11de8811bb8fe5b8173f6028270f39e50e48c Mon Sep 17 00:00:00 2001 From: mahaoliang Date: Tue, 29 Jul 2025 12:19:25 +0800 Subject: [PATCH 2/6] fix(tools): enhance ELF file validation in dependency analysis - Pre-check files with `file` command to ensure they are ELF binaries - Skip dependency analysis for non-ELF files - Improve error handling for `file` and `readelf` commands - Provide clearer logging and error messages --- tools/generator/dependency_analyzer.py | 52 +++++++++++++++++++++++--- 1 file changed, 47 insertions(+), 5 deletions(-) diff --git a/tools/generator/dependency_analyzer.py b/tools/generator/dependency_analyzer.py index 37a2587..43fc755 100644 --- a/tools/generator/dependency_analyzer.py +++ b/tools/generator/dependency_analyzer.py @@ -79,9 +79,6 @@ class DependencyAnalyzer: # 2. Get dependencies for a single binary file needed_libs, is_valid_elf = self._get_needed_libraries(full_file_path) if not is_valid_elf: - logger.info( - f"'{os.path.basename(file)}' is not a valid ELF file. It will be excluded from the SDF." - ) invalid_files.add(file) continue @@ -166,8 +163,44 @@ class DependencyAnalyzer: def _get_needed_libraries(self, binary_path: str) -> tuple[list[str], bool]: """ - Runs `readelf -d` on a binary and returns a list of its needed libraries. + Runs `readelf -d` on a binary after pre-checking with `file` command. + Returns: + A tuple containing: + - A list of needed libraries. + - A boolean indicating if the binary is a valid ELF file. """ + # Step 1: Pre-check the file type using the `file` command. + try: + file_proc = subprocess.run( + ["file", "-L", binary_path], + capture_output=True, + text=True, + check=True, + encoding="utf-8", + errors="ignore", + ) + file_type_desc = file_proc.stdout.lower() + + # If the file is not an ELF binary, skip dependency analysis. + if "elf" not in file_type_desc: + logger.info( + f"Skipping dependency analysis for non-ELF file: " + f"{os.path.basename(binary_path)} ({file_type_desc.strip()})" + ) + return [], False + + except FileNotFoundError as e: + raise RuntimeError( + "FATAL: `file` command not found. Please ensure the `file` " + "package is installed." + ) from e + except subprocess.CalledProcessError as e: + raise RuntimeError( + f"`file` command failed on {os.path.basename(binary_path)}: " + f"{e.stderr.strip()}. Halting analysis." + ) from e + + # Step 2: If it's an ELF binary, run `readelf -d` to get needed libraries. needed = [] try: readelf_result = subprocess.run( @@ -184,8 +217,17 @@ class DependencyAnalyzer: if match: needed.append(match.group(1)) return needed, True + except FileNotFoundError as e: + raise RuntimeError( + "FATAL: `readelf` command not found. Please ensure the `binutils` " + "package is installed." 
+ ) from e except subprocess.CalledProcessError as e: - return [], False + raise RuntimeError( + f"readelf failed on '{os.path.basename(binary_path)}' " + f"which was identified as an ELF file. The file might be corrupted. " + f"Halting analysis. Error: {e.stderr.strip()}" + ) from e def _resolve_library_to_package(self, lib_name: str) -> str: """ -- Gitee From f153e103061c8f1bd67a464eced8c260b294274c Mon Sep 17 00:00:00 2001 From: mahaoliang Date: Tue, 29 Jul 2025 12:40:07 +0800 Subject: [PATCH 3/6] refactor(tools): improve code readability and structure in DependencyAnalyzer --- tools/generator/dependency_analyzer.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tools/generator/dependency_analyzer.py b/tools/generator/dependency_analyzer.py index 43fc755..5cbdcfc 100644 --- a/tools/generator/dependency_analyzer.py +++ b/tools/generator/dependency_analyzer.py @@ -60,9 +60,10 @@ class DependencyAnalyzer: slice_deps = defaultdict(set) invalid_files = set() - # 1. Pre-load the ldconfig cache once for the entire analysis + # Pre-load the ldconfig cache once for the entire analysis self._load_ldconfig_cache() + # Iterate through classified slices to find binaries and libraries for slice_name, file_set in self.classified_slices.items(): if not ( slice_name.endswith(SLICE_TYPE_BINS) @@ -76,14 +77,14 @@ class DependencyAnalyzer: if not os.path.isfile(full_file_path): continue - # 2. Get dependencies for a single binary file + # Get dependencies for a single binary file needed_libs, is_valid_elf = self._get_needed_libraries(full_file_path) if not is_valid_elf: invalid_files.add(file) continue for lib_name in needed_libs: - # 3. Resolve the library name to a package owner + # Resolve the library name to a package owner owner_pkg = self._resolve_library_to_package(lib_name) if owner_pkg and owner_pkg != self.package_name: -- Gitee From fff2704d3e6b4f88f8409cb05082abb065fab597 Mon Sep 17 00:00:00 2001 From: mahaoliang Date: Fri, 1 Aug 2025 22:59:11 +0800 Subject: [PATCH 4/6] =?UTF-8?q?=E9=87=8D=E6=9E=84=EF=BC=9A=E4=BC=98?= =?UTF-8?q?=E5=8C=96=E6=96=87=E4=BB=B6=E5=88=86=E7=B1=BB=E4=B8=8E=E4=BE=9D?= =?UTF-8?q?=E8=B5=96=E5=88=86=E6=9E=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 在文件分类时,剔除无法归类的文件。 - 对于 bin 和 lib 中的文件,不是简单的使用扩展名 .so 进行判断,而是用 file 判断是否为 ELF 类型。 --- tools/generator/classifier.py | 136 +++++++++++++++---------- tools/generator/dependency_analyzer.py | 120 +++++++--------------- tools/generator/sdfgenerator.py | 27 ++--- tools/generator/writer.py | 18 +--- 4 files changed, 132 insertions(+), 169 deletions(-) diff --git a/tools/generator/classifier.py b/tools/generator/classifier.py index 717c882..ac6bbe9 100644 --- a/tools/generator/classifier.py +++ b/tools/generator/classifier.py @@ -1,41 +1,43 @@ from collections import defaultdict import os from pathlib import Path +import subprocess # Slice Type Definitions SLICE_TYPE_COPYRIGHT = "_copyright" SLICE_TYPE_CONFIG = "_config" SLICE_TYPE_BINS = "_bins" SLICE_TYPE_LIBS = "_libs" -SLICE_TYPE_FILES = "_files" - -# Rule Definition -CLASSIFICATION_RULES = [ - (SLICE_TYPE_COPYRIGHT, ("is_copyright", True)), - (SLICE_TYPE_CONFIG, ("prefix", "/etc/")), - ( - SLICE_TYPE_BINS, - ("prefix", "/usr/bin/"), - ("prefix", "/usr/sbin/"), - ("prefix", "/usr/libexec/"), - ), - (SLICE_TYPE_LIBS, ("is_library", True)), - (SLICE_TYPE_FILES, ("is_other", True)), -] - -MATCH_FUNCTIONS = { - "prefix": str.startswith, - "is_copyright": lambda path, _: _is_copyright(path), - "is_library": 
lambda path, _: _is_library(path), - "is_other": lambda path, _: _is_other(path), -} - -IGNORE_PREFIXES = [ - "/etc/ima/", -] -def _is_copyright(filepath: str) -> bool: +def _get_file_type_desc(filepath: str) -> str: + """ + Uses `file -L` to get the canonical type of a file. + Returns a lower-cased description string. + """ + if not os.path.exists(filepath): + return "" + try: + file_proc = subprocess.run( + ["file", "-L", filepath], + capture_output=True, + text=True, + check=True, + encoding="utf-8", + errors="ignore", + ) + return file_proc.stdout.lower() + except FileNotFoundError as e: + raise RuntimeError( + "FATAL: `file` command not found. Please ensure the `file` package is installed." + ) from e + except subprocess.CalledProcessError as e: + raise RuntimeError( + f"`file` command failed on {filepath}: {e.stderr.strip()}. Halting analysis." + ) from e + + +def _is_copyright(filepath: str, pkg_root_dir: str) -> bool: """ Checks if a file is a copyright/license file. This includes standard paths and special cases found in doc directories. @@ -46,7 +48,7 @@ def _is_copyright(filepath: str) -> bool: # Case-insensitive check for common license filenames filename = Path(filepath).name.lower() - if filename in ("license", "copying", "copyright", "notice"): + if any(keyword in filename for keyword in ("license", "copying", "copyright", "notice")): # If the filename itself suggests it's a license, it's a strong indicator. # This will catch cases like '/usr/share/doc/c-ares/LICENSE.md' return True @@ -54,48 +56,80 @@ def _is_copyright(filepath: str) -> bool: return False -def _is_library(filepath: str) -> bool: +def _is_config(filepath: str, pkg_root_dir: str) -> bool: + if filepath.startswith("/etc/ima/"): + return False + return filepath.startswith("/etc/") + + +def _is_bin(filepath: str, pkg_root_dir: str) -> bool: + """ + Checks if a file is a binary executable. + """ + if not (filepath.startswith(("/usr/bin/", "/usr/sbin/", "/usr/libexec/"))): + return False + + full_path = os.path.join(pkg_root_dir, filepath.lstrip("/")) + desc = _get_file_type_desc(full_path) + return "elf" in desc and "executable" in desc + + +def _is_library(filepath: str, pkg_root_dir: str) -> bool: """Checks if a file is a shared library.""" - return ".so" in filepath and ( - filepath.startswith("/usr/lib") or filepath.startswith("/lib") - ) + if not (filepath.startswith("/usr/lib") or filepath.startswith("/lib")): + return False + + full_path = os.path.join(pkg_root_dir, filepath.lstrip("/")) + desc = _get_file_type_desc(full_path) + return "elf" in desc and "shared object" in desc + +def _get_pkg_files(pkg_root_dir: str) -> list[str]: + files = [] + for root, _, filenames in os.walk(pkg_root_dir): + for filename in filenames: + full_path = os.path.join(root, filename) + rel_path = os.path.relpath(full_path, pkg_root_dir) + files.append(f"/{rel_path}") + return files -def _is_other(filepath: str) -> bool: - """Default rule that always matches.""" - return True +# Rule Definition +CLASSIFICATION_RULES = [ + (SLICE_TYPE_COPYRIGHT, _is_copyright), + (SLICE_TYPE_CONFIG, _is_config), + (SLICE_TYPE_BINS, _is_bin), + (SLICE_TYPE_LIBS, _is_library), +] -def classify_files(package_name: str, files: list[str]) -> dict[str, set[str]]: + +def classify_files(package_name: str, pkg_root_dir: str) -> dict[str, set[str]]: """ Classifies a list of files into slices based on a defined set of rules. Args: package_name: The name of the package. - files: A list of file paths from the RPM. 
+ pkg_root_dir: The root path where files were extracted. Returns: A dictionary mapping slice names to a set of file paths. """ + files = _get_pkg_files(pkg_root_dir) classified_slices = defaultdict(set) + for filepath in files: - # Skip files that start with ignored prefixes - if any(filepath.startswith(prefix) for prefix in IGNORE_PREFIXES): - continue - slice_type_suffix = SLICE_TYPE_FILES - for rule_suffix, *conditions in CLASSIFICATION_RULES: - is_match = False - for match_type, pattern in conditions: - checker_func = MATCH_FUNCTIONS.get(match_type) - if checker_func and checker_func(filepath, pattern): - is_match = True - break - - if is_match: - slice_type_suffix = rule_suffix + slice_type_suffix = None + + for suffix, checker_func in CLASSIFICATION_RULES: + if checker_func(filepath, pkg_root_dir): + slice_type_suffix = suffix break + # If no rule matched, skip the file + if slice_type_suffix is None: + continue + slice_name = f"{package_name}{slice_type_suffix}" classified_slices[slice_name].add(os.path.normpath(filepath)) diff --git a/tools/generator/dependency_analyzer.py b/tools/generator/dependency_analyzer.py index 5cbdcfc..fb87477 100644 --- a/tools/generator/dependency_analyzer.py +++ b/tools/generator/dependency_analyzer.py @@ -29,46 +29,39 @@ class DependencyAnalyzer: self._file_to_pkg_cache = {} self._ldconfig_cache = None - def analyze(self) -> tuple[dict[str, set[str]], set[str]]: + def analyze(self) -> dict[str, set[str]]: """ Main entry point for the analysis. Orchestrates the process of analyzing dependencies for all relevant slices. Returns: - A tuple containing: - - A dictionary mapping slice names to sets of dependent slices. - - A set of invalid files that could not be processed. + - A dictionary mapping slice names to sets of dependent slices. """ # Step 1: Analyze external binary dependencies (existing logic) - slice_deps, invalid_files = self._analyze_external_dependencies() + slice_deps = self._analyze_external_dependencies() # Step 2: Inject internal dependencies (new logic) self._inject_internal_dependencies(slice_deps) - return slice_deps, invalid_files + return slice_deps - def _analyze_external_dependencies(self) -> tuple[dict[str, set[str]], set[str]]: + def _analyze_external_dependencies(self) -> dict[str, set[str]]: """ Analyzes dependencies on external packages using readelf. - This method checks for shared libraries used by binaries and maps them to their owning packages. + This method checks for shared libraries used by binaries + and maps them to their owning packages. Returns: - A tuple containing: - - A dictionary mapping slice names to sets of dependent slices. - - A set of invalid files that could not be processed. + - A dictionary mapping slice names to sets of dependent slices. 
""" slice_deps = defaultdict(set) - invalid_files = set() # Pre-load the ldconfig cache once for the entire analysis self._load_ldconfig_cache() # Iterate through classified slices to find binaries and libraries for slice_name, file_set in self.classified_slices.items(): - if not ( - slice_name.endswith(SLICE_TYPE_BINS) - or slice_name.endswith(SLICE_TYPE_LIBS) - ): + if not (slice_name.endswith(SLICE_TYPE_BINS) or slice_name.endswith(SLICE_TYPE_LIBS)): continue for file in file_set: @@ -78,10 +71,7 @@ class DependencyAnalyzer: continue # Get dependencies for a single binary file - needed_libs, is_valid_elf = self._get_needed_libraries(full_file_path) - if not is_valid_elf: - invalid_files.add(file) - continue + needed_libs = self._get_needed_libraries(full_file_path) for lib_name in needed_libs: # Resolve the library name to a package owner @@ -94,7 +84,7 @@ class DependencyAnalyzer: ) slice_deps[slice_name].add(dep_slice) - return slice_deps, invalid_files + return slice_deps def _inject_internal_dependencies(self, slice_deps: dict[str, set[str]]): """ @@ -111,16 +101,12 @@ class DependencyAnalyzer: if config_slice_name in self.classified_slices: # If bins slice exists, make it depend on the config slice if bins_slice_name in self.classified_slices: - logger.info( - f"Adding internal dependency: {bins_slice_name} -> {config_slice_name}" - ) + logger.info(f"Adding internal dependency: {bins_slice_name} -> {config_slice_name}") slice_deps[bins_slice_name].add(config_slice_name) # If libs slice exists, make it depend on the config slice if libs_slice_name in self.classified_slices: - logger.info( - f"Adding internal dependency: {libs_slice_name} -> {config_slice_name}" - ) + logger.info(f"Adding internal dependency: {libs_slice_name} -> {config_slice_name}") slice_deps[libs_slice_name].add(config_slice_name) def _load_ldconfig_cache(self): @@ -128,17 +114,19 @@ class DependencyAnalyzer: Executes `ldconfig -p` once and caches its output. """ logger.debug("Loading ldconfig cache...") - if self._ldconfig_cache is None: - try: - ldconfig_proc = subprocess.run( - ["ldconfig", "-p"], capture_output=True, text=True, check=True - ) - self._ldconfig_cache = ldconfig_proc.stdout - except (subprocess.CalledProcessError, FileNotFoundError): - logger.error( - "`ldconfig -p` failed. Dependency analysis will be severely impacted." - ) - self._ldconfig_cache = "" # Set to empty string to avoid re-running + try: + ldconfig_proc = subprocess.run( + ["ldconfig", "-p"], capture_output=True, text=True, check=True + ) + self._ldconfig_cache = ldconfig_proc.stdout + except FileNotFoundError as e: + raise RuntimeError( + "`ldconfig` command not found. Please ensure the `glibc` package is installed." + ) from e + except subprocess.CalledProcessError as e: + raise RuntimeError( + f"`ldconfig` command failed: {e.stderr.strip()}. Halting analysis." + ) from e def _get_pkg_owner(self, file_path: str) -> str: """Finds the package that owns a file using an instance cache.""" @@ -158,50 +146,22 @@ class DependencyAnalyzer: owner_pkg = rpm_q_qf_proc.stdout.strip() self._file_to_pkg_cache[file_path] = owner_pkg return owner_pkg - except (subprocess.CalledProcessError, FileNotFoundError): + except FileNotFoundError as e: + raise RuntimeError( + "FATAL: `rpm` command not found. Please ensure the `rpm` package is installed." 
+ ) from e + except subprocess.CalledProcessError: self._file_to_pkg_cache[file_path] = "" return "" - def _get_needed_libraries(self, binary_path: str) -> tuple[list[str], bool]: + def _get_needed_libraries(self, binary_path: str) -> list[str]: """ - Runs `readelf -d` on a binary after pre-checking with `file` command. + Runs `readelf -d` on a binary. Assumes the file is a valid ELF + as it has been pre-filtered by the classifier. + Returns: - A tuple containing: - - A list of needed libraries. - - A boolean indicating if the binary is a valid ELF file. + A list of needed libraries. """ - # Step 1: Pre-check the file type using the `file` command. - try: - file_proc = subprocess.run( - ["file", "-L", binary_path], - capture_output=True, - text=True, - check=True, - encoding="utf-8", - errors="ignore", - ) - file_type_desc = file_proc.stdout.lower() - - # If the file is not an ELF binary, skip dependency analysis. - if "elf" not in file_type_desc: - logger.info( - f"Skipping dependency analysis for non-ELF file: " - f"{os.path.basename(binary_path)} ({file_type_desc.strip()})" - ) - return [], False - - except FileNotFoundError as e: - raise RuntimeError( - "FATAL: `file` command not found. Please ensure the `file` " - "package is installed." - ) from e - except subprocess.CalledProcessError as e: - raise RuntimeError( - f"`file` command failed on {os.path.basename(binary_path)}: " - f"{e.stderr.strip()}. Halting analysis." - ) from e - - # Step 2: If it's an ELF binary, run `readelf -d` to get needed libraries. needed = [] try: readelf_result = subprocess.run( @@ -217,7 +177,7 @@ class DependencyAnalyzer: match = re.search(r"\[(.*)\]", line) if match: needed.append(match.group(1)) - return needed, True + return needed except FileNotFoundError as e: raise RuntimeError( "FATAL: `readelf` command not found. Please ensure the `binutils` " @@ -225,7 +185,7 @@ class DependencyAnalyzer: ) from e except subprocess.CalledProcessError as e: raise RuntimeError( - f"readelf failed on '{os.path.basename(binary_path)}' " + f"readelf failed on '{binary_path}' " f"which was identified as an ELF file. The file might be corrupted. " f"Halting analysis. Error: {e.stderr.strip()}" ) from e @@ -242,9 +202,7 @@ class DependencyAnalyzer: rf"\s+{re.escape(lib_name)}\s*.*=>\s*(/.*)", self._ldconfig_cache ) if not lib_path_match: - logger.warning( - f"Could not find path for library '{lib_name}' in ldconfig cache." 
- ) + logger.warning(f"Could not find path for library '{lib_name}' in ldconfig cache.") return "" lib_path = lib_path_match.group(1) diff --git a/tools/generator/sdfgenerator.py b/tools/generator/sdfgenerator.py index 8c4a27b..99fdf72 100644 --- a/tools/generator/sdfgenerator.py +++ b/tools/generator/sdfgenerator.py @@ -49,38 +49,25 @@ class SDFGenerator: logger.info(f"Package downloaded to: {local_pkg_path}") # extracting RPM files - pkg_dir = tempfile.TemporaryDirectory() - logger.info(f"Extracting {local_pkg_path} to {pkg_dir} for analysis...") - parse.extract_files(local_pkg_path, pkg_dir.name, ["/*"]) - pkg_files = get_pkg_files(pkg_dir.name) + pkg_root_dir = tempfile.TemporaryDirectory() + logger.info(f"Extracting {local_pkg_path} to {pkg_root_dir} for analysis...") + parse.extract_files(local_pkg_path, pkg_root_dir.name, ["/*"]) # Classify files into slices - classified_slices = classifier.classify_files(self.package, pkg_files) + classified_slices = classifier.classify_files(self.package, pkg_root_dir.name) for slice_name, files in classified_slices.items(): logger.info(f"Slice '{slice_name}' contains {files} ") # Analyze dependencies - analyzer = DependencyAnalyzer(self.package, pkg_dir.name, classified_slices) - slice_deps, invalid_files = analyzer.analyze() + analyzer = DependencyAnalyzer(self.package, pkg_root_dir.name, classified_slices) + slice_deps = analyzer.analyze() for slice_name, deps in slice_deps.items(): logger.info(f"Slice '{slice_name}' depends on: {deps}") # Write the SDF file - writer = SDFWriter( - self.output, self.package, classified_slices, slice_deps, invalid_files - ) + writer = SDFWriter(self.output, self.package, classified_slices, slice_deps) writer.write() rpm.clear(dnf_client) logger.info(f"===== Finished SDF Generation for: {self.package} =====") - - -def get_pkg_files(path): - files = [] - for root, _, filenames in os.walk(path): - for filename in filenames: - full_path = os.path.join(root, filename) - rel_path = os.path.relpath(full_path, path) - files.append(f"/{rel_path}") - return files diff --git a/tools/generator/writer.py b/tools/generator/writer.py index a1f14ea..41f6c7a 100644 --- a/tools/generator/writer.py +++ b/tools/generator/writer.py @@ -2,8 +2,6 @@ import yaml from pathlib import Path from tools.logger import logger -from tools.generator.classifier import SLICE_TYPE_FILES - class SDFWriter: """ @@ -16,13 +14,11 @@ class SDFWriter: package_name: str, classified_slices: dict[str, set[str]], slice_deps: dict[str, set[str]], - invalid_files: set[str] = None, ): self.output_path = Path(output) / f"{package_name}.yaml" self.package_name = package_name self.classified_slices = classified_slices self.slice_deps = slice_deps - self.invalid_files = invalid_files or set() def write(self): """ @@ -51,19 +47,7 @@ class SDFWriter: "slices": {}, } - files_slice_name = f"{self.package_name}{SLICE_TYPE_FILES}" for slice_name, files in sorted(self.classified_slices.items()): - if slice_name == files_slice_name: - logger.debug(f"Ignoring '{slice_name}' slice from the final output.") - continue - - # Remove any files that were identified as invalid - valid_files = files - self.invalid_files - if not valid_files: - logger.info( - f"Slice '{slice_name}' is empty after filtering invalid files, excluding from SDF." 
- ) - continue short_slice_name = slice_name.replace(f"{self.package_name}_", "", 1) slice_content = {} @@ -72,7 +56,7 @@ class SDFWriter: slice_content["deps"] = sorted(list(self.slice_deps[slice_name])) # Apply path compression before adding to the structure - compressed_files = self._compress_paths(valid_files) + compressed_files = self._compress_paths(files) slice_content["contents"] = {"common": sorted(list(compressed_files))} sdf_data["slices"][short_slice_name] = slice_content -- Gitee From d40052fce340d2f08bce4761b632466e2e6808f8 Mon Sep 17 00:00:00 2001 From: mahaoliang Date: Fri, 8 Aug 2025 11:01:33 +0800 Subject: [PATCH 5/6] =?UTF-8?q?=E5=AE=9E=E7=8E=B0=E5=9F=BA=E4=BA=8EDocker?= =?UTF-8?q?=E7=9A=84SDF=E7=94=9F=E6=88=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 为SDF生成器环境添加Dockerfile - 创建gen-sdf-docker.sh脚本,用于在Docker中运行SDF生成 - 移除EnvironmentManager类,并更新SDFGenerator以适配Docker - 重构SDF生成流程,使其在Docker容器内运行 --- bin/Dockerfile | 26 +++++++++ bin/gen-sdf-docker.sh | 100 ++++++++++++++++++++++++++++++++ tools/generator/environment.py | 96 ------------------------------ tools/generator/sdfgenerator.py | 58 +++++++++--------- 4 files changed, 153 insertions(+), 127 deletions(-) create mode 100644 bin/Dockerfile create mode 100755 bin/gen-sdf-docker.sh delete mode 100644 tools/generator/environment.py diff --git a/bin/Dockerfile b/bin/Dockerfile new file mode 100644 index 0000000..1bf3f67 --- /dev/null +++ b/bin/Dockerfile @@ -0,0 +1,26 @@ +# Dockerfile for the SDF Generator environment + +# Start from a specific openEuler release. +ARG RELEASE_TAG=24.03-lts +FROM hub.oepkgs.net/openeuler/openeuler:${RELEASE_TAG} + +# Install all the dependencies for the splitter tool. +RUN dnf install -y \ + python3-dnf \ + git \ + python3-pip \ + cpio \ + binutils \ + file && \ + dnf clean all + +# Copy the splitter source code into the image. +COPY . /splitter + +# Install the splitter tool itself and its dependencies. +RUN cd /splitter && \ + pip3 install -i https://repo.huaweicloud.com/repository/pypi/simple . + +WORKDIR /splitter + +CMD ["/bin/bash"] \ No newline at end of file diff --git a/bin/gen-sdf-docker.sh b/bin/gen-sdf-docker.sh new file mode 100755 index 0000000..e97fae5 --- /dev/null +++ b/bin/gen-sdf-docker.sh @@ -0,0 +1,100 @@ +#!/usr/bin/env bash +set -eu + +# base directory +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "${SCRIPT_DIR}/.." || exit 1 +BASE_DIR=$(pwd) + +# Help Function +usage() { + echo "Usage: $0 -p -r -o " + echo " -p, --package Required. The name of the RPM package to generate an SDF for (e.g., 'brotli')." + echo " -r, --release Required. The openEuler release (e.g., '24.03-LTS')." + echo " -o, --output Required. The directory to save the generated SDF file." + echo " -h, --help Show this help message." + exit 1 +} + +# Defaults +ARCH=$(uname -m) +RELEASE="" +PACKAGE_NAME="" +OUTPUT_DIR="" + +# Argument Parsing with getopts +while getopts ":p:r:o:h" opt; do + case ${opt} in + p ) + PACKAGE_NAME=$OPTARG + ;; + r ) + RELEASE=$OPTARG + ;; + o ) + OUTPUT_DIR=$OPTARG + ;; + h ) + usage + ;; + \? ) + echo "Invalid Option: -$OPTARG" 1>&2 + usage + ;; + : ) + echo "Invalid Option: -$OPTARG requires an argument" 1>&2 + usage + ;; + esac +done + +# Input Validation +if [[ -z "$PACKAGE_NAME" || -z "$RELEASE" || -z "$OUTPUT_DIR" ]]; then + echo "Error: Missing required arguments." 
+ usage +fi + +# Absolute path for the output directory for Docker mount +mkdir -p "${OUTPUT_DIR}" +ABS_OUTPUT_DIR="$(cd "$OUTPUT_DIR" && pwd)" + +RELEASE_TAG="${RELEASE,,}" # Convert to lowercase for consistency +CUSTOM_IMAGE_NAME="sdf-generator-base:${RELEASE_TAG}" +CONTAINER_NAME="sdf-generator-${PACKAGE_NAME}-$$" + +echo ">>> Starting SDF Generation for '${PACKAGE_NAME}'" +echo " Release: ${RELEASE}" +echo " Release Tag: ${RELEASE_TAG}" +echo " Arch: ${ARCH}" +echo " Output Dir: ${ABS_OUTPUT_DIR}" +echo " Using Docker Image: ${CUSTOM_IMAGE_NAME}" + +# Docker Image Build +echo ">>> Checking for custom base image: ${CUSTOM_IMAGE_NAME}" +if [[ -z "$(docker images -q "${CUSTOM_IMAGE_NAME}")" ]]; then + echo ">>> Base image not found. Building it now..." + docker build --no-cache --build-arg RELEASE_TAG="${RELEASE_TAG}" \ + -t "${CUSTOM_IMAGE_NAME}" \ + -f "${BASE_DIR}/bin/Dockerfile" \ + "${BASE_DIR}" + echo ">>> Base image built successfully." +else + echo ">>> Base image found." +fi + +echo ">>> Output SDF will be saved to: ${ABS_OUTPUT_DIR}" + +# The command to be executed inside the container. +INSTALL_CMD="dnf install -y ${PACKAGE_NAME}" +GENERATE_CMD="splitter gen -p ${PACKAGE_NAME} -r ${RELEASE} -o /output -a ${ARCH}" +FULL_COMMAND="${INSTALL_CMD} && ${GENERATE_CMD}" + +echo ">>> Starting Docker container from custom image..." +docker run --name "${CONTAINER_NAME}" \ + -v "${ABS_OUTPUT_DIR}:/output" \ + --rm \ + "${CUSTOM_IMAGE_NAME}" \ + /bin/bash -c "${FULL_COMMAND}" + +echo ">>> SDF Generation complete. Docker container has been removed." +echo ">>> Done." \ No newline at end of file diff --git a/tools/generator/environment.py b/tools/generator/environment.py deleted file mode 100644 index c7c8b48..0000000 --- a/tools/generator/environment.py +++ /dev/null @@ -1,96 +0,0 @@ -import subprocess -from tools.logger import logger - - -class EnvironmentManager: - """ - A context manager to prepare and clean up the system environment - for SDF analysis. - """ - - def __init__(self, package_to_install: str): - self.package_to_install = package_to_install - self.newly_installed_packages = [] - - def __enter__(self): - """ - The 'prepare' stage. This is executed when entering the 'with' block. - It installs dependencies and records what was installed. - """ - logger.info("Preparing environment by installing dependencies...") - - try: - # Get the list of packages BEFORE installation - before_install_proc = subprocess.run( - ["rpm", "-qa", "--qf", "%{NAME}\n"], - capture_output=True, - text=True, - check=True, - ) - packages_before = set(before_install_proc.stdout.strip().split("\n")) - - # Install the target package and its dependencies - subprocess.run( - ["dnf", "install", "-y", self.package_to_install], check=True - ) - - # Get the list of packages AFTER installation - after_install_proc = subprocess.run( - ["rpm", "-qa", "--qf", "%{NAME}\n"], - capture_output=True, - text=True, - check=True, - ) - packages_after = set(after_install_proc.stdout.strip().split("\n")) - - # Find the difference to know what to clean up - self.newly_installed_packages = sorted( - list(packages_after - packages_before) - ) - - if self.newly_installed_packages: - logger.info( - f"The following packages were newly installed: {self.newly_installed_packages}" - ) - else: - logger.info( - "No new packages were installed (all dependencies were already met)." - ) - - logger.info("Environment prepared successfully.") - - except subprocess.CalledProcessError as e: - logger.error(f"Failed to prepare environment. 
Command '{e.cmd}' failed.") - logger.error(f"Stderr: {e.stderr}") - # Re-raise to stop the process if preparation fails - raise - - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - """ - The 'cleanup' stage. This is executed when exiting the 'with' block, - regardless of whether an exception occurred. - """ - if self.newly_installed_packages: - logger.info( - f"Cleaning up environment by removing {len(self.newly_installed_packages)} package(s)..." - ) - try: - cleanup_command = [ - "dnf", - "remove", - "-y", - ] + self.newly_installed_packages - subprocess.run(cleanup_command, check=True) - logger.info("Environment cleaned up successfully.") - except subprocess.CalledProcessError as e: - logger.error("CRITICAL: Environment cleanup failed!") - logger.error(f"Command '{e.cmd}' failed. Stderr: {e.stderr}") - logger.error( - f"The following packages may have been left on the system: {self.newly_installed_packages}" - ) - else: - logger.info("No new packages to clean up.") - - return False diff --git a/tools/generator/sdfgenerator.py b/tools/generator/sdfgenerator.py index 99fdf72..8bbafb8 100644 --- a/tools/generator/sdfgenerator.py +++ b/tools/generator/sdfgenerator.py @@ -9,7 +9,6 @@ from tools import SLICE_PATH from tools.generator import classifier from tools.generator.dependency_analyzer import DependencyAnalyzer from tools.generator.writer import SDFWriter -from tools.generator.environment import EnvironmentManager class SDFGenerator: @@ -33,41 +32,38 @@ class SDFGenerator: """ logger.info(f"===== Starting SDF Generation for: {self.package} =====") - # The 'with' statement automatically handles setup and teardown - with EnvironmentManager(self.package): + _clone_slices(self.release, SLICE_PATH) - _clone_slices(self.release, SLICE_PATH) - - # Initialize DNF client for downloading - logger.info(f"Downloading package: {self.package}...") - dnf_client = rpm.init_dnf_client(self.arch, self.release, self.output) - local_pkg_path = rpm.download(dnf_client, self.package) - if not local_pkg_path: - logger.error(f"Failed to download package {self.package}.") - rpm.clear(dnf_client) - return - logger.info(f"Package downloaded to: {local_pkg_path}") + # Initialize DNF client for downloading + logger.info(f"Downloading package: {self.package}...") + dnf_client = rpm.init_dnf_client(self.arch, self.release, self.output) + local_pkg_path = rpm.download(dnf_client, self.package) + if not local_pkg_path: + logger.error(f"Failed to download package {self.package}.") + rpm.clear(dnf_client) + return + logger.info(f"Package downloaded to: {local_pkg_path}") - # extracting RPM files - pkg_root_dir = tempfile.TemporaryDirectory() - logger.info(f"Extracting {local_pkg_path} to {pkg_root_dir} for analysis...") - parse.extract_files(local_pkg_path, pkg_root_dir.name, ["/*"]) + # extracting RPM files + pkg_root_dir = tempfile.TemporaryDirectory() + logger.info(f"Extracting {local_pkg_path} to {pkg_root_dir} for analysis...") + parse.extract_files(local_pkg_path, pkg_root_dir.name, ["/*"]) - # Classify files into slices - classified_slices = classifier.classify_files(self.package, pkg_root_dir.name) - for slice_name, files in classified_slices.items(): - logger.info(f"Slice '{slice_name}' contains {files} ") + # Classify files into slices + classified_slices = classifier.classify_files(self.package, pkg_root_dir.name) + for slice_name, files in classified_slices.items(): + logger.info(f"Slice '{slice_name}' contains {files} ") - # Analyze dependencies - analyzer = DependencyAnalyzer(self.package, 
pkg_root_dir.name, classified_slices) - slice_deps = analyzer.analyze() - for slice_name, deps in slice_deps.items(): - logger.info(f"Slice '{slice_name}' depends on: {deps}") + # Analyze dependencies + analyzer = DependencyAnalyzer(self.package, pkg_root_dir.name, classified_slices) + slice_deps = analyzer.analyze() + for slice_name, deps in slice_deps.items(): + logger.info(f"Slice '{slice_name}' depends on: {deps}") - # Write the SDF file - writer = SDFWriter(self.output, self.package, classified_slices, slice_deps) - writer.write() + # Write the SDF file + writer = SDFWriter(self.output, self.package, classified_slices, slice_deps) + writer.write() - rpm.clear(dnf_client) + rpm.clear(dnf_client) logger.info(f"===== Finished SDF Generation for: {self.package} =====") -- Gitee From 10f57b8ff011760c4fd2b3ee794101a8433baa95 Mon Sep 17 00:00:00 2001 From: mahaoliang Date: Tue, 12 Aug 2025 11:09:45 +0800 Subject: [PATCH 6/6] ensure Docker is installed and running before building --- bin/gen-sdf-docker.sh | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/bin/gen-sdf-docker.sh b/bin/gen-sdf-docker.sh index e97fae5..e986cb2 100755 --- a/bin/gen-sdf-docker.sh +++ b/bin/gen-sdf-docker.sh @@ -6,6 +6,21 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" cd "${SCRIPT_DIR}/.." || exit 1 BASE_DIR=$(pwd) +ensure_docker_ready() { + # Check if Docker command exists + if ! command -v docker &> /dev/null; then + echo ">>> Docker not found. Installing ..." + # Check for root privileges. Exit if not root. + if [[ $EUID -ne 0 ]]; then + echo "This script must be run as root to install Docker." + exit 1 + fi + dnf install -y docker + systemctl restart docker + echo ">>> Docker installed." + fi +} + # Help Function usage() { echo "Usage: $0 -p -r -o " @@ -54,6 +69,8 @@ if [[ -z "$PACKAGE_NAME" || -z "$RELEASE" || -z "$OUTPUT_DIR" ]]; then usage fi +ensure_docker_ready + # Absolute path for the output directory for Docker mount mkdir -p "${OUTPUT_DIR}" ABS_OUTPUT_DIR="$(cd "$OUTPUT_DIR" && pwd)" -- Gitee
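A minimal usage sketch for the entry points added in this series, based on tools/cmd/gen.py and bin/gen-sdf-docker.sh (the package name 'attr' and the output directory are placeholders; the release value follows the docker script's convention, i.e. '24.03-LTS' without the 'openEuler-' prefix, which SDFGenerator prepends itself):

    # Generate an SDF directly on an openEuler host (the code requires binutils and file to be installed)
    splitter gen -p attr -r 24.03-LTS -o ./sdf-output

    # Or run the same generation inside a disposable Docker container
    bash bin/gen-sdf-docker.sh -p attr -r 24.03-LTS -o ./sdf-output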