From f5d46f4ca92b5d1088afbb371a8879a15b185bcc Mon Sep 17 00:00:00 2001 From: "maofeng.huang" Date: Tue, 18 Mar 2025 19:05:50 +0800 Subject: [PATCH] Support IXDCGM StartHostengine mode --- README.md | 33 +++++++++- pkg/ixdcgm/admin.go | 117 ++++++++++++++++++++++++++++++++++ pkg/ixdcgm/api.go | 63 +++--------------- pkg/ixdcgm/embedded.go | 4 -- pkg/ixdcgm/standalone.go | 3 - pkg/ixdcgm/startHostengine.go | 108 +++++++++++++++++++++++++++++-- pkg/ixdcgm/utils.go | 9 +++ samples/devicecommon/main.go | 4 +- 8 files changed, 271 insertions(+), 70 deletions(-) create mode 100644 pkg/ixdcgm/admin.go diff --git a/README.md b/README.md index 00f5ed1..7c84334 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,11 @@ ## Introduction IXDCGM is a tool provided for monitoring and managing IX GPUs, offering a rich set of APIs to retrieve information about GPU status, performance, power consumption, and more. -Go-ixdcgm is a wrapper library for ixdcgm written in Go language, providing a simple set of functions that facilitate the easy invocation of ixdcgm's APIs. +Go-IXDCGM is a wrapper library for IXDCGM written in Go language, providing a simple set of functions that facilitate the easy invocation of IXDCGM's APIs. + +**Note:** +- The runtime environment requires the **libixdcgm.so** library; please install the IXDCGM SDK first. +- The current version of Go-IXDCGM is compatible with IX driver version **4.2.0**. ## Install @@ -78,9 +82,34 @@ func main() { } ``` +## IXDCGM running modes +IXDCGM can be run in three different ways. + +#### Embedded Mode +In embedded mode, hostengine is started as part of the running process and is loaded as a shared library. In this mode, metrics are also updated and collected automatically. This mode is recommended for users who want to avoid managing an autonomous hostengine. + +#### Standalone Mode +This mode allows you to connect to a running hostengine using a specified TCP/IP or Unix socket address. 
It is recommended for remote connections to the hostengine. By default, IXDCGM assumes a TCP connection and attempts to connect to localhost:5777, unless specified otherwise. + +If the hostengine is running at a different address, pass it to `-connect`: +- "IP" - A valid IP address for the remote hostengine, at port 5777. +- "IP:PORT" - A valid IP address and port. + +The `-socket` parameter identifies whether the passed `-connect` address is a Unix socket filename (1) or a TCP/IP address (0): +- "0" - The given address is a TCP/IP address. +- "1" - The given address is a Unix socket filename. + +For example: +``` +go run samples/devicecommon/main.go -connect "0.0.0.0:5777" -socket "0" +``` + +#### StartHostengine Mode +This is an add-on mode which opens a Unix socket for starting and connecting to the hostengine. The hostengine is started as a child process of the running process and automatically terminated on exit. When operating in this mode, make sure to stop an already running hostengine to avoid any connection address conflicts. This mode is recommended for safely integrating IXDCGM in an already existing setup. + ## More Samples -The `samples` folder contains more simple examples of how to use go-ixdcgm to call the ixdcgm API. +The `samples` folder contains more simple examples of how to use go-ixdcgm to call the IXDCGM API. To get device information, run the following command: ``` diff --git a/pkg/ixdcgm/admin.go b/pkg/ixdcgm/admin.go new file mode 100644 index 0000000..4ffca5e --- /dev/null +++ b/pkg/ixdcgm/admin.go @@ -0,0 +1,117 @@ +/* +Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); you may +not use this file except in compliance with the License. 
You may obtain +a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package ixdcgm + +/* +#cgo linux LDFLAGS: -ldl -Wl,--export-dynamic -Wl,--unresolved-symbols=ignore-in-object-files +#include <dlfcn.h> +#include <stdlib.h> +#include "include/dcgm_agent.h" +#include "include/dcgm_structs.h" +*/ +import "C" + +import ( + "fmt" + "log" + "sync" + "unsafe" +) + +var ( + ixdcgmLibHandler unsafe.Pointer + ixdcgmInitCounter int + connection Interface +) + +var ( + uptDirMu sync.Mutex + ixdcgmBinDir = "/usr/local/ixdcgm/bin" + ixdcgmLibDir = "/usr/local/ixdcgm/lib64" +) + +const ixdcgmLib = "libixdcgm.so" + +func initIxDcgm(m int) (err error) { + lib := string2Char(ixdcgmLib) + defer freeCString(lib) + + ixdcgmLibHandler = C.dlopen(lib, C.RTLD_LAZY|C.RTLD_GLOBAL) + if ixdcgmLibHandler == nil { + errMsg := C.GoString(C.dlerror()) + log.Printf("failed to load %s from system library path: %s\ntry to load from %s\n", + ixdcgmLib, errMsg, ixdcgmLibDir) + + abslib := string2Char(ixdcgmLibDir + "/" + ixdcgmLib) + defer freeCString(abslib) + ixdcgmLibHandler = C.dlopen(abslib, C.RTLD_LAZY|C.RTLD_GLOBAL) + } + if ixdcgmLibHandler == nil { + errMsg := C.GoString(C.dlerror()) + return fmt.Errorf("failed to load %s, err: %s", ixdcgmLib, errMsg) + } + + connection, err = New(m) + if err != nil { + return err + } + return nil +} + +func shutdown() (err error) { + mux.Lock() + defer mux.Unlock() + if ixdcgmInitCounter <= 0 { + return fmt.Errorf("ixdcgm already shutdown") + } + + if ixdcgmInitCounter == 1 { + err = connection.Shutdown() + if err != nil { + return err + } + } + + C.dlclose(ixdcgmLibHandler) + ixdcgmInitCounter -= 1 + return nil +} + +func SetIxDcgmBinDir(dir 
string) error { + uptDirMu.Lock() + defer uptDirMu.Unlock() + + path, err := parseDirPath(dir) + if err != nil { + return err + } + ixdcgmBinDir = path + return nil +} + +func SetIxDcgmLibDir(dir string) error { + uptDirMu.Lock() + defer uptDirMu.Unlock() + + path, err := parseDirPath(dir) + if err != nil { + return err + } + ixdcgmLibDir = path + return nil +} diff --git a/pkg/ixdcgm/api.go b/pkg/ixdcgm/api.go index 0a1e2aa..fc749c2 100644 --- a/pkg/ixdcgm/api.go +++ b/pkg/ixdcgm/api.go @@ -17,53 +17,26 @@ limitations under the License. package ixdcgm -/* -#cgo LDFLAGS: -ldl - -#include <dlfcn.h> -#include <stdlib.h> -#include "include/dcgm_agent.h" -#include "include/dcgm_structs.h" -*/ import "C" import ( "context" "fmt" "sync" - "unsafe" _ "gitee.com/deep-spark/go-ixdcgm/pkg/ixdcgm/include" ) var ( - ixdcgmLibHandler unsafe.Pointer - ixdcgmInitCounter int - mux sync.Mutex - connection Interface - handle DcgmHandle + mux sync.Mutex + handle DcgmHandle ) -// dynamic library path -const ( - ixdcgmLib = "libixdcgm.so" -) - -func initIxDcgm(m int) (err error) { - lib := string2Char(ixdcgmLib) - defer freeCString(lib) - - ixdcgmLibHandler = C.dlopen(lib, C.RTLD_LAZY|C.RTLD_GLOBAL) - if ixdcgmLibHandler == nil { - return fmt.Errorf("failed to load %s", ixdcgmLib) - } - - connection, err = New(m) - if err != nil { - return err - } - return nil -} - +// Init starts IXDCGM, based on the user selected mode +// IXDCGM can be started in 3 different modes: +// 1. Embedded: Start hostengine within this process +// 2. Standalone: Connect to an already running ix-hostengine at the specified address +// Connection address can be passed as command line args: -connect "IP:PORT/Socket" -socket "isSocket" +// 3. 
StartHostengine: Open a Unix socket to start and connect to the ix-hostengine and terminate before exiting func Init(m int, args ...string) (cleanup func(), err error) { mux.Lock() defer mux.Unlock() @@ -90,26 +63,6 @@ func Init(m int, args ...string) (cleanup func(), err error) { return cleanup, err } -func shutdown() (err error) { - mux.Lock() - defer mux.Unlock() - - if ixdcgmInitCounter <= 0 { - return fmt.Errorf("ixdcgm already shutdown") - } - - if ixdcgmInitCounter == 1 { - err = connection.Shutdown() - if err != nil { - return err - } - } - - C.dlclose(ixdcgmLibHandler) - ixdcgmInitCounter -= 1 - return nil -} - func GetAllDeviceCount() (uint, error) { return getAllDeviceCount() } diff --git a/pkg/ixdcgm/embedded.go b/pkg/ixdcgm/embedded.go index 09c2a29..50f59ea 100644 --- a/pkg/ixdcgm/embedded.go +++ b/pkg/ixdcgm/embedded.go @@ -19,9 +19,6 @@ package ixdcgm /* #cgo linux LDFLAGS: -ldl -Wl,--export-dynamic -Wl,--unresolved-symbols=ignore-in-object-files - -#include -#include #include "include/dcgm_agent.h" #include "include/dcgm_structs.h" */ @@ -29,7 +26,6 @@ import "C" import "fmt" type embedded struct { - // TODO: implement embeded mode. } func (e *embedded) Shutdown() error { diff --git a/pkg/ixdcgm/standalone.go b/pkg/ixdcgm/standalone.go index 486b156..9d229cd 100644 --- a/pkg/ixdcgm/standalone.go +++ b/pkg/ixdcgm/standalone.go @@ -19,9 +19,6 @@ package ixdcgm /* #cgo linux LDFLAGS: -ldl -Wl,--export-dynamic -Wl,--unresolved-symbols=ignore-in-object-files - -#include -#include #include "include/dcgm_agent.h" #include "include/dcgm_structs.h" */ diff --git a/pkg/ixdcgm/startHostengine.go b/pkg/ixdcgm/startHostengine.go index 0a46990..5ef5c9f 100644 --- a/pkg/ixdcgm/startHostengine.go +++ b/pkg/ixdcgm/startHostengine.go @@ -17,17 +17,117 @@ limitations under the License. 
package ixdcgm -import "fmt" +/* +#cgo linux LDFLAGS: -ldl -Wl,--export-dynamic -Wl,--unresolved-symbols=ignore-in-object-files +#include "include/dcgm_agent.h" +#include "include/dcgm_structs.h" +*/ +import "C" +import ( + "fmt" + "os" + "os/exec" + "syscall" + "unsafe" +) + +var ( + hostengineAsChildPid int + startHostengineDir = "/tmp" +) type startHostengine struct { - // TODO: implement embeded mode. } func (s *startHostengine) Shutdown() (err error) { - return nil + if err = s.disconnect(); err != nil { + return + } + + // terminate ix-hostengine + cmd := exec.Command("ix-hostengine", "--term") + cmd.Env = append(os.Environ(), + "PATH="+os.Getenv("PATH")+":"+ixdcgmBinDir, // extend the inherited PATH; the variable is named "PATH", not "PATH=" + "LD_LIBRARY_PATH="+os.Getenv("LD_LIBRARY_PATH")+":"+ixdcgmLibDir, + ) + if err = cmd.Run(); err != nil { + return fmt.Errorf("Error terminating ix-hostengine: %s", err) + } + fmt.Println("Successfully terminated ix-hostengine.") + + return syscall.Kill(hostengineAsChildPid, syscall.SIGKILL) +} + +func (s *startHostengine) disconnect() (err error) { + result := C.dcgmDisconnect(handle.handle) + if err = errorString(result); err != nil { + return fmt.Errorf("Error disconnecting from ix-hostengine: %s", err) + } + + result = C.dcgmShutdown() + if err = errorString(result); err != nil { + return fmt.Errorf("Error shutting down IXDCGM: %s", err) + } + return } func (s startHostengine) Start(args ...string) (DcgmHandle, error) { fmt.Println("Start ixdcgm based on StartHostengine mode.") - return DcgmHandle{}, nil + + os.Setenv("PATH", os.Getenv("PATH")+":"+ixdcgmBinDir) // append to the existing PATH instead of overwriting it + bin, err := exec.LookPath("ix-hostengine") + if err != nil { + return DcgmHandle{}, fmt.Errorf("Error finding ix-hostengine: %s", err) + } + var procAttr syscall.ProcAttr + procAttr.Files = []uintptr{ + uintptr(syscall.Stdin), + uintptr(syscall.Stdout), + uintptr(syscall.Stderr)} + procAttr.Sys = &syscall.SysProcAttr{Setpgid: true} + procAttr.Env = []string{"LD_LIBRARY_PATH=" + os.Getenv("LD_LIBRARY_PATH") + ":" + 
ixdcgmLibDir} + + dir := startHostengineDir + socketFile, err := os.CreateTemp(dir, "ixdcgm") + if err != nil { + return DcgmHandle{}, fmt.Errorf("Error creating socket file in %s directory: %s", dir, err) + } + + socketPath := socketFile.Name() + defer os.Remove(socketPath) + connectArg := "--domain-socket" + hostengineAsChildPid, err = syscall.ForkExec(bin, []string{bin, connectArg, socketPath}, &procAttr) + if err != nil { + return DcgmHandle{}, fmt.Errorf("Error fork-execing ix-hostengine: %s", err) + } + result := C.dcgmInit() + if err = errorString(result); err != nil { + return DcgmHandle{}, fmt.Errorf("Error initializing IXDCGM: %s", err) + } + + var cHandle C.dcgmHandle_t + var connectParams C.dcgmConnectV2Params_v2 + connectParams.version = makeVersion2(unsafe.Sizeof(connectParams)) + connectParams.addressIsUnixSocket = C.uint(1) + cSockPath := C.CString(socketPath) + defer freeCString(cSockPath) + + result = C.dcgmConnect_v2(cSockPath, &connectParams, &cHandle) + if err = errorString(result); err != nil { + return DcgmHandle{}, fmt.Errorf("Error connecting to ix-hostengine: %s", err) + } + + return DcgmHandle{handle: cHandle}, nil +} + +func SetStartHostengineDir(dir string) error { + uptDirMu.Lock() + defer uptDirMu.Unlock() + + path, err := parseDirPath(dir) + if err != nil { + return err + } + startHostengineDir = path + return nil } diff --git a/pkg/ixdcgm/utils.go b/pkg/ixdcgm/utils.go index eefe3e8..5f54ec2 100644 --- a/pkg/ixdcgm/utils.go +++ b/pkg/ixdcgm/utils.go @@ -30,6 +30,7 @@ package ixdcgm import "C" import ( "fmt" + "path/filepath" "strconv" "strings" "unsafe" @@ -174,3 +175,11 @@ func convertBitsetStr(input string) (output string) { output = strings.Join(result, ",") return } + +func parseDirPath(path string) (string, error) { + absPath, err := filepath.Abs(path) + if err != nil { + return "", fmt.Errorf("Error to parse dir path %s, err: %v", path, err) + } + return absPath, nil +} diff --git a/samples/devicecommon/main.go 
b/samples/devicecommon/main.go index f4c9651..32b0b79 100644 --- a/samples/devicecommon/main.go +++ b/samples/devicecommon/main.go @@ -25,8 +25,8 @@ import ( ) var ( - connectAddr = flag.String("connectAddr", "0.0.0.0:5777", "DCGM connect address") - isSocket = flag.String("socket", "0", "Connect to Unix socket") + connectAddr = flag.String("connect", "localhost:5777", "Provide ix-hostengine connection address.") + isSocket = flag.String("socket", "0", "Connecting to Unix socket") ) func main() { -- Gitee