diff --git a/README.md b/README.md index 00f5ed181cd40cd0a06057ca297cfab1ce82f60e..7c843345f190ff7bfb72107250099d2259cfebaf 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,11 @@ ## Introduction IXDCGM is a tool provided for monitoring and managing IX GPUs, offering a rich set of APIs to retrieve information about GPU status, performance, power consumption, and more. -Go-ixdcgm is a wrapper library for ixdcgm written in Go language, providing a simple set of functions that facilitate the easy invocation of ixdcgm's APIs. +Go-IXDCGM is a wrapper library for IXDCGM written in the Go language, providing a simple set of functions that facilitate the easy invocation of IXDCGM's APIs. + +**Note:** +- The runtime environment requires the **libixdcgm.so** library; please install the IXDCGM SDK first. +- The current version of Go-IXDCGM is compatible with IX driver version **4.2.0**. ## Install @@ -78,9 +82,34 @@ func main() { } ``` +## IXDCGM running modes +IXDCGM can be run in three different ways. + +#### Embedded Mode +In embedded mode, hostengine is started as part of the running process and is loaded as a shared library. In this mode, metrics are also updated and collected automatically. This mode is recommended for users who want to avoid managing an autonomous hostengine. + +#### Standalone Mode +This mode allows you to connect to a running hostengine using a specified TCP/IP or Unix socket address. It is recommended for remote connections to the hostengine. By default, IXDCGM assumes a TCP connection and attempts to connect to localhost:5777, unless specified otherwise. + +If the hostengine is running at a different address, pass it to `-connect`: +- "IP" - A valid IP address for the remote hostengine, at port 5777. +- "IP:PORT" - A valid IP address and port. + +The `-socket` parameter identifies whether the passed `-connect` address is a Unix socket filename (1) or a TCP/IP address (0): +- "0" - The given address is a TCP/IP address.
+- "1" - The given address is a Unix socket filename. + +For example: +``` +go run samples/devicecommon/main.go -connect "0.0.0.0:5777" -socket "0" +``` + +#### StartHostengine Mode +This is an add-on mode which opens an Unix socket for starting and connecting with hostengine. The hostengine is started as a child process of the running process and automatically terminated on exit. When operating in this mode, make sure to stop an already running hostengine to avoid any connection address conflicts. This mode is recommended for safely integrating IXDCGM in an already existing setup. + ## More Samples -The `samples` folder contains more simple examples of how to use go-ixdcgm to call the ixdcgm API. +The `samples` folder contains more simple examples of how to use go-ixdcgm to call the IXDCGM API. To get device information, run the following command: ``` diff --git a/pkg/ixdcgm/admin.go b/pkg/ixdcgm/admin.go new file mode 100644 index 0000000000000000000000000000000000000000..4ffca5e1141d4ea1620169bf5f68c40c9b740cec --- /dev/null +++ b/pkg/ixdcgm/admin.go @@ -0,0 +1,132 @@ +/* +Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); you may +not use this file except in compliance with the License. You may obtain +a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+*/ + +package ixdcgm + +/* +#cgo linux LDFLAGS: -ldl -Wl,--export-dynamic -Wl,--unresolved-symbols=ignore-in-object-files +#include +#include +#include "include/dcgm_agent.h" +#include "include/dcgm_structs.h" +*/ +import "C" + +import ( + "fmt" + "log" + "sync" + "unsafe" +) + +var ( + // ixdcgmLibHandler is the handle returned by dlopen for libixdcgm.so. + ixdcgmLibHandler unsafe.Pointer + // ixdcgmInitCounter is checked and decremented by shutdown. NOTE(review): presumably incremented by Init; confirm in api.go. + ixdcgmInitCounter int + // connection is the value returned by New for the selected mode; shutdown calls its Shutdown method. + connection Interface +) + +// uptDirMu is held by the Set*Dir functions while they update the directory settings declared with it. +var ( + uptDirMu sync.Mutex + ixdcgmBinDir = "/usr/local/ixdcgm/bin" + ixdcgmLibDir = "/usr/local/ixdcgm/lib64" +) + +// ixdcgmLib is the shared-library file name passed to dlopen. +const ixdcgmLib = "libixdcgm.so" + +// initIxDcgm loads libixdcgm.so with dlopen, first by bare file name and, if +// that fails, from ixdcgmLibDir. It then stores the result of New(m) in +// connection. An error is returned when both loads fail or when New fails. +func initIxDcgm(m int) (err error) { + lib := string2Char(ixdcgmLib) + defer freeCString(lib) + + ixdcgmLibHandler = C.dlopen(lib, C.RTLD_LAZY|C.RTLD_GLOBAL) + if ixdcgmLibHandler == nil { + errMsg := C.GoString(C.dlerror()) + log.Printf("failed to load %s from system library path: %s\ntry to load from %s\n", + ixdcgmLib, errMsg, ixdcgmLibDir) + + abslib := string2Char(ixdcgmLibDir + "/" + ixdcgmLib) + defer freeCString(abslib) + ixdcgmLibHandler = C.dlopen(abslib, C.RTLD_LAZY|C.RTLD_GLOBAL) + } + if ixdcgmLibHandler == nil { + errMsg := C.GoString(C.dlerror()) + return fmt.Errorf("failed to load %s, err: %s", ixdcgmLib, errMsg) + } + + connection, err = New(m) + if err != nil { + return err + } + return nil +} + +// shutdown runs under mux. It returns an error if ixdcgmInitCounter is already +// <= 0. When the counter is 1 it first calls connection.Shutdown and returns that +// error, if any, without going further. It then calls dlclose and decrements the counter. +func shutdown() (err error) { + mux.Lock() + defer mux.Unlock() + if ixdcgmInitCounter <= 0 { + return fmt.Errorf("ixdcgm already shutdown") + } + + if ixdcgmInitCounter == 1 { + err = connection.Shutdown() + if err != nil { + return err + } + } + + C.dlclose(ixdcgmLibHandler) + ixdcgmInitCounter -= 1 + return nil +} + +// SetIxDcgmBinDir sets the directory that StartHostengine mode appends to PATH to +// find ix-hostengine. dir is converted to an absolute path first. +func SetIxDcgmBinDir(dir string) error { + uptDirMu.Lock() + defer uptDirMu.Unlock() + + path, err := parseDirPath(dir) + if err != nil { + return err + } + ixdcgmBinDir = path + return nil +} + +// SetIxDcgmLibDir sets the fallback directory from which initIxDcgm loads +// libixdcgm.so. dir is converted to an absolute path first. +func SetIxDcgmLibDir(dir string) error { + uptDirMu.Lock() + defer uptDirMu.Unlock() + + path, err := parseDirPath(dir) + if err != nil { + return err + } + ixdcgmLibDir = path + return nil +} diff --git a/pkg/ixdcgm/api.go
b/pkg/ixdcgm/api.go index 0a1e2aa85892b6dc06f5566e6f29f043a02e5e03..fc749c263078d68b15bd337c853367ff59d698da 100644 --- a/pkg/ixdcgm/api.go +++ b/pkg/ixdcgm/api.go @@ -17,53 +17,26 @@ limitations under the License. package ixdcgm -/* -#cgo LDFLAGS: -ldl - -#include -#include -#include "include/dcgm_agent.h" -#include "include/dcgm_structs.h" -*/ import "C" import ( "context" "fmt" "sync" - "unsafe" _ "gitee.com/deep-spark/go-ixdcgm/pkg/ixdcgm/include" ) var ( - ixdcgmLibHandler unsafe.Pointer - ixdcgmInitCounter int - mux sync.Mutex - connection Interface - handle DcgmHandle + mux sync.Mutex + handle DcgmHandle ) -// dynamic library path -const ( - ixdcgmLib = "libixdcgm.so" -) - -func initIxDcgm(m int) (err error) { - lib := string2Char(ixdcgmLib) - defer freeCString(lib) - - ixdcgmLibHandler = C.dlopen(lib, C.RTLD_LAZY|C.RTLD_GLOBAL) - if ixdcgmLibHandler == nil { - return fmt.Errorf("failed to load %s", ixdcgmLib) - } - - connection, err = New(m) - if err != nil { - return err - } - return nil -} - +// Init starts IXDCGM based on the user-selected mode. +// IXDCGM can be started in 3 different modes: +// 1. Embedded: Start hostengine within this process +// 2. Standalone: Connect to an already running ix-hostengine at the specified address +// Connection address can be passed as command line args: -connect "IP:PORT/Socket" -socket "isSocket" +// 3.
StartHostengine: Open a Unix socket to start and connect to the ix-hostengine, and terminate it before exiting func Init(m int, args ...string) (cleanup func(), err error) { mux.Lock() defer mux.Unlock() @@ -90,26 +63,6 @@ func Init(m int, args ...string) (cleanup func(), err error) { return cleanup, err } -func shutdown() (err error) { - mux.Lock() - defer mux.Unlock() - - if ixdcgmInitCounter <= 0 { - return fmt.Errorf("ixdcgm already shutdown") - } - - if ixdcgmInitCounter == 1 { - err = connection.Shutdown() - if err != nil { - return err - } - } - - C.dlclose(ixdcgmLibHandler) - ixdcgmInitCounter -= 1 - return nil -} - func GetAllDeviceCount() (uint, error) { return getAllDeviceCount() } diff --git a/pkg/ixdcgm/embedded.go b/pkg/ixdcgm/embedded.go index 09c2a29139ca997feaaaf1ca87797491d8c769c3..50f59ea873ff57ed0e5f4c09661111fea13a35e2 100644 --- a/pkg/ixdcgm/embedded.go +++ b/pkg/ixdcgm/embedded.go @@ -19,9 +19,6 @@ package ixdcgm /* #cgo linux LDFLAGS: -ldl -Wl,--export-dynamic -Wl,--unresolved-symbols=ignore-in-object-files - -#include -#include #include "include/dcgm_agent.h" #include "include/dcgm_structs.h" */ @@ -29,7 +26,6 @@ import "C" import "fmt" type embedded struct { - // TODO: implement embeded mode.
} func (e *embedded) Shutdown() error { diff --git a/pkg/ixdcgm/standalone.go b/pkg/ixdcgm/standalone.go index 486b1569f53be0d67d788f0b964b3abebc7dd7bc..9d229cdfd34cc1005d9eadb0e8dfcf3f373983e5 100644 --- a/pkg/ixdcgm/standalone.go +++ b/pkg/ixdcgm/standalone.go @@ -19,9 +19,6 @@ package ixdcgm /* #cgo linux LDFLAGS: -ldl -Wl,--export-dynamic -Wl,--unresolved-symbols=ignore-in-object-files - -#include -#include #include "include/dcgm_agent.h" #include "include/dcgm_structs.h" */ diff --git a/pkg/ixdcgm/startHostengine.go b/pkg/ixdcgm/startHostengine.go index 0a46990c30e413b4f401156ce3f4fc96b1af941f..5ef5c9f33e0e7198bad0e0d069b69af1601099b2 100644 --- a/pkg/ixdcgm/startHostengine.go +++ b/pkg/ixdcgm/startHostengine.go @@ -17,17 +17,132 @@ limitations under the License. package ixdcgm -import "fmt" +/* +#cgo linux LDFLAGS: -ldl -Wl,--export-dynamic -Wl,--unresolved-symbols=ignore-in-object-files +#include "include/dcgm_agent.h" +#include "include/dcgm_structs.h" +*/ +import "C" +import ( + "fmt" + "os" + "os/exec" + "syscall" + "unsafe" +) + +var ( + // hostengineAsChildPid is the PID returned by syscall.ForkExec in Start; Shutdown sends SIGKILL to it. + hostengineAsChildPid int + // startHostengineDir is the directory in which Start creates its temporary socket path. + startHostengineDir = "/tmp" +) +// startHostengine is the StartHostengine-mode backend: Start launches ix-hostengine as a child process and connects to it over a Unix socket. type startHostengine struct { - // TODO: implement embeded mode.
} +// Shutdown disconnects from ix-hostengine, runs `ix-hostengine --term` with +// ixdcgmBinDir and ixdcgmLibDir appended to PATH and LD_LIBRARY_PATH, and then +// sends SIGKILL to the child PID recorded by Start. The first error is returned. func (s *startHostengine) Shutdown() (err error) { - return nil + if err = s.disconnect(); err != nil { + return + } + + // terminate ix-hostengine + cmd := exec.Command("ix-hostengine", "--term") + cmd.Env = append(os.Environ(), + "PATH="+os.Getenv("PATH")+":"+ixdcgmBinDir, + "LD_LIBRARY_PATH="+os.Getenv("LD_LIBRARY_PATH")+":"+ixdcgmLibDir, + ) + if err = cmd.Run(); err != nil { + return fmt.Errorf("Error terminating ix-hostengine: %s", err) + } + fmt.Println("Successfully terminated ix-hostengine.") + + return syscall.Kill(hostengineAsChildPid, syscall.SIGKILL) +} + +// disconnect calls dcgmDisconnect on the package-level handle and then dcgmShutdown. +func (s *startHostengine) disconnect() (err error) { + result := C.dcgmDisconnect(handle.handle) + if err = errorString(result); err != nil { + return fmt.Errorf("Error disconnecting from ix-hostengine: %s", err) + } + + result = C.dcgmShutdown() + if err = errorString(result); err != nil { + return fmt.Errorf("Error shutting down IXDCGM: %s", err) + } + return } +// Start appends ixdcgmBinDir to this process's PATH, fork-execs ix-hostengine +// with --domain-socket pointing at a temporary path under startHostengineDir, +// then initializes IXDCGM and connects to that socket. args is not used. The +// temporary path is removed when Start returns. func (s startHostengine) Start(args ...string) (DcgmHandle, error) { fmt.Println("Start ixdcgm based on StartHostengine mode.") - return DcgmHandle{}, nil + + // Append to the existing PATH (the variable name is "PATH", without '=') so the original search path is kept. + os.Setenv("PATH", os.Getenv("PATH")+":"+ixdcgmBinDir) + bin, err := exec.LookPath("ix-hostengine") + if err != nil { + return DcgmHandle{}, fmt.Errorf("Error finding ix-hostengine: %s", err) + } + var procAttr syscall.ProcAttr + procAttr.Files = []uintptr{ + uintptr(syscall.Stdin), + uintptr(syscall.Stdout), + uintptr(syscall.Stderr)} + procAttr.Sys = &syscall.SysProcAttr{Setpgid: true} + procAttr.Env = []string{"LD_LIBRARY_PATH=" + os.Getenv("LD_LIBRARY_PATH") + ":" + ixdcgmLibDir} + + dir := startHostengineDir + socketFile, err := os.CreateTemp(dir, "ixdcgm") + if err != nil { + return DcgmHandle{}, fmt.Errorf("Error creating socket file in %s directory: %s", dir, err) + } + + socketPath := socketFile.Name() + _ = socketFile.Close() // only the path is needed; release the descriptor instead of leaking it + defer os.Remove(socketPath) + connectArg := "--domain-socket" + hostengineAsChildPid, err = syscall.ForkExec(bin, []string{bin, connectArg, socketPath}, &procAttr) + if err
!= nil { + return DcgmHandle{}, fmt.Errorf("Error fork-execing ix-hostengine: %s", err) + } + result := C.dcgmInit() + if err = errorString(result); err != nil { + return DcgmHandle{}, fmt.Errorf("Error initializing IXDCGM: %s", err) + } + + var cHandle C.dcgmHandle_t + var connectParams C.dcgmConnectV2Params_v2 + connectParams.version = makeVersion2(unsafe.Sizeof(connectParams)) + connectParams.addressIsUnixSocket = C.uint(1) + cSockPath := C.CString(socketPath) + defer freeCString(cSockPath) + + result = C.dcgmConnect_v2(cSockPath, &connectParams, &cHandle) + if err = errorString(result); err != nil { + return DcgmHandle{}, fmt.Errorf("Error connecting to ix-hostengine: %s", err) + } + + return DcgmHandle{handle: cHandle}, nil +} + +// SetStartHostengineDir sets the directory in which Start creates its socket +// path. dir is converted to an absolute path first. +func SetStartHostengineDir(dir string) error { + uptDirMu.Lock() + defer uptDirMu.Unlock() + + path, err := parseDirPath(dir) + if err != nil { + return err + } + startHostengineDir = path + return nil } diff --git a/pkg/ixdcgm/utils.go b/pkg/ixdcgm/utils.go index eefe3e8ef8e69644a17a30e5df3242ba818fe5c6..5f54ec2733ff62fdfedcd1e2f003a303e33ba86c 100644 --- a/pkg/ixdcgm/utils.go +++ b/pkg/ixdcgm/utils.go @@ -30,6 +30,7 @@ package ixdcgm import "C" import ( "fmt" + "path/filepath" "strconv" "strings" "unsafe" @@ -174,3 +175,12 @@ func convertBitsetStr(input string) (output string) { output = strings.Join(result, ",") return } + +// parseDirPath returns path converted to an absolute path with filepath.Abs. +func parseDirPath(path string) (string, error) { + absPath, err := filepath.Abs(path) + if err != nil { + return "", fmt.Errorf("Error to parse dir path %s, err: %v", path, err) + } + return absPath, nil +} diff --git a/samples/devicecommon/main.go b/samples/devicecommon/main.go index f4c9651a3f543e9b96ebfef5e95c9bce2738ceb5..32b0b7967b4695ff0caff0d56fc82b0b1f72e9ad 100644 --- a/samples/devicecommon/main.go +++ b/samples/devicecommon/main.go @@ -25,8 +25,8 @@ import ( ) var ( - connectAddr = flag.String("connectAddr", "0.0.0.0:5777", "DCGM connect address") - isSocket = flag.String("socket", "0", "Connect to
Unix socket") + connectAddr = flag.String("connect", "localhost:5777", "Provide ix-hostengine connection address.") + isSocket = flag.String("socket", "0", "Connecting to Unix socket") ) func main() {