From e3916e5a4fd73eda6f80d5ebabd100805229012e Mon Sep 17 00:00:00 2001 From: "maofeng.huang" Date: Tue, 11 Feb 2025 15:59:42 +0800 Subject: [PATCH] support more apis and update device status --- README.md | 4 +- pkg/ixdcgm/api.go | 12 ++ pkg/ixdcgm/const.go | 143 +++++++++++++++ pkg/ixdcgm/device_info.go | 28 +-- pkg/ixdcgm/device_status.go | 206 +++++++++++++++------- pkg/ixdcgm/embedded.go | 49 +++++- pkg/ixdcgm/fields.go | 75 +++++--- pkg/ixdcgm/include/ixdcgmApiExport.h | 248 ++++++++++++++++++--------- pkg/ixdcgm/include/ixdcgmFields.h | 4 +- pkg/ixdcgm/include/ixdcgmStructs.h | 235 ++++++++++++++++++++++++- pkg/ixdcgm/process_info.go | 85 +++++++++ pkg/ixdcgm/utils.go | 12 ++ samples/deviceinfo/main.go | 2 +- samples/deviceprocessinfo/main.go | 57 ++++++ samples/devicestatus/main.go | 58 ++++--- 15 files changed, 998 insertions(+), 220 deletions(-) create mode 100644 pkg/ixdcgm/const.go create mode 100644 pkg/ixdcgm/process_info.go create mode 100644 samples/deviceprocessinfo/main.go diff --git a/README.md b/README.md index 7b0c28c..305d76f 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ ## Introduction -ixDCGM is a tool provided for monitoring and managing **IluvatarCorex GPUs**, offering a rich set of APIs to retrieve information about GPU status, performance, power consumption, and more. Go-IXDCGM is a wrapper library for IXDCGM written in Go language, providing a simple set of functions that facilitate the easy invocation of IXDCGM's APIs. +IXDCGM is a tool provided for monitoring and managing IX GPUs, offering a rich set of APIs to retrieve information about GPU status, performance, power consumption, and more. Go-IXDCGM is a wrapper library for IXDCGM written in Go language, providing a simple set of functions that facilitate the easy invocation of IXDCGM's APIs. 
## Install @@ -26,7 +26,7 @@ import ( "log" "os" - "iluvatar.com/go-dcgm/pkg/ixdcgm" + "gitee.com/deep-spark/go-ixdcgm/pkg/ixdcgm" ) const ( diff --git a/pkg/ixdcgm/api.go b/pkg/ixdcgm/api.go index 1b758d7..86d09e2 100644 --- a/pkg/ixdcgm/api.go +++ b/pkg/ixdcgm/api.go @@ -30,6 +30,8 @@ import ( "fmt" "sync" "unsafe" + + _ "gitee.com/deep-spark/go-ixdcgm/pkg/ixdcgm/include" ) var ( @@ -125,6 +127,16 @@ func GetDeviceStatus(gpuId uint) (DeviceStatus, error) { return getDeviceStatus(gpuId) } +// GetDeviceProfStatus monitors GPM info including SM_ACTIVE, SM_OCCUPANCY and DRAM_ACTIVE +func GetDeviceProfStatus(gpuId uint) (DeviceProfStatus, error) { + return getDeviceProfStatus(gpuId) +} + +// GetDeviceRunningProcess get the running process infos for the given gpu id +func GetDeviceRunningProcesses(gpuId uint) ([]DeviceProcessInfo, error) { + return getDeviceRunningProcesses(gpuId) +} + func GetDeviceOnSameBoard(gpuId1, gpuId2 uint) (bool, error) { return getDeviceOnSameBoard(gpuId1, gpuId2) } diff --git a/pkg/ixdcgm/const.go b/pkg/ixdcgm/const.go new file mode 100644 index 0000000..a6eea2e --- /dev/null +++ b/pkg/ixdcgm/const.go @@ -0,0 +1,143 @@ +/* +Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); you may +not use this file except in compliance with the License. You may obtain +a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package ixdcgm + +/* +#include "include/dcgm_agent.h" +#include "include/dcgm_structs.h" +*/ +import "C" + +const ( + DCGM_FI_UNKNOWN Short = C.DCGM_FI_UNKNOWN + DCGM_FI_DRIVER_VERSION Short = C.DCGM_FI_DRIVER_VERSION + DCGM_FI_NVML_VERSION Short = C.DCGM_FI_NVML_VERSION + DCGM_FI_PROCESS_NAME Short = C.DCGM_FI_PROCESS_NAME + DCGM_FI_DEV_COUNT Short = C.DCGM_FI_DEV_COUNT + DCGM_FI_CUDA_DRIVER_VERSION Short = C.DCGM_FI_CUDA_DRIVER_VERSION + DCGM_FI_DEV_NAME Short = C.DCGM_FI_DEV_NAME + DCGM_FI_DEV_BRAND Short = C.DCGM_FI_DEV_BRAND + DCGM_FI_DEV_NVML_INDEX Short = C.DCGM_FI_DEV_NVML_INDEX + DCGM_FI_DEV_SERIAL Short = C.DCGM_FI_DEV_SERIAL + DCGM_FI_DEV_UUID Short = C.DCGM_FI_DEV_UUID + DCGM_FI_DEV_MINOR_NUMBER Short = C.DCGM_FI_DEV_MINOR_NUMBER + DCGM_FI_DEV_OEM_INFOROM_VER Short = C.DCGM_FI_DEV_OEM_INFOROM_VER + DCGM_FI_DEV_PCI_BUSID Short = C.DCGM_FI_DEV_PCI_BUSID + DCGM_FI_DEV_PCI_COMBINED_ID Short = C.DCGM_FI_DEV_PCI_COMBINED_ID + DCGM_FI_DEV_PCI_SUBSYS_ID Short = C.DCGM_FI_DEV_PCI_SUBSYS_ID + DCGM_FI_GPU_TOPOLOGY_PCI Short = C.DCGM_FI_GPU_TOPOLOGY_PCI + DCGM_FI_GPU_TOPOLOGY_NVLINK Short = C.DCGM_FI_GPU_TOPOLOGY_NVLINK + DCGM_FI_GPU_TOPOLOGY_AFFINITY Short = C.DCGM_FI_GPU_TOPOLOGY_AFFINITY + DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY Short = C.DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY + DCGM_FI_DEV_COMPUTE_MODE Short = C.DCGM_FI_DEV_COMPUTE_MODE + DCGM_FI_DEV_PERSISTENCE_MODE Short = C.DCGM_FI_DEV_PERSISTENCE_MODE + DCGM_FI_DEV_MIG_MODE Short = C.DCGM_FI_DEV_MIG_MODE + DCGM_FI_DEV_CUDA_VISIBLE_DEVICES_STR Short = C.DCGM_FI_DEV_CUDA_VISIBLE_DEVICES_STR + DCGM_FI_DEV_MIG_MAX_SLICES Short = C.DCGM_FI_DEV_MIG_MAX_SLICES + DCGM_FI_DEV_CPU_AFFINITY_0 Short = C.DCGM_FI_DEV_CPU_AFFINITY_0 + DCGM_FI_DEV_CPU_AFFINITY_1 Short = C.DCGM_FI_DEV_CPU_AFFINITY_1 + DCGM_FI_DEV_CPU_AFFINITY_2 Short = C.DCGM_FI_DEV_CPU_AFFINITY_2 + DCGM_FI_DEV_CPU_AFFINITY_3 Short = C.DCGM_FI_DEV_CPU_AFFINITY_3 + DCGM_FI_DEV_CC_MODE Short = C.DCGM_FI_DEV_CC_MODE + DCGM_FI_DEV_MIG_ATTRIBUTES Short = 
C.DCGM_FI_DEV_MIG_ATTRIBUTES + DCGM_FI_DEV_MIG_GI_INFO Short = C.DCGM_FI_DEV_MIG_GI_INFO + DCGM_FI_DEV_MIG_CI_INFO Short = C.DCGM_FI_DEV_MIG_CI_INFO + DCGM_FI_DEV_ECC_INFOROM_VER Short = C.DCGM_FI_DEV_ECC_INFOROM_VER + DCGM_FI_DEV_POWER_INFOROM_VER Short = C.DCGM_FI_DEV_POWER_INFOROM_VER + DCGM_FI_DEV_INFOROM_IMAGE_VER Short = C.DCGM_FI_DEV_INFOROM_IMAGE_VER + DCGM_FI_DEV_INFOROM_CONFIG_CHECK Short = C.DCGM_FI_DEV_INFOROM_CONFIG_CHECK + DCGM_FI_DEV_INFOROM_CONFIG_VALID Short = C.DCGM_FI_DEV_INFOROM_CONFIG_VALID + DCGM_FI_DEV_VBIOS_VERSION Short = C.DCGM_FI_DEV_VBIOS_VERSION + DCGM_FI_DEV_MEM_AFFINITY_0 Short = C.DCGM_FI_DEV_MEM_AFFINITY_0 + DCGM_FI_DEV_MEM_AFFINITY_1 Short = C.DCGM_FI_DEV_MEM_AFFINITY_1 + DCGM_FI_DEV_MEM_AFFINITY_2 Short = C.DCGM_FI_DEV_MEM_AFFINITY_2 + DCGM_FI_DEV_MEM_AFFINITY_3 Short = C.DCGM_FI_DEV_MEM_AFFINITY_3 + DCGM_FI_DEV_BAR1_TOTAL Short = C.DCGM_FI_DEV_BAR1_TOTAL + DCGM_FI_SYNC_BOOST Short = C.DCGM_FI_SYNC_BOOST + DCGM_FI_DEV_BAR1_USED Short = C.DCGM_FI_DEV_BAR1_USED + DCGM_FI_DEV_BAR1_FREE Short = C.DCGM_FI_DEV_BAR1_FREE + DCGM_FI_DEV_SM_CLOCK Short = C.DCGM_FI_DEV_SM_CLOCK + DCGM_FI_DEV_MEM_CLOCK Short = C.DCGM_FI_DEV_MEM_CLOCK + DCGM_FI_DEV_VIDEO_CLOCK Short = C.DCGM_FI_DEV_VIDEO_CLOCK + DCGM_FI_DEV_APP_SM_CLOCK Short = C.DCGM_FI_DEV_APP_SM_CLOCK + DCGM_FI_DEV_APP_MEM_CLOCK Short = C.DCGM_FI_DEV_APP_MEM_CLOCK + DCGM_FI_DEV_CLOCK_THROTTLE_REASONS Short = C.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS + DCGM_FI_DEV_MAX_SM_CLOCK Short = C.DCGM_FI_DEV_MAX_SM_CLOCK + DCGM_FI_DEV_MAX_MEM_CLOCK Short = C.DCGM_FI_DEV_MAX_MEM_CLOCK + DCGM_FI_DEV_MAX_VIDEO_CLOCK Short = C.DCGM_FI_DEV_MAX_VIDEO_CLOCK + DCGM_FI_DEV_AUTOBOOST Short = C.DCGM_FI_DEV_AUTOBOOST + DCGM_FI_DEV_SUPPORTED_CLOCKS Short = C.DCGM_FI_DEV_SUPPORTED_CLOCKS + DCGM_FI_DEV_MEMORY_TEMP Short = C.DCGM_FI_DEV_MEMORY_TEMP + DCGM_FI_DEV_GPU_TEMP Short = C.DCGM_FI_DEV_GPU_TEMP + DCGM_FI_DEV_MEM_MAX_OP_TEMP Short = C.DCGM_FI_DEV_MEM_MAX_OP_TEMP + DCGM_FI_DEV_GPU_MAX_OP_TEMP Short = 
C.DCGM_FI_DEV_GPU_MAX_OP_TEMP + DCGM_FI_DEV_POWER_USAGE Short = C.DCGM_FI_DEV_POWER_USAGE + DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION Short = C.DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION + DCGM_FI_DEV_POWER_USAGE_INSTANT Short = C.DCGM_FI_DEV_POWER_USAGE_INSTANT + DCGM_FI_DEV_SLOWDOWN_TEMP Short = C.DCGM_FI_DEV_SLOWDOWN_TEMP + DCGM_FI_DEV_SHUTDOWN_TEMP Short = C.DCGM_FI_DEV_SHUTDOWN_TEMP + DCGM_FI_DEV_POWER_MGMT_LIMIT Short = C.DCGM_FI_DEV_POWER_MGMT_LIMIT + DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN Short = C.DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN + DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX Short = C.DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX + DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF Short = C.DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF + DCGM_FI_DEV_ENFORCED_POWER_LIMIT Short = C.DCGM_FI_DEV_ENFORCED_POWER_LIMIT + DCGM_FI_DEV_PSTATE Short = C.DCGM_FI_DEV_PSTATE + DCGM_FI_DEV_FAN_SPEED Short = C.DCGM_FI_DEV_FAN_SPEED + DCGM_FI_DEV_PCIE_TX_THROUGHPUT Short = C.DCGM_FI_DEV_PCIE_TX_THROUGHPUT + DCGM_FI_DEV_PCIE_RX_THROUGHPUT Short = C.DCGM_FI_DEV_PCIE_RX_THROUGHPUT + DCGM_FI_DEV_PCIE_REPLAY_COUNTER Short = C.DCGM_FI_DEV_PCIE_REPLAY_COUNTER + DCGM_FI_DEV_GPU_UTIL Short = C.DCGM_FI_DEV_GPU_UTIL + DCGM_FI_DEV_MEM_COPY_UTIL Short = C.DCGM_FI_DEV_MEM_COPY_UTIL + DCGM_FI_DEV_ACCOUNTING_DATA Short = C.DCGM_FI_DEV_ACCOUNTING_DATA + DCGM_FI_DEV_ENC_UTIL Short = C.DCGM_FI_DEV_ENC_UTIL + DCGM_FI_DEV_DEC_UTIL Short = C.DCGM_FI_DEV_DEC_UTIL + DCGM_FI_DEV_XID_ERRORS Short = C.DCGM_FI_DEV_XID_ERRORS + DCGM_FI_DEV_PCIE_MAX_LINK_GEN Short = C.DCGM_FI_DEV_PCIE_MAX_LINK_GEN + DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH Short = C.DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH + DCGM_FI_DEV_PCIE_LINK_GEN Short = C.DCGM_FI_DEV_PCIE_LINK_GEN + DCGM_FI_DEV_PCIE_LINK_WIDTH Short = C.DCGM_FI_DEV_PCIE_LINK_WIDTH + DCGM_FI_DEV_POWER_VIOLATION Short = C.DCGM_FI_DEV_POWER_VIOLATION + DCGM_FI_DEV_THERMAL_VIOLATION Short = C.DCGM_FI_DEV_THERMAL_VIOLATION + DCGM_FI_DEV_SYNC_BOOST_VIOLATION Short = C.DCGM_FI_DEV_SYNC_BOOST_VIOLATION + DCGM_FI_DEV_BOARD_LIMIT_VIOLATION Short = 
C.DCGM_FI_DEV_BOARD_LIMIT_VIOLATION + DCGM_FI_DEV_LOW_UTIL_VIOLATION Short = C.DCGM_FI_DEV_LOW_UTIL_VIOLATION + DCGM_FI_DEV_RELIABILITY_VIOLATION Short = C.DCGM_FI_DEV_RELIABILITY_VIOLATION + DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION Short = C.DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION + DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION Short = C.DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION + DCGM_FI_DEV_FB_TOTAL Short = C.DCGM_FI_DEV_FB_TOTAL + DCGM_FI_DEV_FB_FREE Short = C.DCGM_FI_DEV_FB_FREE + DCGM_FI_DEV_FB_USED Short = C.DCGM_FI_DEV_FB_USED + DCGM_FI_DEV_FB_RESERVED Short = C.DCGM_FI_DEV_FB_RESERVED + DCGM_FI_DEV_FB_USED_PERCENT Short = C.DCGM_FI_DEV_FB_USED_PERCENT + DCGM_FI_DEV_ECC_CURRENT Short = C.DCGM_FI_DEV_ECC_CURRENT + DCGM_FI_DEV_ECC_PENDING Short = C.DCGM_FI_DEV_ECC_PENDING + DCGM_FI_DEV_ECC_SBE_VOL_DEV Short = C.DCGM_FI_DEV_ECC_SBE_VOL_DEV + DCGM_FI_DEV_ECC_DBE_VOL_DEV Short = C.DCGM_FI_DEV_ECC_DBE_VOL_DEV + DCGM_FI_INTERNAL_FIELDS_0_START Short = C.DCGM_FI_INTERNAL_FIELDS_0_START + DCGM_FI_INTERNAL_FIELDS_0_END Short = C.DCGM_FI_INTERNAL_FIELDS_0_END + + DCGM_FI_PROF_GR_ENGINE_ACTIVE Short = C.DCGM_FI_PROF_GR_ENGINE_ACTIVE + DCGM_FI_PROF_SM_ACTIVE Short = C.DCGM_FI_PROF_SM_ACTIVE + DCGM_FI_PROF_SM_OCCUPANCY Short = C.DCGM_FI_PROF_SM_OCCUPANCY + DCGM_FI_PROF_PIPE_TENSOR_ACTIVE Short = C.DCGM_FI_PROF_PIPE_TENSOR_ACTIVE + DCGM_FI_PROF_DRAM_ACTIVE Short = C.DCGM_FI_PROF_DRAM_ACTIVE + DCGM_FI_PROF_PCIE_TX_BYTES Short = C.DCGM_FI_PROF_PCIE_TX_BYTES + DCGM_FI_PROF_PCIE_RX_BYTES Short = C.DCGM_FI_PROF_PCIE_RX_BYTES + + DCGM_FI_MAX_FIELDS Short = C.DCGM_FI_MAX_FIELDS +) diff --git a/pkg/ixdcgm/device_info.go b/pkg/ixdcgm/device_info.go index 33c3105..4bde0ac 100644 --- a/pkg/ixdcgm/device_info.go +++ b/pkg/ixdcgm/device_info.go @@ -54,7 +54,7 @@ type DeviceInfo struct { GPUId uint IxDCGMSupported string Uuid string - Power uint + PowerLimit uint PCI PciInfo MemoryUsage MemoryUsageInfo Identifiers DeviceIdentifier @@ -92,7 +92,7 @@ func getPciBandwidth(gpuId uint) (int64, error) 
{ } groupName := fmt.Sprintf("pciBandwidth%d", gpuId) - groupId, err := WatchFields(gpuId, fieldsId, groupName) + groupId, err := WatchFields([]uint{gpuId}, fieldsId, groupName) if err != nil { FieldGroupDestroy(fieldsId) return 0, err @@ -158,7 +158,7 @@ func getDeviceInfo(gpuId uint) (DeviceInfo, error) { } uuid := cChar2String(&dcgmAttr.identifiers.uuid[0]) - power := uint(dcgmAttr.powerLimits.defaultPowerLimit) + powerLimit := uint(dcgmAttr.powerLimits.defaultPowerLimit) busId := cChar2String(&dcgmAttr.identifiers.pciBusId[0]) pci := PciInfo{ @@ -182,7 +182,7 @@ func getDeviceInfo(gpuId uint) (DeviceInfo, error) { GPUId: gpuId, IxDCGMSupported: supported, Uuid: uuid, - Power: power, + PowerLimit: powerLimit, PCI: pci, MemoryUsage: memInfo, Identifiers: id, @@ -216,24 +216,24 @@ func getCPUAffinity(gpuId uint) (string, error) { ) affFields := make([]Short, fieldsCount) - affFields[affinity0] = C.DCGM_FI_DEV_CPU_AFFINITY_0 - affFields[affinity1] = C.DCGM_FI_DEV_CPU_AFFINITY_1 - affFields[affinity2] = C.DCGM_FI_DEV_CPU_AFFINITY_2 - affFields[affinity3] = C.DCGM_FI_DEV_CPU_AFFINITY_3 + affFields[affinity0] = DCGM_FI_DEV_CPU_AFFINITY_0 + affFields[affinity1] = DCGM_FI_DEV_CPU_AFFINITY_1 + affFields[affinity2] = DCGM_FI_DEV_CPU_AFFINITY_2 + affFields[affinity3] = DCGM_FI_DEV_CPU_AFFINITY_3 - fieldsName := fmt.Sprintf("cpuAffFields%d", gpuId) - fieldId, err := FieldGroupCreate(fieldsName, affFields) + fieldGrpName := fmt.Sprintf("cpuAffFields%d", gpuId) + fieldGrpHdl, err := FieldGroupCreate(fieldGrpName, affFields) if err != nil { return "N/A", err } - defer FieldGroupDestroy(fieldId) + defer FieldGroupDestroy(fieldGrpHdl) - gpoupName := fmt.Sprintf("cpuAff%d", gpuId) - groupId, err := WatchFields(gpuId, fieldId, gpoupName) + gpuGrpName := fmt.Sprintf("cpuAff%d", gpuId) + gpuGrpHdl, err := WatchFields([]uint{gpuId}, fieldGrpHdl, gpuGrpName) if err != nil { return "N/A", err } - defer DestroyGroup(groupId) + defer DestroyGroup(gpuGrpHdl) values, err := 
GetLatestValuesForFields(gpuId, affFields) if err != nil { diff --git a/pkg/ixdcgm/device_status.go b/pkg/ixdcgm/device_status.go index 9f1054a..f0aac21 100644 --- a/pkg/ixdcgm/device_status.go +++ b/pkg/ixdcgm/device_status.go @@ -25,14 +25,15 @@ import "C" import ( "fmt" "math/rand" + "time" ) type PerfState uint const ( - PerfStateMax = 0 - PerfStateMin = 15 - PerfStateUnknown = 32 + PerfStateMax PerfState = 0 + PerfStateMin PerfState = 15 + PerfStateUnknown PerfState = 32 ) func (p PerfState) String() string { @@ -43,105 +44,188 @@ func (p PerfState) String() string { } type UtilizationInfo struct { - GPU int64 // % - Memory int64 // % + Gpu int64 // % + Mem int64 // % } type ClockInfo struct { - Cores int64 // MHz - Memory int64 // MHz + Sm int64 // MHz + Mem int64 // MHz } type PCIStatusInfo struct { - Rx int64 // MB/s - Tx int64 // MB/s - Replays int64 + Rx int64 // KB/s + Tx int64 // KB/s + ReplayCounter int64 // Counter +} + +type MemoryUsage struct { + Total int64 // Total Memory (Frame Buffer) of the GPU in MB + Used int64 // Used Memory (Frame Buffer) in MB + Free int64 // Free Memory (Frame Buffer) in MB } type DeviceStatus struct { - Power float64 // W - Temperature int64 // °C + Id uint + Power string // "N/A" or float64 str, W + Temperature string // "N/A" or int64 str, °C + Utilization UtilizationInfo Clocks ClockInfo PCI PCIStatusInfo Performance PerfState - FanSpeed int64 // % + MemUsage MemoryUsage + + FanSpeed string // "N/A" or int64 str, % + EccSbeVolDev string // "N/A" or int64 str, 1 for errors occurred, 0 for no errors + EccDbeVolDev string // "N/A" or int64 str, 1 for errors occurred, 0 for no errors +} + +type DeviceProfStatus struct { + SmActive string // "N/A" or float64 str, % + SmOccupancy string // "N/A" or float64 str, % + DramActive string // "N/A" or float64 str, % } func getDeviceStatus(gpuId uint) (status DeviceStatus, err error) { const ( - pwr int = iota - temp - sm - mem - smClock - memClock - pcieRxThroughput - 
pcieTxThroughput - pcieReplay - fanSpeed + IdxPower int = iota + IdxGpuTemp + IdxGpuUtil + IdxMemUtil + IdxSmClock + IdxMemClock + IdxPcieRxThroughput + IdxPcieTxThroughput + IdxPcieReplayCounter + IdxFanSpeed + IdxEccSbeVolDev + IdxEccDbeVolDev + IdxMemTotal + IdxMemUsed + IdxMemFree ) - deviceFields := []Short{ - C.DCGM_FI_DEV_POWER_USAGE, - C.DCGM_FI_DEV_GPU_TEMP, - C.DCGM_FI_DEV_GPU_UTIL, - C.DCGM_FI_DEV_MEM_COPY_UTIL, - C.DCGM_FI_DEV_SM_CLOCK, - C.DCGM_FI_DEV_MEM_CLOCK, - C.DCGM_FI_DEV_PCIE_RX_THROUGHPUT, - C.DCGM_FI_DEV_PCIE_TX_THROUGHPUT, - C.DCGM_FI_DEV_PCIE_REPLAY_COUNTER, - C.DCGM_FI_DEV_FAN_SPEED, + fields := []Short{ + DCGM_FI_DEV_POWER_USAGE, + DCGM_FI_DEV_GPU_TEMP, + DCGM_FI_DEV_GPU_UTIL, + DCGM_FI_DEV_MEM_COPY_UTIL, + DCGM_FI_DEV_SM_CLOCK, + DCGM_FI_DEV_MEM_CLOCK, + DCGM_FI_DEV_PCIE_RX_THROUGHPUT, + DCGM_FI_DEV_PCIE_TX_THROUGHPUT, + DCGM_FI_DEV_PCIE_REPLAY_COUNTER, + DCGM_FI_DEV_FAN_SPEED, + DCGM_FI_DEV_ECC_SBE_VOL_DEV, + DCGM_FI_DEV_ECC_DBE_VOL_DEV, + DCGM_FI_DEV_FB_TOTAL, + DCGM_FI_DEV_FB_USED, + DCGM_FI_DEV_FB_FREE, } - fieldsName := fmt.Sprintf("devStatusFields%d", rand.Uint64()) - fieldsId, err := FieldGroupCreate(fieldsName, deviceFields) + fieldGrpName := fmt.Sprintf("devStatusFields%d", rand.Uint64()) + fieldGrp, err := FieldGroupCreate(fieldGrpName, fields) if err != nil { return } - groupName := fmt.Sprintf("devStatus%d", rand.Uint64()) - groupId, err := WatchFields(gpuId, fieldsId, groupName) + gpuGrpName := fmt.Sprintf("devStatusGrp%d", rand.Uint64()) + gpuGrpHdl, err := WatchFields([]uint{gpuId}, fieldGrp, gpuGrpName) if err != nil { - _ = FieldGroupDestroy(fieldsId) + _ = FieldGroupDestroy(fieldGrp) return } - values, err := GetLatestValuesForFields(gpuId, deviceFields) + values, err := GetLatestValuesForFields(gpuId, fields) if err != nil { - _ = FieldGroupDestroy(fieldsId) - _ = DestroyGroup(groupId) + _ = FieldGroupDestroy(fieldGrp) + _ = DestroyGroup(gpuGrpHdl) return status, err } - power := values[pwr].Float64() - clocks := 
ClockInfo{ - Cores: values[smClock].Int64(), - Memory: values[memClock].Int64(), + Sm: values[IdxSmClock].Int64(), + Mem: values[IdxMemClock].Int64(), + } + + utilInfo := UtilizationInfo{ + Gpu: values[IdxGpuUtil].Int64(), + Mem: values[IdxMemUtil].Int64(), } - gpuUtil := UtilizationInfo{ - GPU: values[sm].Int64(), - Memory: values[mem].Int64(), + pciInfo := PCIStatusInfo{ + Rx: values[IdxPcieRxThroughput].Int64(), + Tx: values[IdxPcieTxThroughput].Int64(), + ReplayCounter: values[IdxPcieReplayCounter].Int64(), } - pci := PCIStatusInfo{ - Rx: values[pcieRxThroughput].Int64(), - Tx: values[pcieTxThroughput].Int64(), - Replays: values[pcieReplay].Int64(), + memUsage := MemoryUsage{ + Total: values[IdxMemTotal].Int64(), + Free: values[IdxMemFree].Int64(), + Used: values[IdxMemUsed].Int64(), } + status = DeviceStatus{ - Power: power, - Temperature: values[temp].Int64(), - Utilization: gpuUtil, - Clocks: clocks, - PCI: pci, - FanSpeed: values[fanSpeed].Int64(), + Id: gpuId, + Power: GetFieldValueStr(values[IdxPower], "float64"), + Temperature: GetFieldValueStr(values[IdxGpuTemp], "int64"), + Utilization: utilInfo, + Clocks: clocks, + PCI: pciInfo, + MemUsage: memUsage, + FanSpeed: GetFieldValueStr(values[IdxFanSpeed], "int64"), + EccSbeVolDev: GetFieldValueStr(values[IdxEccSbeVolDev], "int64"), + EccDbeVolDev: GetFieldValueStr(values[IdxEccDbeVolDev], "int64"), } - _ = FieldGroupDestroy(fieldsId) - _ = DestroyGroup(groupId) + _ = FieldGroupDestroy(fieldGrp) + _ = DestroyGroup(gpuGrpHdl) return } + +func getDeviceProfStatus(gpuId uint) (status DeviceProfStatus, err error) { + const ( + IdxSmActive int = iota + IdxSmOccupancy + IdxDramActive + ) + + fields := []Short{ + DCGM_FI_PROF_SM_ACTIVE, + DCGM_FI_PROF_SM_OCCUPANCY, + DCGM_FI_PROF_DRAM_ACTIVE, + } + + fieldGrpName := fmt.Sprintf("devProfStatusFields%d", rand.Uint64()) + fieldGrp, err := FieldGroupCreate(fieldGrpName, fields) + if err != nil { + return + } + + grpName := fmt.Sprintf("devProfStatusGrp%d", 
rand.Uint64()) + grpId, err := WatchFields([]uint{gpuId}, fieldGrp, grpName) + if err != nil { + _ = FieldGroupDestroy(fieldGrp) + return + } + + time.Sleep(2000 * time.Millisecond) + values, err := GetLatestValuesForFields(gpuId, fields) + if err != nil { + _ = FieldGroupDestroy(fieldGrp) + _ = DestroyGroup(grpId) + return status, err + } + + status = DeviceProfStatus{ + SmActive: GetFieldValueStr(values[IdxSmActive], "float64"), + SmOccupancy: GetFieldValueStr(values[IdxSmOccupancy], "float64"), + DramActive: GetFieldValueStr(values[IdxDramActive], "float64"), + } + + _ = FieldGroupDestroy(fieldGrp) + _ = DestroyGroup(grpId) + return + +} diff --git a/pkg/ixdcgm/embedded.go b/pkg/ixdcgm/embedded.go index 72134e5..9829373 100644 --- a/pkg/ixdcgm/embedded.go +++ b/pkg/ixdcgm/embedded.go @@ -51,11 +51,56 @@ func (e *embedded) Start(args ...string) (DcgmHandle, error) { return DcgmHandle{}, fmt.Errorf("failed to initialize dcgm: %v", err) } - var cHandler C.dcgmHandle_t - result = C.dcgmStartEmbedded(C.DCGM_OPERATION_MODE_AUTO, &cHandler) + logLevel := C.DcgmLoggingSeverityNone + if len(args) > 0 { + logLevelStr := args[0] + switch logLevelStr { + case "LogNone": + logLevel = C.DcgmLoggingSeverityNone + case "LogFatal": + logLevel = C.DcgmLoggingSeverityFatal + case "LogError": + logLevel = C.DcgmLoggingSeverityError + case "LogWarn": + logLevel = C.DcgmLoggingSeverityWarning + case "LogInfo": + logLevel = C.DcgmLoggingSeverityInfo + case "LogDebug": + logLevel = C.DcgmLoggingSeverityDebug + case "LogVerb": + logLevel = C.DcgmLoggingSeverityVerbose + default: + errMsg := fmt.Sprintf("Invalid log level: %s", logLevelStr) + fmt.Println(errMsg) + fmt.Println("The following log levels are supported: LogNone, LogFatal, LogError, LogWarn, LogInfo, LogDebug, LogVerb.") + fmt.Println(" - LogNone : No logging") + fmt.Println(" - LogFatal : Fatal errors") + fmt.Println(" - LogError : Errors") + fmt.Println(" - LogWarn : Warnings") + fmt.Println(" - LogInfo : Informative, will 
generate medium logs") + fmt.Println(" - LogDebug : Debug infomation, will generate large logs") + fmt.Println(" - LogVerb : Verbose debugging information, will generate more large logs") + fmt.Println() + return DcgmHandle{}, fmt.Errorf("%v", errMsg) + } + } + + params := C.dcgmStartEmbeddedV2Params_v1{ + version: C.dcgmStartEmbeddedV2Params_version1, + opMode: C.dcgmOperationMode_t(C.DCGM_OPERATION_MODE_AUTO), + dcgmHandle: C.dcgmHandle_t(0), + logFile: nil, // use default log file + severity: C.DcgmLoggingSeverity_t(logLevel), + denyListCount: 0, // no deny list + denyList: [C.DcgmModuleIdCount]C.uint{0}, + } + + // Use dcgmStartEmbedded_v2 instead of dcgmStartEmbedded, which uses verbose logging + result = C.dcgmStartEmbedded_v2(&params) if err := errorString(result); err != nil { return DcgmHandle{}, fmt.Errorf("failed to start embedded dcgm: %v", err) } + var cHandler C.dcgmHandle_t = params.dcgmHandle return DcgmHandle{handle: cHandler}, nil } diff --git a/pkg/ixdcgm/fields.go b/pkg/ixdcgm/fields.go index 3f60a7d..dec9618 100644 --- a/pkg/ixdcgm/fields.go +++ b/pkg/ixdcgm/fields.go @@ -24,18 +24,23 @@ package ixdcgm import "C" import ( "fmt" + "os" "unsafe" ) const ( - defaultUpdateFreq = 30000000 // usec - defaultMaxKeepAge = 0 // sec - defaultMaxKeepSamples = 1 // Keep one sample by default since we only ask for latest + defaultUpdateFreq = 1000000 // usec + defaultMaxKeepAge = 0 // sec + defaultMaxKeepSamples = 1 // Keep one sample by default since we only ask for latest + + DCGM_INT32_BLANK = int32(2147483632) // 0x7ffffff0 + DCGM_INT64_BLANK = int64(9223372036854775792) // 0x7ffffffffffffff0 + DCGM_FP64_BLANK = float64(140737488355328.0) ) -type FieldHandle struct{ handle C.dcgmFieldGrp_t } +type FieldGrpHandle struct{ handle C.dcgmFieldGrp_t } -func FieldGroupCreate(groupName string, fields []Short) (fgId FieldGrpHandle, err error) { var fieldsGroup C.dcgmFieldGrp_t cfields 
:= *(*[]C.ushort)(unsafe.Pointer(&fields)) @@ -44,16 +49,16 @@ func FieldGroupCreate(groupName string, fields []Short) (fieldsId FieldHandle, e res := C.dcgmFieldGroupCreate(handle.handle, C.int(len(fields)), &cfields[0], gn, &fieldsGroup) if err = errorString(res); err != nil { - return fieldsId, fmt.Errorf("error creating DCGM fields group: %s", err) + return fgId, fmt.Errorf("error creating DCGM fields group: %s", err) } - fieldsId = FieldHandle{ + fgId = FieldGrpHandle{ handle: fieldsGroup, } return } -func FieldGroupDestroy(fieldGroup FieldHandle) (err error) { +func FieldGroupDestroy(fieldGroup FieldGrpHandle) (err error) { res := C.dcgmFieldGroupDestroy(handle.handle, fieldGroup.handle) if err = errorString(res); err != nil { return fmt.Errorf("error destroying DCGM fields group: %s", err) @@ -61,18 +66,19 @@ func FieldGroupDestroy(fieldGroup FieldHandle) (err error) { return nil } -func WatchFields(gpuId uint, fieldsGroup FieldHandle, groupName string) (GroupHandle, error) { - groups, err := CreateGroup(groupName) +func WatchFields(gpuIds []uint, fieldGrp FieldGrpHandle, groupName string) (GroupHandle, error) { + group, err := CreateGroup(groupName) if err != nil { return GroupHandle{}, err } - - err = AddDevice(groups, gpuId) - if err != nil { - return GroupHandle{}, err + for _, gpuId := range gpuIds { + err = AddDevice(group, gpuId) + if err != nil { + return GroupHandle{}, err + } } - res := C.dcgmWatchFields(handle.handle, groups.handle, fieldsGroup.handle, + res := C.dcgmWatchFields(handle.handle, group.handle, fieldGrp.handle, C.longlong(defaultUpdateFreq), C.double(defaultMaxKeepAge), C.int(defaultMaxKeepSamples)) @@ -85,7 +91,7 @@ func WatchFields(gpuId uint, fieldsGroup FieldHandle, groupName string) (GroupHa if err = errorString(res); err != nil { return GroupHandle{}, fmt.Errorf("error updating DCGM fields: %s", err) } - return groups, nil + return group, nil } func GetLatestValuesForFields(gpu uint, fields []Short) ([]FieldValue_v1, error) { @@ 
-113,10 +119,35 @@ func toFieldValue(values []C.dcgmFieldValue_v1) (fields []FieldValue_v1) { return } -// func FieldsInit() int { -// return int(C.ixdcgmFieldsInit()) -// } +func GetFieldValueStr(fv FieldValue_v1, typ string) string { + st := fv.Status + if st != C.DCGM_ST_OK { + return "N/A" + } -// func FieldsTerm() int { -// return int(C.ixdcgmFieldsTerm()) -// } + switch typ { + case "int64": + value := *(*int64)(unsafe.Pointer(&fv.Value[0])) + if value >= DCGM_INT64_BLANK { + return "N/A" // indicate the field is not supported + } + return fmt.Sprintf("%d", value) + + case "float64": + value := *(*float64)(unsafe.Pointer(&fv.Value[0])) + if value >= DCGM_FP64_BLANK { + return "N/A" // indicate the field is not supported + } + // sync the precision with the display of ixdcgmi + return fmt.Sprintf("%.3f", value) + + case "string": + // remove redundant spaces of string converted from C bytes + return removeBytesSpaces(fv.Value[:]) + + default: + fmt.Printf("Not Supported Type: %s\n", typ) + os.Exit(1) + return "N/A" + } +} diff --git a/pkg/ixdcgm/include/ixdcgmApiExport.h b/pkg/ixdcgm/include/ixdcgmApiExport.h index 8cee559..789cd6f 100644 --- a/pkg/ixdcgm/include/ixdcgmApiExport.h +++ b/pkg/ixdcgm/include/ixdcgmApiExport.h @@ -29,149 +29,227 @@ extern "C" #define IXDCGM_PRIVATE_API __attribute((visibility("hidden"))) ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmInit(void); - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmStartEmbedded(ixdcgmStartEmbeddedParam* params); + ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmStartEmbedded(ixdcgmStartEmbeddedParam *params); - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmEngineStart(unsigned short portNum, char const* socketPath, bool overTCP); + ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmEngineStart(unsigned short portNum, char const *socketPath, bool overTCP); ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmEngineRun(unsigned short portNumber, - char const* socketPath, - unsigned int isConnectionTCP); - IXDCGM_PUBLIC_API const char* 
ixdcgmErrorString(ixdcgmReturn_t result); + char const *socketPath, + unsigned int isConnectionTCP); + IXDCGM_PUBLIC_API const char *ixdcgmErrorString(ixdcgmReturn_t result); ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmDisconnect(ixdcgmHandle_t pixdcgmHandle); - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmConnect(const char* ipAddress, - ixdcgmConnectParams* connectParams, - ixdcgmHandle_t* pixdcgmHandle); - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetEntityGroupEntities(ixdcgmHandle_t pixdcgmHandle, + ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmConnect(const char *ipAddress, + ixdcgmConnectParams *connectParams, + ixdcgmHandle_t *pixdcgmHandle); + ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetEntityGroupEntities(ixdcgmHandle_t pixdcgmHandle, ixdcgm_field_entity_group_t entityGroup, - ixdcgm_field_eid_t* entities, - int* numEntities, - unsigned int flags); - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetDeviceAttributes(ixdcgmHandle_t pixdcgmHandle, - unsigned int gpuId, - ixdcgmDeviceAttributes_t* pixdcgmAttr); + ixdcgm_field_eid_t *entities, + int *numEntities, + unsigned int flags); + ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetDeviceAttributes(ixdcgmHandle_t pixdcgmHandle, + unsigned int gpuId, + ixdcgmDeviceAttributes_t *pixdcgmAttr); ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetAllDevices(ixdcgmHandle_t pixdcgmHandle, - unsigned int gpuIdList[IXDCGM_MAX_NUM_DEVICES], - int* count); + unsigned int gpuIdList[IXDCGM_MAX_NUM_DEVICES], + int *count); ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetAllSupportedDevices(ixdcgmHandle_t pixdcgmHandle, - unsigned int gpuIdList[IXDCGM_MAX_NUM_DEVICES], - int* count); - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmEntitiesGetLatestValues(ixdcgmHandle_t pDcgmHandle, + unsigned int gpuIdList[IXDCGM_MAX_NUM_DEVICES], + int *count); + ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmEntitiesGetLatestValues(ixdcgmHandle_t pDcgmHandle, ixdcgmGroupEntityPair_t entities[], - unsigned int entityCount, - unsigned short fields[], - unsigned int fieldCount, - unsigned int flags, - 
ixdcgmFieldValue_v2 values[]); - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmHostengineVersionInfo(ixdcgmHandle_t pixdcgmHandle, - ixdcgmVersionInfo_t* pVersionInfo); + unsigned int entityCount, + unsigned short fields[], + unsigned int fieldCount, + unsigned int flags, + ixdcgmFieldValue_v2 values[]); + ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmHostengineVersionInfo(ixdcgmHandle_t pixdcgmHandle, + ixdcgmVersionInfo_t *pVersionInfo); - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmVersionInfo(ixdcgmVersionInfo_t* pVersionInfo); + ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmVersionInfo(ixdcgmVersionInfo_t *pVersionInfo); /*Grouping APIs*/ - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGroupCreate(ixdcgmHandle_t pixdcgmHandle, + ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGroupCreate(ixdcgmHandle_t pixdcgmHandle, ixdcgmGroupType_t type, - const char* groupName, - ixdcgmGpuGrp_t* groupId); + const char *groupName, + ixdcgmGpuGrp_t *groupId); ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGroupDestroy(ixdcgmHandle_t pixdcgmHandle, ixdcgmGpuGrp_t groupId); - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGroupAddEntity(ixdcgmHandle_t pixdcgmHandle, - ixdcgmGpuGrp_t groupId, + ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGroupAddEntity(ixdcgmHandle_t pixdcgmHandle, + ixdcgmGpuGrp_t groupId, ixdcgm_field_entity_group_t entityGroupId, - ixdcgm_field_eid_t entityId); + ixdcgm_field_eid_t entityId); ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGroupAddDevice(ixdcgmHandle_t pixdcgmHandle, ixdcgmGpuGrp_t groupId, - unsigned int gpuId); + unsigned int gpuId); ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGroupRemoveDevice(ixdcgmHandle_t pixdcgmHandle, ixdcgmGpuGrp_t groupId, - unsigned int gpuId); + unsigned int gpuId); - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGroupRemoveEntity(ixdcgmHandle_t pixdcgmHandle, - ixdcgmGpuGrp_t groupId, + ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGroupRemoveEntity(ixdcgmHandle_t pixdcgmHandle, + ixdcgmGpuGrp_t groupId, ixdcgm_field_entity_group_t entityGroupId, - ixdcgm_field_eid_t entityId); + ixdcgm_field_eid_t 
entityId); - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGroupGetInfo(ixdcgmHandle_t pixdcgmHandle, - ixdcgmGpuGrp_t groupId, - ixdcgmGroupInfo_t* pDcgmGroupInfo); + ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGroupGetInfo(ixdcgmHandle_t pixdcgmHandle, + ixdcgmGpuGrp_t groupId, + ixdcgmGroupInfo_t *pDcgmGroupInfo); ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGroupGetAllIds(ixdcgmHandle_t pixdcgmHandle, ixdcgmGpuGrp_t groupIdList[], - unsigned int* count); + unsigned int *count); /* Field Grouping APIs*/ - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmFieldGroupCreate(ixdcgmHandle_t pixdcgmHandle, - int numFieldIds, - unsigned short* fieldIds, - const char* fieldGroupName, - ixdcgmFieldGrp_t* fieldGroupId); + ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmFieldGroupCreate(ixdcgmHandle_t pixdcgmHandle, + int numFieldIds, + unsigned short *fieldIds, + const char *fieldGroupName, + ixdcgmFieldGrp_t *fieldGroupId); - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmFieldGroupDestroy(ixdcgmHandle_t pixdcgmHandle, + ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmFieldGroupDestroy(ixdcgmHandle_t pixdcgmHandle, ixdcgmFieldGrp_t fieldGroupId); - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmFieldGroupGetInfo(ixdcgmHandle_t pixdcgmHandle, - ixdcgmFieldGroupInfo_t* fieldGroupInfo); + ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmFieldGroupGetInfo(ixdcgmHandle_t pixdcgmHandle, + ixdcgmFieldGroupInfo_t *fieldGroupInfo); - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmFieldGroupGetAll(ixdcgmHandle_t pixdcgmHandle, - ixdcgmAllFieldGroup_t* allGroupInfo); + ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmFieldGroupGetAll(ixdcgmHandle_t pixdcgmHandle, + ixdcgmAllFieldGroup_t *allGroupInfo); - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmWatchFields(ixdcgmHandle_t pixdcgmHandle, - ixdcgmGpuGrp_t groupId, + ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmWatchFields(ixdcgmHandle_t pixdcgmHandle, + ixdcgmGpuGrp_t groupId, ixdcgmFieldGrp_t fieldGroupId, - long long updateFreq, - double maxKeepAge, - int maxKeepSamples); + long long updateFreq, + double maxKeepAge, + int 
maxKeepSamples); - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmUnwatchFields(ixdcgmHandle_t pixdcgmHandle, - ixdcgmGpuGrp_t groupId, + ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmUnwatchFields(ixdcgmHandle_t pixdcgmHandle, + ixdcgmGpuGrp_t groupId, ixdcgmFieldGrp_t fieldGroupId); - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmStatusCreate(ixdcgmStatus_t* statusHandle); + ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmStatusCreate(ixdcgmStatus_t *statusHandle); ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmStatusDestroy(ixdcgmStatus_t statusHandle); - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmConfigGet(ixdcgmHandle_t pixdcgmHandle, - ixdcgmGpuGrp_t groupId, + ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmStatusGetCount(ixdcgmStatus_t statusHandle, unsigned int *count); + + ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmStatusPopError(ixdcgmStatus_t statusHandle, + ixdcgmErrorInfo_t *pixdcgmErrorInfo); + + ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmStatusClear(ixdcgmStatus_t statusHandle); + + ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmConfigGet(ixdcgmHandle_t pixdcgmHandle, + ixdcgmGpuGrp_t groupId, ixdcgmConfigType_t type, - int count, - ixdcgmConfig_t deviceConfigList[], - ixdcgmStatus_t statusHandle); - - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetValuesSince_v2(ixdcgmHandle_t pixdcgmHandle, - ixdcgmGpuGrp_t groupId, - ixdcgmFieldGrp_t fieldGroupId, - long long sinceTimestamp, - long long* nextSinceTimestamp, + int count, + ixdcgmConfig_t deviceConfigList[], + ixdcgmStatus_t statusHandle); + + ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmConfigSet(ixdcgmHandle_t pixdcgmHandle, + ixdcgmGpuGrp_t groupId, + ixdcgmConfig_t *pDeviceConfig, + ixdcgmStatus_t statusHandle); + + ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmConfigEnforce(ixdcgmHandle_t pixdcgmHandle, + ixdcgmGpuGrp_t groupId, + ixdcgmStatus_t statusHandle); + + ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetValuesSince_v2(ixdcgmHandle_t pixdcgmHandle, + ixdcgmGpuGrp_t groupId, + ixdcgmFieldGrp_t fieldGroupId, + long long sinceTimestamp, + long long *nextSinceTimestamp, 
ixdcgmFieldValueEntityEnumeration_f enumCB, - void* userData); + void *userData); + + ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetLatestValues_v2(ixdcgmHandle_t pDcgmHandle, + ixdcgmGpuGrp_t groupId, + ixdcgmFieldGrp_t fieldGroupId, + ixdcgmFieldValueEntityEnumeration_f enumCB, + void *userData); ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmStopEmbedded(ixdcgmHandle_t pixdcgmHandle); - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetFieldSummary(ixdcgmHandle_t pixdcgmHandle, - ixdcgmFieldSummaryRequest_t* request); + ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetFieldSummary(ixdcgmHandle_t pixdcgmHandle, + ixdcgmFieldSummaryRequest_t *request); ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmShutdown(void); - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmModuleIdToName(ixdcgmModuleId_t id, char const** name); + ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmModuleIdToName(ixdcgmModuleId_t id, char const **name); - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetLatestValuesForFields(ixdcgmHandle_t pixdcgmHandle, - int gpuId, - unsigned short fields[], - unsigned int count, + ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetLatestValuesForFields(ixdcgmHandle_t pixdcgmHandle, + int gpuId, + unsigned short fields[], + unsigned int count, ixdcgmFieldValue_v1 values[]); ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmUpdateAllFields(ixdcgmHandle_t pixdcgmHandle, int waitForUpdate); ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmHostengineSetLoggingSeverity(ixdcgmHandle_t pixdcgmHandle, - ixdcgmSettingsSetLoggingSeverity_t* logging); + ixdcgmSettingsSetLoggingSeverity_t *logging); ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmDeviceOnSameBoard(ixdcgmHandle_t pixdcgmHandle, - unsigned int gpuId1, - unsigned int gpuId2, - int* onSameBoard); + unsigned int gpuId1, + unsigned int gpuId2, + int *onSameBoard); + + ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmModuleGetStatuses(ixdcgmHandle_t pixdcgmHandle, + ixdcgmModuleGetStatuses_t *moduleStatuses); + + ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmModuleDenylist(ixdcgmHandle_t pixdcgmHandle, ixdcgmModuleId_t 
moduleId); + + ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetDeviceTopology(ixdcgmHandle_t pixdcgmHandle, + unsigned int gpuId, + ixdcgmDeviceTopology_t *deviceTopology); + + ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetGroupTopology(ixdcgmHandle_t pixdcgmHandle, + ixdcgmGpuGrp_t groupId, + ixdcgmGroupTopology_t *pixdcgmGroupTopology); + + ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmSelectGpusByTopology(ixdcgmHandle_t pixdcgmHandle, + uint64_t inputGpuIds, + uint32_t numGpus, + uint64_t *outputGpuIds, + uint64_t hintFlags); + + ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmHealthGet(ixdcgmHandle_t pixdcgmHandle, + ixdcgmGpuGrp_t groupId, + ixdcgmHealthSystems_t *systems); + + ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmHealthSet_v2(ixdcgmHandle_t pixdcgmHandle, ixdcgmHealthSetParams_v2 *params); + + ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmHealthCheck_v4(ixdcgmHandle_t pixdcgmHandle, + ixdcgmGpuGrp_t groupId, + ixdcgmHealthResponse_v4 *response); + + ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetLinkStatus(ixdcgmHandle_t pixdcgmHandle, ixdcgmLinkStatus_v3 *linkStatus); + + ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmProfGetSupportedMetricGroups(ixdcgmHandle_t pixdcgmHandle, + ixdcgmProfGetMetricGroups_t *metricGroups); + + /** + * Gets all the running process info corresponding to the gpuId . + * @param pixdcgmHandle IN: ixDCGM Handle + * @param gpuId IN: GPU Id corresponding to which the processes info should be fetched + * @param infoCount IN/OUT: + * IN - max number of the info could be stored in to the pids and usedMemoryBytes buffer + * OUT - When API return DCGM_ST_OK, stored number of valid pids/usedMemoryBytes info collected. + * When API return IXDCGM_RET_INSUFFICIENT_SIZE, stored the number of buffer needed. + * @param pids OUT: Buffer to store returned processes pid + * @param usedMemoryBytes OUT: Buffer to store returned processes used memory in byte + * + * @return + * - \ref DCGM_ST_OK if the call was successful. 
+ * - \ref IXDCGM_RET_INSUFFICIENT_SIZE if the infoCount input is smaller than the buffer needed. + * - \ref DCGM_ST_BADPARAM if gpuId, infoCunt, pids or usedMemoryBytes not valid. + **/ + ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetDeviceRunningProcesses(ixdcgmHandle_t pixdcgmHandle, + unsigned int gpuId, + unsigned int *infoCount, + uint64_t *pids, + uint64_t *usedMemoryBytes); #ifdef __cplusplus } #endif -#endif // end of __IXDCGM_API_EXPORT_H__ \ No newline at end of file +#endif // end of __IXDCGM_API_EXPORT_H__ \ No newline at end of file diff --git a/pkg/ixdcgm/include/ixdcgmFields.h b/pkg/ixdcgm/include/ixdcgmFields.h index 3edd33c..64a5b84 100644 --- a/pkg/ixdcgm/include/ixdcgmFields.h +++ b/pkg/ixdcgm/include/ixdcgmFields.h @@ -491,7 +491,9 @@ extern "C" ixdcgm_field_meta_p __attribute((visibility("default"))) ixdcgmFieldGetById(unsigned short fieldId); bool ixdcgmIsFieldSupported(unsigned short fieldId); - const char __attribute((visibility("default"))) *ixdcgmFieldsGetEntityGroupString(ixdcgm_field_entity_group_t entityGroupId); + bool ixdcgmIsNvlinkField(unsigned short fieldId); + const char __attribute((visibility("default"))) * + ixdcgmFieldsGetEntityGroupString(ixdcgm_field_entity_group_t entityGroupId); #ifdef __cplusplus } diff --git a/pkg/ixdcgm/include/ixdcgmStructs.h b/pkg/ixdcgm/include/ixdcgmStructs.h index 879de6c..20848a1 100644 --- a/pkg/ixdcgm/include/ixdcgmStructs.h +++ b/pkg/ixdcgm/include/ixdcgmStructs.h @@ -21,8 +21,17 @@ #define IXDCGM_VGPU_NAME_BUFFER_SIZE 64 #define IXDCGM_DEVICE_UUID_BUFFER_SIZE 80 +#define IXDCGM_CONFIG_COMPUTEMODE_DEFAULT 0 +#define IXDCGM_CONFIG_COMPUTEMODE_PROHIBITED 1 +#define IXDCGM_CONFIG_COMPUTEMODE_EXCLUSIVE_PROCESS 2 + #define IXDCGM_GROUP_MAX_ENTITIES 64 +/*IXDCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT field not supported, + set IXDCGM_LINK_ERROR_COUNT and IXDCGM_HEALTH_WATCH_LINK_ERROR_NUM_FIELDS to 3. 
*/ +#define IXDCGM_LINK_ERROR_COUNT 3 +#define IXDCGM_HEALTH_WATCH_LINK_ERROR_NUM_FIELDS 3 + #define IXDCGM_INT32_BLANK 0x7ffffff0 #define IXDCGM_INT64_BLANK 0x7ffffffffffffff0ll #define IXDCGM_FP64_BLANK 140737488355328.0 @@ -188,6 +197,29 @@ typedef enum This status implies that the module is loaded. */ } ixdcgmModuleStatus_t; +typedef struct +{ + ixdcgmModuleId_t id; //!< ID of this module + ixdcgmModuleStatus_t status; //!< Status of this module +} ixdcgmModuleGetStatusesModule_t; + +/* This is larger than ixdcgmModuleIdCount so we can add modules without versioning this request */ +#define IXDCGM_MODULE_STATUSES_CAPACITY 16 + +typedef struct +{ + unsigned int version; //!< Version of this request. Should be ixdcgmModuleGetStatuses_version1 + unsigned int numStatuses; //!< Number of entries in statuses[] that are populated + ixdcgmModuleGetStatusesModule_t statuses[IXDCGM_MODULE_STATUSES_CAPACITY]; //!< Per-module status information +} ixdcgmModuleGetStatuses_v1; + +/** + * Version 1 of dcgmModuleGetStatuses + */ +#define ixdcgmModuleGetStatuses_version1 MAKE_IXDCGM_VERSION(ixdcgmModuleGetStatuses_v1, 1) +#define ixdcgmModuleGetStatuses_version ixdcgmModuleGetStatuses_version1 +typedef ixdcgmModuleGetStatuses_v1 ixdcgmModuleGetStatuses_t; + typedef struct { unsigned int version; /*!< Version number. 
Use ixdcgmStartEmbeddedV2Params_version2 */ @@ -205,10 +237,11 @@ typedef unsigned int ixdcgm_connection_id_t; #define IXDCGM_CONNECTION_ID_NONE ((ixdcgm_connection_id_t)0) #define IXDCGM_HOSTENGINE_DEFAULT_PORT 5777 -#define IXDCGM_HOSTENGINE_LOCAL_ADDR "0.0.0.0" // Default set to listen to ALL IP addrs +#define IXDCGM_HOSTENGINE_LOCAL_ADDR "0.0.0.0" // Default set to listen to ALL IP addrs +#define IXDCGM_HOSTENGINE_DEFAULT_SOCKET "/tmp/ix-hostengine" // Default set to listen to ALL IP addrs #define IXDCGM_EMBEDDED_HANDLE 0x7fffffff -#define IXDCGM_MAX_NUM_DEVICES 16 +#define IXDCGM_MAX_NUM_DEVICES 32 #define IXDCGM_MAX_NUM_GROUPS 64 #define IXDCGM_CMI_F_WATCHED 0x00000001 /* Is this field being watched? */ @@ -412,7 +445,7 @@ typedef struct unsigned short fieldId; //!< One of IXDCGM_FI_? unsigned short fieldType; //!< One of IXDCGM_FT_? - int status; //!< Status for the querying the field. IXDCGM_ST_OK or one of IXDCGM_ST_? + int status; //!< Status for the querying the field. IXDCGM_RET_OK or one of IXDCGM_RET_? int64_t ts; //!< Timestamp in usec since 1970 union { int64_t i64; //!< Int64 value @@ -431,7 +464,7 @@ typedef struct ixdcgm_field_eid_t entityId; //!< Entity this field value belongs to unsigned short fieldId; //!< One of IXDCGM_FI_? unsigned short fieldType; //!< One of IXDCGM_FT_? - int status; //!< Status for the querying the field. IXDCGM_ST_OK or one of IXDCGM_ST_? + int status; //!< Status for the querying the field. IXDCGM_RET_OK or one of IXDCGM_RET_? unsigned int unused; //!< Unused for now to align ts to an 8-byte boundary. int64_t ts; //!< Timestamp in usec since 1970 union { @@ -443,6 +476,27 @@ typedef struct } ixdcgmFieldValue_v2; #define ixdcgmFieldValue_version2 MAKE_IXDCGM_VERSION(ixdcgmFieldValue_v2, 2) +/** + * User callback function for processing one or more field updates. This callback will + * be invoked one or more times per field until all of the expected field values have been + * enumerated. 
It is up to the callee to detect when the field id changes + * + * @param gpuId IN: GPU ID of the GPU this field value set belongs to + * @param values IN: Field values. These values must be copied as they will be destroyed as soon as this + * call returns. + * @param numValues IN: Number of entries that are valid in values[] + * @param userData IN: User data pointer passed to the update function that generated this callback + * + * @returns + * 0 if OK + * <0 if enumeration should stop. This allows to callee to abort field value enumeration. + * + */ +typedef int (*ixdcgmFieldValueEnumeration_f)(unsigned int gpuId, + ixdcgmFieldValue_v1* values, + int numValues, + void* userData); + /* Bitmask values for ixdcgmGetFieldIdSummary - Sync with DcgmcmSummaryType_t */ #define IXDCGM_SUMMARY_MIN 0x00000001 #define IXDCGM_SUMMARY_MAX 0x00000002 @@ -598,7 +652,34 @@ typedef enum ixdcgmLinkState_enum ixdcgmLinkStateUp = 3 //!< This Link link is up (active) } ixdcgmLinkState_t; -#define IXDCGM_MAX_LINKS_PER_GPU 16 +#define IXDCGM_MAX_LINKS_PER_GPU 18 +#define IXDCGM_MAX_NUM_SWITCHES 12 +#define IXDCGM_MAX_LINKS_PER_SWITCH 64 + +typedef struct +{ + ixdcgm_field_eid_t entityId; //!< Entity ID of the GPU (gpuId) + ixdcgmLinkState_t linkState[IXDCGM_MAX_LINKS_PER_GPU]; //!< Per-GPU link states +} ixdcgmLinkGpuLinkStatus_v3; + +typedef struct +{ + ixdcgm_field_eid_t entityId; //!< Entity ID of the NvSwitch (physicalId) + ixdcgmLinkState_t linkState[IXDCGM_MAX_LINKS_PER_SWITCH]; //!< Per-NvSwitch link states +} ixdcgmSwitchLinkStatus_t; + +typedef struct +{ + unsigned int version; //!< Version of this request. 
Should be dcgmNvLinkStatus_version1 + unsigned int numGpus; //!< Number of entries in gpus[] that are populated + ixdcgmLinkGpuLinkStatus_v3 gpus[IXDCGM_MAX_NUM_DEVICES]; //!< Per-GPU NvLink link statuses + unsigned int numNvSwitches; //!< Number of entries in nvSwitches[] that are populated + ixdcgmSwitchLinkStatus_t nvSwitches[IXDCGM_MAX_NUM_SWITCHES]; //!< Per-NvSwitch link statuses +} ixdcgmLinkStatus_v3; + +typedef ixdcgmLinkStatus_v3 ixdcgmLinkStatus_t; + +#define ixdcgmLinkStatus_version3 MAKE_IXDCGM_VERSION(ixdcgmLinkStatus_v3, 3) typedef struct { @@ -708,6 +789,15 @@ typedef enum ixdcgmGpuLevel_enum #define IXDCGM_TOPOLOGY_PATH_LINK(x) (ixdcgmGpuTopologyLevel_t)((unsigned int)(x) & 0xFFFFFF00) #define IXDCGM_AFFINITY_BITMASK_ARRAY_SIZE 8 + +/** No hints specified */ +#define IXDCGM_TOPO_HINT_F_NONE 0x00000000 + +/** Ignore the health of the GPUs when picking GPUs for job + * execution. By default, only healthy GPUs are considered. + */ +#define IXDCGM_TOPO_HINT_F_IGNOREHEALTH 0x00000001 + /** * Device topology information */ @@ -729,7 +819,7 @@ typedef struct //!< 0x210 = IXDCGM_TOPOLOGY_CPU | IXDCGM_TOPOLOGY_LINK2 //!< Use the macros IXDCGM_TOPOLOGY_PATH_LINK and //!< IXDCGM_TOPOLOGY_PATH_PCI to mask the NvLink and PCI paths, respectively. - unsigned int localNvLinkIds; //!< bits representing the local links connected to gpuId + unsigned int localLinkIds; //!< bits representing the local links connected to gpuId //!< e.g. if this field == 3, links 0 and 1 are connected, //!< field is only valid if LINKS actually exist between GPUs } gpuPaths[IXDCGM_MAX_NUM_DEVICES - 1]; @@ -847,6 +937,16 @@ typedef ixdcgmAllFieldGroup_v1 ixdcgmAllFieldGroup_t; */ #define ixdcgmAllFieldGroup_version ixdcgmAllFieldGroup_version1 +/** + * Structure to represent error attributes + */ +typedef struct +{ + unsigned int gpuId; //!< Represents GPU ID + short fieldId; //!< One of DCGM_FI_? + int status; //!< One of DCGM_ST_? 
+} ixdcgmErrorInfo_t; + typedef struct { int targetLogger; @@ -857,4 +957,127 @@ typedef struct #define ixdcgmSettingsSetLoggingSeverity_version ixdcgmSettingsSetLoggingSeverity_version1 typedef ixdcgmSettingsSetLoggingSeverity_v1 ixdcgmSettingsSetLoggingSeverity_t; +/** + * Systems structure used to enable or disable health watch systems + */ +typedef enum ixdcgmHealthSystems_enum +{ + IXDCGM_HEALTH_WATCH_PCIE = 0x1, //!< PCIe system watches (must have 1m of data before query) + IXDCGM_HEALTH_WATCH_NVLINK = 0x2, //!< NVLINK system watches + IXDCGM_HEALTH_WATCH_PMU = 0x4, //!< Power management unit watches + IXDCGM_HEALTH_WATCH_MCU = 0x8, //!< Micro-controller unit watches + IXDCGM_HEALTH_WATCH_MEM = 0x10, //!< Memory watches + IXDCGM_HEALTH_WATCH_SM = 0x20, //!< Streaming multiprocessor watches + IXDCGM_HEALTH_WATCH_INFOROM = 0x40, //!< Inforom watches + IXDCGM_HEALTH_WATCH_THERMAL = 0x80, //!< Temperature watches (must have 1m of data before query) + IXDCGM_HEALTH_WATCH_POWER = 0x100, //!< Power watches (must have 1m of data before query) + IXDCGM_HEALTH_WATCH_DRIVER = 0x200, //!< Driver-related watches + IXDCGM_HEALTH_WATCH_NVSWITCH_NONFATAL = 0x400, //!< Non-fatal errors in NvSwitch + IXDCGM_HEALTH_WATCH_NVSWITCH_FATAL = 0x800, //!< Fatal errors in NvSwitch + + // ... 
+ IXDCGM_HEALTH_WATCH_ALL = 0xFFFFFFFF //!< All watches enabled +} ixdcgmHealthSystems_t; + +#define IXDCGM_HEALTH_WATCH_COUNT_V1 10 /*!< For iterating through the dcgmHealthSystems_v1 enum */ +#define IXDCGM_HEALTH_WATCH_COUNT_V2 12 /*!< For iterating through the dcgmHealthSystems_v2 enum */ + +/** + * Health Watch test results + */ +typedef enum ixdcgmHealthWatchResult_enum +{ + IXDCGM_HEALTH_RESULT_PASS = 0, //!< All results within this system are reporting normal + IXDCGM_HEALTH_RESULT_WARN = 10, //!< A warning has been issued, refer to the response for more information + IXDCGM_HEALTH_RESULT_FAIL = 20, //!< A failure has been issued, refer to the response for more information +} ixdcgmHealthWatchResults_t; + +typedef struct +{ + char msg[1024]; + unsigned int code; +} ixdcgmDiagErrorDetail_t; + +#define IXDCGM_ERR_MSG_LENGTH 512 +#define IXDCGM_HEALTH_WATCH_MAX_INCIDENTS IXDCGM_GROUP_MAX_ENTITIES + +typedef struct +{ + ixdcgmHealthSystems_t system; //!< system to which this information belongs + ixdcgmHealthWatchResults_t health; //!< health diagnosis of this incident + ixdcgmDiagErrorDetail_t error; //!< Information about the error(s) and their error codes + ixdcgmGroupEntityPair_t entityInfo; //!< identify which entity has this error +} ixdcgmIncidentInfo_t; + +/** + * Structure used to set health watches via the dcgmHealthSet_v2 API + */ +typedef struct +{ + unsigned int version; /*!< Version of this struct. Should be ixdcgmHealthSet_version2 */ + ixdcgmGpuGrp_t groupId; /*!< Group ID representing collection of one or more entities. Look + at \ref dcgmGroupCreate for details on creating the group. + Alternatively, pass in the group id as \a IXDCGM_GROUP_ALL_GPUS + to perform operation on all the GPUs or \a IXDCGM_GROUP_ALL_NVSWITCHES + to perform operation on all the NvSwitches. */ + ixdcgmHealthSystems_t systems; /*!< An enum representing systems that should be enabled for health + checks logically OR'd together. 
Refer to \ref ixdcgmHealthSystems_t + for details. */ + long long updateInterval; /*!< How often to query the underlying health information from the + driver in usec. This should be the same as how often you call + ixdcgmHealthCheck */ + double maxKeepAge; /*!< How long to keep data cached for this field in seconds. This should + be at least your maximum time between calling ixdcgmHealthCheck */ +} ixdcgmHealthSetParams_v2; + +/** + * Version 2 for \ref ixdcgmHealthSet_v2 + */ +#define ixdcgmHealthSetParams_version2 MAKE_IXDCGM_VERSION(ixdcgmHealthSetParams_v2, 2) + +typedef struct +{ + unsigned int version; //!< The version number of this struct + ixdcgmHealthWatchResults_t overallHealth; //!< The overall health of this entire host + unsigned int incidentCount; //!< The number of health incidents reported in this struct + ixdcgmIncidentInfo_t incidents[IXDCGM_HEALTH_WATCH_MAX_INCIDENTS]; //!< Report of the errors detected +} ixdcgmHealthResponse_v4; + +#define ixdcgmHealthResponse_version4 MAKE_IXDCGM_VERSION(ixdcgmHealthResponse_v4, 4) +#define ixdcgmHealthResponse_version ixdcgmHealthResponse_version4 +typedef ixdcgmHealthResponse_v4 ixdcgmHealthResponse_t; + +#define IXDCGM_PROF_MAX_NUM_GROUPS_V2 10 + +#define IXDCGM_PROF_MAX_FIELD_IDS_PER_GROUP_V2 64 + +typedef struct +{ + unsigned short majorId; //!< Major ID of this metric group. Metric groups with the same majorId cannot be + //!< watched concurrently with other metric groups with the same majorId + unsigned short minorId; //!< Minor ID of this metric group. This distinguishes metric groups within the same + //!< major metric group from each other + unsigned int numFieldIds; //!< Number of field IDs that are populated in fieldIds[] + unsigned short fieldIds[IXDCGM_PROF_MAX_FIELD_IDS_PER_GROUP_V2]; //!< DCGM Field IDs that are part of this + //!< profiling group. See DCGM_FI_PROF_* + //!< definitions in dcgm_fields.h for details. 
+} ixdcgmProfMetricGroupInfo_v2; + +typedef struct +{ + unsigned int version; //!< Version of this request. Should be dcgmProfGetMetricGroups_version + unsigned int unused; //!< Not used for now. Set to 0 + unsigned int gpuId; //!< GPU ID we should get the metric groups for. + + unsigned int numMetricGroups; //!< Number of entries in metricGroups[] that are populated + ixdcgmProfMetricGroupInfo_v2 metricGroups[IXDCGM_PROF_MAX_NUM_GROUPS_V2]; //!< Info for each metric group +} ixdcgmProfGetMetricGroups_v3; + +/** + * Version 3 of dcgmProfGetMetricGroups_t. See dcgm_structs_24.h for v2 + */ +#define ixdcgmProfGetMetricGroups_version3 MAKE_IXDCGM_VERSION(ixdcgmProfGetMetricGroups_v3, 3) +#define ixdcgmProfGetMetricGroups_version ixdcgmProfGetMetricGroups_version3 +typedef ixdcgmProfGetMetricGroups_v3 ixdcgmProfGetMetricGroups_t; + #endif // end of __IXDCGM_STRUCTS_H__ \ No newline at end of file diff --git a/pkg/ixdcgm/process_info.go b/pkg/ixdcgm/process_info.go new file mode 100644 index 0000000..74955a8 --- /dev/null +++ b/pkg/ixdcgm/process_info.go @@ -0,0 +1,85 @@ +/* +Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); you may +not use this file except in compliance with the License. You may obtain +a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+package ixdcgm
+
+/*
+#cgo LDFLAGS: -ldl
+#include "include/ixdcgmFields.h"
+#include "include/ixdcgmStructs.h"
+#include "include/ixdcgmApiExport.h"
+*/
+import "C"
+import (
+	"bytes"
+	"fmt"
+	"os"
+	"strings"
+)
+
+// DeviceProcessInfo describes one process reported as running on a GPU.
+type DeviceProcessInfo struct {
+	Pid           uint64
+	Name          string // command line from /proc/<pid>/cmdline; "" if it cannot be read
+	UsedGpuMemory uint64 // MiB
+}
+
+// getDeviceRunningProcesses returns one DeviceProcessInfo per process the library reports for gpuId.
+func getDeviceRunningProcesses(gpuId uint) ([]DeviceProcessInfo, error) {
+	cnt, pids, usedMemoryBytes, err := ixdcgmGetDeviceRunningProcesses(gpuId)
+	if err != nil {
+		return nil, err
+	}
+	infos := make([]DeviceProcessInfo, 0, len(pids))
+	for i := 0; i < int(cnt) && i < len(pids); i++ { // clamp: never index past the buffers
+		pid := uint64(pids[i])
+		mib := uint64(usedMemoryBytes[i]) / 1024 / 1024 // bytes -> MiB
+		infos = append(infos, DeviceProcessInfo{Pid: pid, Name: getPidName(pid), UsedGpuMemory: mib})
+	}
+	return infos, nil
+}
+
+// ixdcgmGetDeviceRunningProcesses calls the C API, retrying once when the first buffer is too small.
+func ixdcgmGetDeviceRunningProcesses(gpuId uint) (cnt C.uint32_t, pids []C.uint64_t, usedMemoryBytes []C.uint64_t, err error) {
+	cnt = 1
+	for i := 0; i < 2; i++ {
+		pids = make([]C.uint64_t, cnt)
+		usedMemoryBytes = make([]C.uint64_t, cnt)
+		ret := C.ixdcgmGetDeviceRunningProcesses(C.ulong(handle.handle), C.uint(gpuId), &cnt, &pids[0], &usedMemoryBytes[0])
+		switch ret {
+		case C.IXDCGM_RET_OK:
+			return cnt, pids, usedMemoryBytes, nil
+		case C.IXDCGM_RET_INSUFFICIENT_SIZE:
+			continue // retry with the size the call stored in cnt
+		case C.IXDCGM_RET_BADPARAM:
+			return 0, nil, nil, fmt.Errorf("bad parameter")
+		default: // any other failure is final; retrying could reach &pids[0] with cnt == 0
+			return 0, nil, nil, fmt.Errorf("failed to call ixdcgm api, return code %d", int(ret))
+		}
+	}
+	return 0, nil, nil, fmt.Errorf("failed to call ixdcgm api with the needed buffer size %d", uint32(cnt))
+}
+
+// getPidName returns pid's command line with arguments joined by spaces, or "" if it cannot be read.
+func getPidName(pid uint64) string {
+	data, err := os.ReadFile(fmt.Sprintf("/proc/%d/cmdline", pid))
+	if err != nil {
+		return ""
+	}
+	// Strip the trailing NUL(s) first so the result does not end in a space.
+	return strings.ReplaceAll(string(bytes.TrimRight(data, "\x00")), "\x00", " ")
+}
diff --git a/pkg/ixdcgm/utils.go b/pkg/ixdcgm/utils.go
index 
5ac78c6..7ab4650 100644
--- a/pkg/ixdcgm/utils.go
+++ b/pkg/ixdcgm/utils.go
@@ -75,3 +75,13 @@ func freeCString(c *C.char) {
 func cChar2String(c *C.char) string {
 	return C.GoString(c)
 }
+
+// removeBytesSpaces converts originalBytes to a string after dropping every
+// trailing zero (NUL) byte; interior zero bytes and spaces are left in place.
+func removeBytesSpaces(originalBytes []byte) string {
+	end := len(originalBytes)
+	for end > 0 && originalBytes[end-1] == 0 {
+		end--
+	}
+	return string(originalBytes[:end])
+}
diff --git a/samples/deviceinfo/main.go b/samples/deviceinfo/main.go
index 79169d2..5f71142 100644
--- a/samples/deviceinfo/main.go
+++ b/samples/deviceinfo/main.go
@@ -40,7 +40,7 @@ Total Memory (MB): : {{or .MemoryUsage.Total "N/A"}}
 Used Memory (MB): : {{or .MemoryUsage.Used "N/A"}}
 Free Memory (MB): : {{or .MemoryUsage.Free "N/A"}}
 Bandwidth (MB/s) : {{or .PCI.Bandwidth "N/A"}}
-Power (W) : {{or .Power "N/A"}}
+PowerLimit (W) : {{or .PowerLimit "N/A"}}
 ---------------------------------------------------------------------
 `
 )
diff --git a/samples/deviceprocessinfo/main.go b/samples/deviceprocessinfo/main.go
new file mode 100644
index 0000000..f16b1ba
--- /dev/null
+++ b/samples/deviceprocessinfo/main.go
@@ -0,0 +1,57 @@
+/*
+Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may
+not use this file except in compliance with the License. You may obtain
+a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package main
+
+import (
+	"fmt"
+	"log"
+	"os"
+	"os/signal"
+	"syscall"
+
+	"gitee.com/deep-spark/go-ixdcgm/pkg/ixdcgm"
+)
+
+func main() {
+	// NOTE(review): sigs is never read, so these signals are swallowed instead of stopping the sample; confirm intended.
+	sigs := make(chan os.Signal, 1)
+	signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)
+
+	cleanup, err := ixdcgm.Init(ixdcgm.Embedded, "LogInfo")
+	if err != nil {
+		log.Panicln(err)
+	}
+	defer cleanup()
+
+	gpuIds, err := ixdcgm.GetSupportedDevices()
+	if err != nil {
+		log.Panicln(err)
+	}
+
+	for _, gpuId := range gpuIds {
+		fmt.Printf("Get the running process infos of gpu %d\n", gpuId)
+		infos, err := ixdcgm.GetDeviceRunningProcesses(gpuId)
+		if err != nil {
+			fmt.Printf("%v\n", err) // report the failure and carry on with the next GPU
+		}
+		for _, info := range infos {
+			fmt.Printf("> Pid: %d, Name: %s, UsedGpuMemory(MiB): %d\n", info.Pid, info.Name, info.UsedGpuMemory)
+		}
+		fmt.Println("---------------------------------------------------------------------")
+	}
+}
diff --git a/samples/devicestatus/main.go b/samples/devicestatus/main.go
index 1412d0e..91dbb1e 100644
--- a/samples/devicestatus/main.go
+++ b/samples/devicestatus/main.go
@@ -23,14 +23,14 @@ import (
 	"os"
 	"os/signal"
 	"syscall"
-	"time"
 
 	"gitee.com/deep-spark/go-ixdcgm/pkg/ixdcgm"
 )
 
 func main() {
+	// NOTE(review): with the select loop removed below nothing reads sigs, so these signals are swallowed; confirm intended.
 	sigs := make(chan os.Signal, 1)
-	signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM)
+	signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)
 
 	cleanup, err := ixdcgm.Init(ixdcgm.Embedded)
 	if err != nil {
@@ -38,34 +38,42 @@ func main() {
 	}
 	defer cleanup()
 
-	gpus, err := ixdcgm.GetSupportedDevices()
+	gpuIds, err := ixdcgm.GetSupportedDevices()
 	if err != nil {
 		log.Panicln(err)
 	}
 
-	ticker := time.NewTicker(time.Second * 5)
-	defer ticker.Stop()
-
-	for {
-		select {
-		case <-ticker.C:
-			for _, gpu := range gpus {
-				st, err := ixdcgm.GetDeviceStatus(gpu)
-				if err != nil {
-					log.Panicln(err)
-				}
-				fmt.Printf("st.Power Usage %f\n", st.Power)
-				fmt.Printf("st.Temperature %d\n", st.Temperature)
-				fmt.Printf("st.Utilization.GPU %d\n", st.Utilization.GPU)
-				fmt.Printf("st.Utilization.Memory %d\n", st.Utilization.Memory)
-				fmt.Printf("st.Clocks.Cores %d\n", st.Clocks.Cores)
-				fmt.Printf("st.Clocks.Memory %d\n", st.Clocks.Memory)
-				fmt.Printf("st.FanSpeed %d\n", st.FanSpeed)
-				fmt.Printf("st.st.PCI.Tx %d, st.PCI.Rx %d, st.PCI.Replays %d\n", st.PCI.Tx, st.PCI.Rx, st.PCI.Replays)
-			}
-
-		case <-sigs:
-			return
+	for _, gpuId := range gpuIds {
+		// log.Panicln unwinds through the deferred cleanup() above before the sample exits.
+		st, err := ixdcgm.GetDeviceStatus(gpuId)
+		if err != nil {
+			log.Panicln(err)
+		}
+		pst, err := ixdcgm.GetDeviceProfStatus(gpuId)
+		if err != nil {
+			log.Panicln(err)
 		}
+
+		fmt.Printf("GPUId : %d\n", st.Id)
+		fmt.Printf("Power Usage (W) : %s\n", st.Power)
+		fmt.Printf("Temperature (°C) : %s\n", st.Temperature)
+		fmt.Printf("FanSpeed (%%) : %s\n", st.FanSpeed)
+		fmt.Printf("Utilization.GPU (%%) : %d\n", st.Utilization.Gpu)
+		fmt.Printf("Utilization.Mem (%%) : %d\n", st.Utilization.Mem)
+		fmt.Printf("Clocks.Cores (MHz) : %d\n", st.Clocks.Sm)
+		fmt.Printf("Clocks.Mem (MHz) : %d\n", st.Clocks.Mem)
+		fmt.Printf("EccSbeVolDev : %s\n", st.EccSbeVolDev)
+		fmt.Printf("EccDbeVolDev : %s\n", st.EccDbeVolDev)
+		fmt.Printf("PCI.Tx (MB/s) : %d\n", st.PCI.Tx)
+		fmt.Printf("PCI.Rx (MB/s) : %d\n", st.PCI.Rx)
+		fmt.Printf("PCI.ReplayCounter : %d\n", st.PCI.ReplayCounter)
+		fmt.Printf("Total Memory (MB) : %d\n", st.MemUsage.Total)
+		fmt.Printf("Used Memory (MB) : %d\n", st.MemUsage.Used)
+		fmt.Printf("Free Memory (MB) : %d\n", st.MemUsage.Free)
+		fmt.Printf("SmActive : %s\n", pst.SmActive)
+		fmt.Printf("SmOccupancy : %s\n", pst.SmOccupancy)
+		fmt.Printf("DramActive : %s\n", pst.DramActive)
+		fmt.Println("-------------------------------------------")
 	}
+
 }
-- 
Gitee