From dc53d4301e006f0395895b823fabe6436cbce148 Mon Sep 17 00:00:00 2001 From: funnyfunny <1360681597@qq.com> Date: Fri, 11 Aug 2023 17:55:56 +0800 Subject: [PATCH 1/9] =?UTF-8?q?=E4=BB=8Epod=20annotations=E8=8E=B7?= =?UTF-8?q?=E5=8F=96=E4=BF=A1=E6=81=AF=E6=8C=82=E8=BD=BDdevice=20new?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- runtime/main.go | 56 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/runtime/main.go b/runtime/main.go index b8775d1..b08c628 100644 --- a/runtime/main.go +++ b/runtime/main.go @@ -17,10 +17,12 @@ package main import ( "context" + "crypto/tls" "encoding/json" "fmt" "io/ioutil" "log" + "net/http" "os" "os/exec" "path" @@ -577,6 +579,10 @@ func modifySpecFile(path string) error { return fmt.Errorf("failed to add device to env: %v", err) } + if err = getPodDevice(); err != nil { + return fmt.Errorf("failed to get pod device: %v", err) + } + addEnvToDevicePlugin(&spec) jsonOutput, err := json.Marshal(spec) @@ -650,3 +656,53 @@ func main() { log.Fatal(err) } } + +func getPodDevice() error { + certFile := "/etc/kubernetes/pki/apiserver-kubelet-client.crt" + keyFile := "/etc/kubernetes/pki/apiserver-kubelet-client.key" + kubeletUrl := "https://127.0.0.1:10250/" + podsUrlPath := "pods" + + cert, err := tls.LoadX509KeyPair(certFile, keyFile) + if err != nil { + hwlog.RunLog.Errorf("LoadX509KeyPair failed: %#v", err) + return err + } + + t := &http.Transport{ + TLSClientConfig: &tls.Config{ + Certificates: []tls.Certificate{cert}, + InsecureSkipVerify: true, + }, + } + + client := &http.Client{Transport: t} + + resp, err := client.Get(kubeletUrl + podsUrlPath) + if err != nil { + hwlog.RunLog.Errorf("http get failed: %#v", err) + return err + } + defer resp.Body.Close() + + body, err := ioutil.ReadAll(resp.Body) + if err != nil { + hwlog.RunLog.Errorf("ReadAll resp.Body failed: %#v", err) + return err + } + hwlog.RunLog.Infof("resp.Status: %#v, resp.Body: 
%#v", resp.Status, body) + + jsonFile, err := os.Create("pod_list.json") + if err != nil { + hwlog.RunLog.Errorf("create pod_list.json failed: %#v", err) + return err + } + defer jsonFile.Close() + + if _, err := jsonFile.Write(body); err != nil { + hwlog.RunLog.Errorf("write pod_list.json failed: %#v", err) + return err + } + + return nil +} -- Gitee From b0f609f7e76555bbd2edf63b4d74d3d9ee0c2bcc Mon Sep 17 00:00:00 2001 From: funnyfunny <1360681597@qq.com> Date: Fri, 11 Aug 2023 18:03:24 +0800 Subject: [PATCH 2/9] =?UTF-8?q?=E4=BB=8Epod=20annotations=E8=8E=B7?= =?UTF-8?q?=E5=8F=96=E4=BF=A1=E6=81=AF=E6=8C=82=E8=BD=BDdevice=20new1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- runtime/main.go | 1 + 1 file changed, 1 insertion(+) diff --git a/runtime/main.go b/runtime/main.go index b08c628..2c068ed 100644 --- a/runtime/main.go +++ b/runtime/main.go @@ -704,5 +704,6 @@ func getPodDevice() error { return err } + hwlog.RunLog.Errorf("getPodDevice success: %#v", err) return nil } -- Gitee From 8f226ab5c6cf492b4d981a3bad9fe92cb73d9e24 Mon Sep 17 00:00:00 2001 From: funnyfunny <1360681597@qq.com> Date: Sat, 12 Aug 2023 10:57:36 +0800 Subject: [PATCH 3/9] =?UTF-8?q?=E4=BB=8Epod=20annotations=E8=8E=B7?= =?UTF-8?q?=E5=8F=96=E4=BF=A1=E6=81=AF=E6=8C=82=E8=BD=BDdevice=20new=202?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- runtime/main.go | 1 + 1 file changed, 1 insertion(+) diff --git a/runtime/main.go b/runtime/main.go index 2c068ed..8ad236d 100644 --- a/runtime/main.go +++ b/runtime/main.go @@ -670,6 +670,7 @@ func getPodDevice() error { } t := &http.Transport{ + Proxy: nil, TLSClientConfig: &tls.Config{ Certificates: []tls.Certificate{cert}, InsecureSkipVerify: true, -- Gitee From b747fe57d5331108d188cd08c3bee48178e522c4 Mon Sep 17 00:00:00 2001 From: funnyfunny <1360681597@qq.com> Date: Sat, 12 Aug 2023 15:02:06 +0800 Subject: [PATCH 4/9] 
=?UTF-8?q?=E4=BB=8Epod=20annotations=E8=8E=B7?= =?UTF-8?q?=E5=8F=96=E4=BF=A1=E6=81=AF=E6=8C=82=E8=BD=BDdevice=20new=203?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- runtime/main.go | 115 +++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 95 insertions(+), 20 deletions(-) diff --git a/runtime/main.go b/runtime/main.go index 8ad236d..5da7806 100644 --- a/runtime/main.go +++ b/runtime/main.go @@ -19,6 +19,7 @@ import ( "context" "crypto/tls" "encoding/json" + "errors" "fmt" "io/ioutil" "log" @@ -36,6 +37,7 @@ import ( "github.com/containerd/containerd/oci" "github.com/opencontainers/runtime-spec/specs-go" "huawei.com/npu-exporter/v5/common-utils/hwlog" + "k8s.io/api/core/v1" "main/dcmi" "mindxcheckutils" @@ -67,6 +69,8 @@ var ( hookDefaultFile = hookDefaultFilePath dockerRuncName = dockerRuncFile runcName = runcFile + + notMatchError = errors.New("container not match pod or pod not has huawei.com/Ascend910 annotation") ) const ( @@ -294,6 +298,26 @@ func removeDuplication(devices []int) []int { return list } +func parseAnnotationDevices(annotationDevices string) ([]int, error) { + devices := make([]int, 0) + + for _, d := range strings.Split(annotationDevices, ",") { + borders := strings.Split(d, Ascend910+"-") + if len(borders) != borderNum || borders[0] != "" { + return nil, fmt.Errorf("invalid device range: %s", d) + } + deviceID, err := strconv.Atoi(borders[1]) + if err != nil { + return nil, fmt.Errorf("invalid device ID: %s", d) + } + + devices = append(devices, deviceID) + } + + sort.Slice(devices, func(i, j int) bool { return devices[i] < devices[j] }) + return removeDuplication(devices), nil +} + func parseDevices(visibleDevices string) ([]int, error) { devices := make([]int, 0) const maxDevice = 128 @@ -474,6 +498,42 @@ func addManagerDevice(spec *specs.Spec) error { } func addDevice(spec *specs.Spec) error { + // 获取对应pod annotation中的设备信息 + annotationDevices, err := getDeviceFromPod(spec) + if err != nil && err
!= notMatchError { + return fmt.Errorf("failed to get pod device: %#v", err) + } + + // 如果没有匹配到pod或annotation,则通过环境变量挂载设备 + if annotationDevices == "" { + if err = addDeviceFromEnv(spec); err != nil { + return fmt.Errorf("failed to add device to env: %#v", err) + } + return nil + } + + // 如果对应pod annotation中的设备信息存在,则用这个信息挂载设备 + devices, err := parseAnnotationDevices(annotationDevices) + if err != nil { + return fmt.Errorf("failed to parse device: %#v", err) + } + hwlog.RunLog.Infof("devices is: %#v", devices) + deviceName := davinciName + for _, deviceId := range devices { + dPath := devicePath + deviceName + strconv.Itoa(deviceId) + if err = addDeviceToSpec(spec, dPath, deviceName); err != nil { + return fmt.Errorf("failed to add davinci device to spec: %#v", err) + } + } + + if err = addManagerDevice(spec); err != nil { + return fmt.Errorf("failed to add Manager device to spec: %#v", err) + } + + return nil +} + +func addDeviceFromEnv(spec *specs.Spec) error { visibleDevices := getValueByKey(spec.Process.Env, ascendVisibleDevices) if visibleDevices == "" { return nil @@ -579,10 +639,6 @@ func modifySpecFile(path string) error { return fmt.Errorf("failed to add device to env: %v", err) } - if err = getPodDevice(); err != nil { - return fmt.Errorf("failed to get pod device: %v", err) - } - addEnvToDevicePlugin(&spec) jsonOutput, err := json.Marshal(spec) @@ -657,54 +713,73 @@ func main() { } } -func getPodDevice() error { +func getDeviceFromPod(spec *specs.Spec) (string, error) { + // 只有apiserver会访问kubelet的https api接口,所以使用apiserver的客户端证书;证书需要从master节点拷贝到worker节点 certFile := "/etc/kubernetes/pki/apiserver-kubelet-client.crt" keyFile := "/etc/kubernetes/pki/apiserver-kubelet-client.key" kubeletUrl := "https://127.0.0.1:10250/" podsUrlPath := "pods" + npu910CardName := "huawei.com/Ascend910" cert, err := tls.LoadX509KeyPair(certFile, keyFile) if err != nil { hwlog.RunLog.Errorf("LoadX509KeyPair failed: %#v", err) - return err + return "", err } - t := &http.Transport{ 
- Proxy: nil, - TLSClientConfig: &tls.Config{ - Certificates: []tls.Certificate{cert}, - InsecureSkipVerify: true, + // 构造带客户端证书的http客户端 + client := &http.Client{ + Transport: &http.Transport{ + Proxy: nil, // 禁用代理 + TLSClientConfig: &tls.Config{ + Certificates: []tls.Certificate{cert}, + InsecureSkipVerify: true, // kubelet 是自签名ca证书,apiserver也未校验kubelet服务端证书,所以这里不校验 + }, }, } - client := &http.Client{Transport: t} - + // 向kubelet服务端请求获取pod list resp, err := client.Get(kubeletUrl + podsUrlPath) if err != nil { hwlog.RunLog.Errorf("http get failed: %#v", err) - return err + return "", err } defer resp.Body.Close() body, err := ioutil.ReadAll(resp.Body) if err != nil { hwlog.RunLog.Errorf("ReadAll resp.Body failed: %#v", err) - return err + return "", err } - hwlog.RunLog.Infof("resp.Status: %#v, resp.Body: %#v", resp.Status, body) + hwlog.RunLog.Infof("get pod list success, resp.Status: %#v", resp.Status) + // 把数据保存到pod_list.json里,便于分析,交付时可删除这段代码 jsonFile, err := os.Create("pod_list.json") if err != nil { hwlog.RunLog.Errorf("create pod_list.json failed: %#v", err) - return err + return "", err } defer jsonFile.Close() if _, err := jsonFile.Write(body); err != nil { hwlog.RunLog.Errorf("write pod_list.json failed: %#v", err) - return err + return "", err } - hwlog.RunLog.Errorf("getPodDevice success: %#v", err) - return nil + // 遍历pod list,找到此容器 + var podList v1.PodList + if err := json.Unmarshal(body, &podList); err != nil { + hwlog.RunLog.Errorf("unmarshal body failed: %#v", err) + return "", err + } + + for _, pod := range podList.Items { + if pod.ObjectMeta.Name == spec.Hostname { + if value, ok := pod.ObjectMeta.Annotations[npu910CardName]; ok { + return value, nil + } + break + } + } + return "", notMatchError } -- Gitee From b53616e81caac87e30571090eb1823482f0032aa Mon Sep 17 00:00:00 2001 From: funnyfunny <1360681597@qq.com> Date: Sat, 12 Aug 2023 15:14:49 +0800 Subject: [PATCH 5/9] =?UTF-8?q?=E4=BB=8Epod=20annotations=E8=8E=B7?= 
=?UTF-8?q?=E5=8F=96=E4=BF=A1=E6=81=AF=E6=8C=82=E8=BD=BDdevice=20new=204?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- runtime/main.go | 1 + 1 file changed, 1 insertion(+) diff --git a/runtime/main.go b/runtime/main.go index 5da7806..c100345 100644 --- a/runtime/main.go +++ b/runtime/main.go @@ -506,6 +506,7 @@ func addDevice(spec *specs.Spec) error { // 如果没有匹配到pod或annotation,则通过环境变量挂载设备 if annotationDevices == "" { + hwlog.RunLog.Info("add devices from env variable") if err = addDeviceFromEnv(spec); err != nil { return fmt.Errorf("failed to add device to env: %#v", err) } -- Gitee From 522ee8e4a6a1645f7b4dd4dd1d697876c597fd68 Mon Sep 17 00:00:00 2001 From: funnyfunny <1360681597@qq.com> Date: Sat, 12 Aug 2023 15:21:14 +0800 Subject: [PATCH 6/9] =?UTF-8?q?=E4=BB=8Epod=20annotations=E8=8E=B7?= =?UTF-8?q?=E5=8F=96=E4=BF=A1=E6=81=AF=E6=8C=82=E8=BD=BDdevice=20new=205?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- runtime/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime/main.go b/runtime/main.go index c100345..fca67e8 100644 --- a/runtime/main.go +++ b/runtime/main.go @@ -518,7 +518,7 @@ func addDevice(spec *specs.Spec) error { if err != nil { return fmt.Errorf("failed to parse device: %#v", err) } - hwlog.RunLog.Infof("devices is: %#v", devices) + hwlog.RunLog.Infof("annotation devices is: %#v", devices) deviceName := davinciName for _, deviceId := range devices { dPath := devicePath + deviceName + strconv.Itoa(deviceId) -- Gitee From 817e10af4c79759e69bb9c955d9b9a62ab436a1d Mon Sep 17 00:00:00 2001 From: funnyfunny <1360681597@qq.com> Date: Sat, 12 Aug 2023 15:31:05 +0800 Subject: [PATCH 7/9] =?UTF-8?q?=E4=BB=8Epod=20annotations=E8=8E=B7?= =?UTF-8?q?=E5=8F=96=E4=BF=A1=E6=81=AF=E6=8C=82=E8=BD=BDdevice=20new=206?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- runtime/main.go | 3 ++- 1 file 
changed, 2 insertions(+), 1 deletion(-) diff --git a/runtime/main.go b/runtime/main.go index fca67e8..44bdc2f 100644 --- a/runtime/main.go +++ b/runtime/main.go @@ -501,7 +501,8 @@ func addDevice(spec *specs.Spec) error { // 获取对应pod annotation中的设备信息 annotationDevices, err := getDeviceFromPod(spec) if err != nil && err != notMatchError { - return fmt.Errorf("failed to get pod device: %#v", err) + hwlog.RunLog.Errorf("getDeviceFromPod failed: %#v", err) + //return fmt.Errorf("failed to get pod device: %#v", err) } // 如果没有匹配到pod或annotation,则通过环境变量挂载设备 -- Gitee From 1610d20826f64f04b7b5ac1ebf2894ddb1869914 Mon Sep 17 00:00:00 2001 From: funnyfunny <1360681597@qq.com> Date: Tue, 15 Aug 2023 09:13:49 +0800 Subject: [PATCH 8/9] =?UTF-8?q?=E4=BB=8Epod=20annotations=E8=8E=B7?= =?UTF-8?q?=E5=8F=96=E4=BF=A1=E6=81=AF=E6=8C=82=E8=BD=BDdevice=20new=207?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- runtime/main.go | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/runtime/main.go b/runtime/main.go index 44bdc2f..25cca26 100644 --- a/runtime/main.go +++ b/runtime/main.go @@ -755,19 +755,6 @@ func getDeviceFromPod(spec *specs.Spec) (string, error) { } hwlog.RunLog.Infof("get pod list success, resp.Status: %#v", resp.Status) - // 把数据保存到pod_list.json里,便于分析,交付时可删除这段代码 - jsonFile, err := os.Create("pod_list.json") - if err != nil { - hwlog.RunLog.Errorf("create pod_list.json failed: %#v", err) - return "", err - } - defer jsonFile.Close() - - if _, err := jsonFile.Write(body); err != nil { - hwlog.RunLog.Errorf("write pod_list.json failed: %#v", err) - return "", err - } - // 遍历pod list,找到此容器 var podList v1.PodList if err := json.Unmarshal(body, &podList); err != nil { -- Gitee From ba9d51dfd5e35d628a0967ea715c3c39349a5b38 Mon Sep 17 00:00:00 2001 From: funnyfunny <1360681597@qq.com> Date: Tue, 15 Aug 2023 09:59:36 +0800 Subject: [PATCH 9/9] =?UTF-8?q?=E4=BB=8Epod=20annotations=E8=8E=B7?= 
=?UTF-8?q?=E5=8F=96=E4=BF=A1=E6=81=AF=E6=8C=82=E8=BD=BDdevice=20new=208?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- runtime/main.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runtime/main.go b/runtime/main.go index 25cca26..0b66449 100644 --- a/runtime/main.go +++ b/runtime/main.go @@ -501,8 +501,8 @@ func addDevice(spec *specs.Spec) error { // 获取对应pod annotation中的设备信息 annotationDevices, err := getDeviceFromPod(spec) if err != nil && err != notMatchError { + // 报错不可直接返回,记录日志即可 hwlog.RunLog.Errorf("getDeviceFromPod failed: %#v", err) - //return fmt.Errorf("failed to get pod device: %#v", err) } // 如果没有匹配到pod或annotation,则通过环境变量挂载设备 @@ -771,4 +771,4 @@ func getDeviceFromPod(spec *specs.Spec) (string, error) { } } return "", notMatchError -} +} \ No newline at end of file -- Gitee