From 7ec08c0e99f6bda1cfe5164698d285561f7b92c4 Mon Sep 17 00:00:00 2001 From: Oleg Zhurakivskyy Date: Mon, 26 Aug 2024 16:34:26 +0300 Subject: [PATCH] nfd-gc: Remove stale NRT objects Remove stale NRT objects whose creator pod does not exist anymore. Signed-off-by: Oleg Zhurakivskyy --- .../templates/topologyupdater.yaml | 4 ++ pkg/nfd-gc/nfd-gc.go | 29 +++++++++- pkg/nfd-gc/nfd-gc_test.go | 56 ++++++++++++++++--- .../nfd-topology-updater.go | 3 + test/e2e/utils/pod/pod.go | 8 +++ 5 files changed, 92 insertions(+), 8 deletions(-) diff --git a/deployment/helm/node-feature-discovery/templates/topologyupdater.yaml b/deployment/helm/node-feature-discovery/templates/topologyupdater.yaml index 222e729806..0ecb799054 100644 --- a/deployment/helm/node-feature-discovery/templates/topologyupdater.yaml +++ b/deployment/helm/node-feature-discovery/templates/topologyupdater.yaml @@ -57,6 +57,10 @@ spec: valueFrom: fieldRef: fieldPath: status.hostIP + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name {{- with .Values.topologyUpdater.extraEnvs }} {{- toYaml . | nindent 8 }} {{- end}} diff --git a/pkg/nfd-gc/nfd-gc.go b/pkg/nfd-gc/nfd-gc.go index c3dc8dae37..255c8d9f8c 100644 --- a/pkg/nfd-gc/nfd-gc.go +++ b/pkg/nfd-gc/nfd-gc.go @@ -19,6 +19,7 @@ package nfdgarbagecollector import ( "context" "fmt" + "strings" "time" topologyv1alpha2 "github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/apis/topology/v1alpha2" @@ -34,6 +35,7 @@ import ( "k8s.io/klog/v2" nfdv1alpha1 "sigs.k8s.io/node-feature-discovery/api/nfd/v1alpha1" + "sigs.k8s.io/node-feature-discovery/pkg/nfd-topology-updater" "sigs.k8s.io/node-feature-discovery/pkg/utils" "sigs.k8s.io/node-feature-discovery/pkg/version" ) @@ -42,6 +44,8 @@ var ( gvrNF = nfdv1alpha1.SchemeGroupVersion.WithResource("nodefeatures") gvrNRT = topologyv1alpha2.SchemeGroupVersion.WithResource("noderesourcetopologies") gvrNode = corev1.SchemeGroupVersion.WithResource("nodes") + gvrPod = corev1.SchemeGroupVersion.WithResource("pods") + gcNRTs = map[string]string{} ) // Args are the command line arguments @@ -190,7 +194,30 @@ func (n *nfdGarbageCollector) garbageCollect() { // Handle NodeResourceTopology objects listAndHandle(gvrNRT, func(meta metav1.PartialObjectMetadata) { - if !nodeNames.Has(meta.Name) { + deleteNRT := false + + if objRef, ok := meta.Annotations[nfdtopologyupdater.NRTOwnerPodAnnotation]; ok { + if s := strings.Split(objRef, "/"); len(s) == 2 { + pod := s[1] + _, err := n.client.Resource(gvrPod).Get(context.TODO(), pod, metav1.GetOptions{}) + if errors.IsNotFound(err) { + if val, ok := gcNRTs[meta.Name]; !ok { + gcNRTs[meta.Name] = objRef + } else { + if val != objRef { + gcNRTs[meta.Name] = objRef + } else { + delete(gcNRTs, meta.Name) + deleteNRT = true + } + } + } else if err != nil { + klog.ErrorS(err, "failed to get Pod object") + } + } + } + + if !nodeNames.Has(meta.Name) || deleteNRT { n.deleteNRT(meta.Name) } }) diff --git a/pkg/nfd-gc/nfd-gc_test.go b/pkg/nfd-gc/nfd-gc_test.go index 7c912f4958..bcdc226829 100644 --- a/pkg/nfd-gc/nfd-gc_test.go +++ b/pkg/nfd-gc/nfd-gc_test.go @@ -36,7 +36,7 @@ import ( func TestNRTGC(t *testing.T) { Convey("When theres is old NRT ", t, func() { - gc := newMockGC(nil, []string{"node1"}) + gc := newMockGC(nil, []string{"node1"}, []string{"pod1"}) errChan := make(chan error) go func() { errChan <- gc.Run() }() @@ -47,7 +47,7 @@ func TestNRTGC(t *testing.T) { So(<-errChan, ShouldBeNil) }) Convey("When theres is one old NRT and one up to date", t, func() { - gc := newMockGC([]string{"node1"}, []string{"node1", "node2"}) + gc := newMockGC([]string{"node1"}, []string{"node1", "node2"}, []string{"pod1", "pod2"}) errChan := make(chan error) go func() { errChan <- gc.Run() }() @@ -58,7 +58,7 @@ func TestNRTGC(t *testing.T) { So(<-errChan, ShouldBeNil) }) Convey("Should react to delete event", t, func() { - gc := newMockGC([]string{"node1", "node2"}, []string{"node1", "node2"}) + gc := newMockGC([]string{"node1", "node2"}, []string{"node1", "node2"}, []string{"pod1", "pod2"}) errChan := make(chan error) go func() { errChan <- gc.Run() }() @@ -70,7 +70,7 @@ func TestNRTGC(t *testing.T) { So(waitForNRT(gc.client, "node2"), ShouldBeTrue) }) Convey("periodic GC should remove obsolete NRT", t, func() { - gc := newMockGC([]string{"node1", "node2"}, []string{"node1", "node2"}) + gc := newMockGC([]string{"node1", "node2"}, []string{"node1", "node2"}, []string{"pod1", "pod2"}) // Override period to run fast gc.args.GCPeriod = 100 * time.Millisecond @@ -85,16 +85,38 @@ func TestNRTGC(t *testing.T) { So(waitForNRT(gc.client, "node1", "node2"), ShouldBeTrue) }) + Convey("periodic GC should remove stale NRT", t, func() { + gc := newMockGC([]string{"node1", "node2"}, []string{"node1", "node2"}, []string{"pod1", "pod2"}) + // Override period to run fast + gc.args.GCPeriod = 100 * time.Millisecond + + nrt := createPartialObjectMetadata("topology.node.k8s.io/v1alpha2", "NodeResourceTopology", "", "not-existing") + nrt.ObjectMeta.Annotations = map[string]string{"owner-pod": "pod4"} + + errChan := make(chan error) + go func() { errChan <- gc.Run() }() + + gvr := topologyv1alpha2.SchemeGroupVersion.WithResource("noderesourcetopologies") + _, err := gc.client.Resource(gvr).(fake.MetadataClient).CreateFake(nrt, metav1.CreateOptions{}) + So(err, ShouldBeNil) + + So(waitForNrtPodsGC(gc.client, "pod1", "pod2"), ShouldBeTrue) + }) } -func newMockGC(nodes, nrts []string) *mockGC { +func newMockGC(nodes, nrts, pods []string) *mockGC { // Create fake objects objs := []runtime.Object{} + for _, name := range pods { + objs = append(objs, createPartialObjectMetadata("v1", "Pod", "", name)) + } for _, name := range nodes { objs = append(objs, createPartialObjectMetadata("v1", "Node", "", name)) } - for _, name := range nrts { - objs = append(objs, createPartialObjectMetadata("topology.node.k8s.io/v1alpha2", "NodeResourceTopology", "", name)) + for i, name := range nrts { + nrt := createPartialObjectMetadata("topology.node.k8s.io/v1alpha2", "NodeResourceTopology", "", name) + nrt.ObjectMeta.Annotations = map[string]string{"owner-pod": pods[i]} + objs = append(objs, nrt) } scheme := fake.NewTestScheme() @@ -151,3 +173,23 @@ func waitForNRT(cli metadataclient.Interface, names ...string) bool { } return false } + +func waitForNrtPodsGC(cli metadataclient.Interface, pods ...string) bool { + podsSet := sets.NewString(pods...) + gvr := topologyv1alpha2.SchemeGroupVersion.WithResource("noderesourcetopologies") + for i := 0; i < 2; i++ { + rsp, err := cli.Resource(gvr).List(context.TODO(), metav1.ListOptions{}) + So(err, ShouldBeNil) + + nrtPods := sets.NewString() + for _, nrt := range rsp.Items { + nrtPods.Insert(nrt.Annotations["owner-pod"]) + } + + if nrtPods.Equal(podsSet) { + return true + } + time.Sleep(1 * time.Second) + } + return false +} diff --git a/pkg/nfd-topology-updater/nfd-topology-updater.go b/pkg/nfd-topology-updater/nfd-topology-updater.go index e65edbea2b..cf1c64d339 100644 --- a/pkg/nfd-topology-updater/nfd-topology-updater.go +++ b/pkg/nfd-topology-updater/nfd-topology-updater.go @@ -52,6 +52,7 @@ const ( TopologyManagerPolicyAttributeName = "topologyManagerPolicy" // TopologyManagerScopeAttributeName represents an attribute which defines Topology Manager Policy Scope TopologyManagerScopeAttributeName = "topologyManagerScope" + NRTOwnerPodAnnotation = "nfd.node.kubernetes.io/owner-pod" ) // Args are the command line arguments @@ -294,6 +295,7 @@ func (w *nfdTopologyUpdater) updateNodeResourceTopology(zoneInfo v1alpha2.ZoneLi nrtNew := v1alpha2.NodeResourceTopology{ ObjectMeta: metav1.ObjectMeta{ Name: w.nodeName, + Annotations: map[string]string{NRTOwnerPodAnnotation: fmt.Sprintf("%s/%s", w.kubernetesNamespace, os.Getenv("POD_NAME"))}, OwnerReferences: w.ownerRefs, }, Zones: zoneInfo, @@ -317,6 +319,7 @@ func (w *nfdTopologyUpdater) updateNodeResourceTopology(zoneInfo v1alpha2.ZoneLi nrtMutated := nrt.DeepCopy() nrtMutated.Zones = zoneInfo nrtMutated.OwnerReferences = w.ownerRefs + nrtMutated.Annotations[NRTOwnerPodAnnotation] = fmt.Sprintf("%s/%s", w.kubernetesNamespace, os.Getenv("POD_NAME")) attributes := scanResponse.Attributes diff --git a/test/e2e/utils/pod/pod.go b/test/e2e/utils/pod/pod.go index 56b421ad38..abbef663c8 100644 --- a/test/e2e/utils/pod/pod.go +++ b/test/e2e/utils/pod/pod.go @@ -417,6 +417,14 @@ func NFDTopologyUpdaterSpec(kc utils.KubeletConfig, opts ...SpecOption) *corev1. }, }, }, + { + Name: "POD_NAME", + ValueFrom: &corev1.EnvVarSource{ + FieldRef: &corev1.ObjectFieldSelector{ + FieldPath: "metadata.name", + }, + }, + }, }, SecurityContext: &corev1.SecurityContext{ Capabilities: &corev1.Capabilities{