
Commit b83358e

Include initContainers when calculating pod overhead
2i2c-org#3569 changed the cryptnono daemonset so that its init containers, as well as its main container, have resource requests. While working on 2i2c-org#3566, I noticed this was generating wrong choices: the overhead was being calculated too small. We were intentionally ignoring init containers when calculating overhead, but it turns out the scheduler and the cluster autoscaler both take them into consideration.

The effective resource request for a pod is the higher of the resource requests of its containers *or* of its init containers. This ensures that a pod whose init containers request more than its containers (like our cryptnono pod!) will actually run. This is documented at https://kubernetes.io/docs/concepts/workloads/pods/init-containers/#resource-sharing-within-containers, and implemented in Kubernetes itself at https://github.com/kubernetes/kubernetes/blob/9bd0ef5f173de3cc2d1d629a4aee499d53690aee/pkg/api/v1/resource/helpers.go#L50 (this is the library code the cluster autoscaler uses).

This PR updates the two places we currently have that calculate effective resource requests (I assume these will eventually be merged into one; I haven't kept up with the team's work here last quarter). I've also updated the node-capacity-info.json file, which is what the generator script seems to use right now.
1 parent: e6ed45a
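To make that rule concrete, here is a minimal sketch, not code from this repository: a hypothetical effective_pod_request helper that applies the documented calculation to a single resource. parse_quantity is the quantity parser from the kubernetes Python client, which the scripts changed below also use; the example pod spec and its numbers are made up for illustration.

from kubernetes.utils import parse_quantity


def effective_pod_request(pod_spec, resource):
    """Return a pod's effective request for one resource ("cpu" or "memory").

    Per the Kubernetes docs quoted in the diffs below, this is the higher of:
      - the sum of all app containers' requests, and
      - the highest request among the init containers.
    """

    def request(container):
        requests = container.get("resources", {}).get("requests", {})
        return parse_quantity(requests.get(resource, 0))

    app_total = sum(request(c) for c in pod_spec.get("containers", []))
    init_max = max(
        (request(c) for c in pod_spec.get("initContainers", [])),
        default=parse_quantity(0),
    )
    return max(app_total, init_max)


# Hypothetical numbers: an init container that requests more memory than the
# app container (the cryptnono situation) determines the pod's effective request.
pod_spec = {
    "initContainers": [{"resources": {"requests": {"memory": "256Mi"}}}],
    "containers": [{"resources": {"requests": {"memory": "64Mi"}}}],
}
print(effective_pod_request(pod_spec, "memory"))  # 268435456 bytes, i.e. 256Mi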

File tree: 4 files changed (+72, -23 lines)


deployer/commands/generate/resource_allocation/daemonset_requests.py

Lines changed: 36 additions & 9 deletions
@@ -64,23 +64,50 @@ def get_daemon_sets_requests():
     info = []
     for ds in daemon_sets:
         name = ds["metadata"]["name"]
-        req_mem = req_cpu = lim_mem = lim_cpu = 0
+        # From https://kubernetes.io/docs/concepts/workloads/pods/init-containers/#resource-sharing-within-containers
+        # > - The highest of any particular resource request or limit defined on
+        # >   all init containers is the effective init request/limit. If any
+        # >   resource has no resource limit specified this is considered as the
+        # >   highest limit.
+        # > - The Pod's effective request/limit for a resource is the higher of:
+        # >   - the sum of all app containers request/limit for a resource
+        # >   - the effective init request/limit for a resource
+        #
+        # So we have to calculate the requests of the init containers and containers separately,
+        # and take the max as the effective request / limit
+
+        container_req_mem = (
+            container_req_cpu
+        ) = container_lim_mem = container_lim_cpu = 0
+        init_container_req_mem = (
+            init_container_req_cpu
+        ) = init_container_lim_mem = init_container_lim_cpu = 0
+
         for c in ds["spec"]["template"]["spec"]["containers"]:
             resources = c.get("resources", {})
             requests = resources.get("requests", {})
             limits = resources.get("limits", {})
-            req_mem += parse_quantity(requests.get("memory", 0))
-            lim_mem += parse_quantity(limits.get("memory", 0))
-            req_cpu += parse_quantity(requests.get("cpu", 0))
-            lim_cpu += parse_quantity(limits.get("cpu", 0))
+            container_req_mem += parse_quantity(requests.get("memory", 0))
+            container_lim_mem += parse_quantity(limits.get("memory", 0))
+            container_req_cpu += parse_quantity(requests.get("cpu", 0))
+            container_lim_cpu += parse_quantity(limits.get("cpu", 0))
+
+        for c in ds["spec"]["template"]["spec"].get("initContainers", []):
+            resources = c.get("resources", {})
+            requests = resources.get("requests", {})
+            limits = resources.get("limits", {})
+            init_container_req_mem += parse_quantity(requests.get("memory", 0))
+            init_container_lim_mem += parse_quantity(limits.get("memory", 0))
+            init_container_req_cpu += parse_quantity(requests.get("cpu", 0))
+            init_container_lim_cpu += parse_quantity(limits.get("cpu", 0))

         info.append(
             {
                 "name": name,
-                "cpu_request": float(req_cpu),
-                "cpu_limit": float(lim_cpu),
-                "memory_request": int(req_mem),
-                "memory_limit": int(lim_mem),
+                "cpu_request": float(max(container_req_cpu, init_container_req_cpu)),
+                "cpu_limit": float(max(container_lim_cpu, init_container_lim_cpu)),
+                "memory_request": int(max(container_req_mem, init_container_req_mem)),
+                "memory_limit": int(max(container_lim_mem, init_container_lim_mem)),
             }
         )
deployer/commands/generate/resource_allocation/daemonset_requests.yaml

Lines changed: 1 addition & 1 deletion
@@ -134,7 +134,7 @@ eks:
     other_daemon_sets: ""
     cpu_requests: 170m
     memory_requests: 250Mi
-    k8s_version: v1.25.12-eks-2d98532
+    k8s_version: v1.27.8-eks-8cb36c9
   openscapes:
     requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
     other_daemon_sets: ""

deployer/commands/generate/resource_allocation/node-capacity-info.json

Lines changed: 4 additions & 4 deletions
@@ -55,12 +55,12 @@
       "memory": 130451771392
     },
     "measured_overhead": {
-      "cpu": 0.165,
-      "memory": 157286400
+      "cpu": 0.17,
+      "memory": 262144000
     },
     "available": {
-      "cpu": 15.725,
-      "memory": 130294484992
+      "cpu": 15.72,
+      "memory": 130189627392
     }
   },
   "n2-highmem-32": {

deployer/commands/generate/resource_allocation/update_nodeinfo.py

Lines changed: 31 additions & 9 deletions
@@ -106,20 +106,42 @@ def get_node_capacity_info(instance_type: str):
     mem_available = mem_allocatable

     for p in pods:
-        mem_request = 0
-        cpu_request = 0
-        # Iterate through all the containers in the pod, and count the memory & cpu requests
-        # they make. We don't count initContainers' requests as they don't overlap with the
-        # container requests at any point.
+        # From https://kubernetes.io/docs/concepts/workloads/pods/init-containers/#resource-sharing-within-containers
+        # > - The highest of any particular resource request or limit defined on
+        # >   all init containers is the effective init request/limit. If any
+        # >   resource has no resource limit specified this is considered as the
+        # >   highest limit.
+        # > - The Pod's effective request/limit for a resource is the higher of:
+        # >   - the sum of all app containers request/limit for a resource
+        # >   - the effective init request/limit for a resource
+        #
+        # So we have to calculate the requests of the init containers and containers separately,
+        # and take the max as the effective request / limit
+        container_cpu_request = container_mem_request = 0
+        init_container_cpu_request = init_container_mem_request = 0
+
         for c in p["spec"]["containers"]:
-            mem_request += parse_quantity(
+            container_mem_request += parse_quantity(
+                c.get("resources", {}).get("requests", {}).get("memory", "0")
+            )
+            container_cpu_request += parse_quantity(
+                c.get("resources", {}).get("requests", {}).get("cpu", "0")
+            )
+
+        for c in p["spec"].get("initContainers", []):
+            init_container_mem_request += parse_quantity(
                 c.get("resources", {}).get("requests", {}).get("memory", "0")
             )
-            cpu_request += parse_quantity(
+            init_container_cpu_request += parse_quantity(
                 c.get("resources", {}).get("requests", {}).get("cpu", "0")
             )
-        cpu_available -= cpu_request
-        mem_available -= mem_request
+
+        print(
+            p["metadata"]["name"],
+            max(init_container_mem_request, container_mem_request),
+        )
+        cpu_available -= max(container_cpu_request, init_container_cpu_request)
+        mem_available -= max(container_mem_request, init_container_mem_request)

     return {
         # CPU units are in fractions, while memory units are bytes
