From 1932fd1cfa11af8ee91953e9815389d1096e1ddc Mon Sep 17 00:00:00 2001
From: Erik Sundell <erik.i.sundell@gmail.com>
Date: Tue, 24 Sep 2024 14:56:44 +0200
Subject: [PATCH] victor: image choices and gpu machine

---
 config/clusters/victor/common.values.yaml  | 33 +++++++++++
 config/clusters/victor/staging.values.yaml | 64 +++++++++++++++-------
 eksctl/victor.jsonnet                      | 12 ++++
 3 files changed, 90 insertions(+), 19 deletions(-)

diff --git a/config/clusters/victor/common.values.yaml b/config/clusters/victor/common.values.yaml
index f88b79c5d4..eb2bcb9615 100644
--- a/config/clusters/victor/common.values.yaml
+++ b/config/clusters/victor/common.values.yaml
@@ -63,6 +63,23 @@ basehub:
             mem_guarantee: 6.5G
             node_selector:
               node.kubernetes.io/instance-type: m5.large
+          profile_options: &profile_options
+            image:
+              display_name: Image
+              choices:
+                a-victor-notebook:
+                  display_name: Victor Notebook
+                  default: true
+                  kubespawner_override:
+                    image: quay.io/volcanocyber/victor-notebook:a045ad3616d1
+                b-pytorch-notebook:
+                  display_name: Pangeo ML Notebook (Pytorch)
+                  kubespawner_override:
+                    image: "quay.io/pangeo/pytorch-notebook:2024.09.11"
+                c-ml-notebook:
+                  display_name: Pangeo ML Notebook (Tensorflow)
+                  kubespawner_override:
+                    image: "quay.io/pangeo/ml-notebook:2024.09.11"
         - display_name: "Medium: m5.xlarge"
           description: "~4 CPU, ~15G RAM"
           kubespawner_override:
@@ -70,6 +87,7 @@ basehub:
             mem_guarantee: 12G
             node_selector:
               node.kubernetes.io/instance-type: m5.xlarge
+          profile_options: *profile_options
         - display_name: "Large: m5.2xlarge"
           description: "~8 CPU, ~30G RAM"
           kubespawner_override:
@@ -77,6 +95,7 @@ basehub:
             mem_guarantee: 25G
             node_selector:
               node.kubernetes.io/instance-type: m5.2xlarge
+          profile_options: *profile_options
         - display_name: "Huge: m5.8xlarge"
           description: "~16 CPU, ~60G RAM"
           kubespawner_override:
@@ -84,6 +103,20 @@ basehub:
             mem_guarantee: 50G
             node_selector:
               node.kubernetes.io/instance-type: m5.8xlarge
+          profile_options: *profile_options
+        - display_name: NVIDIA Tesla T4, ~16 GB, ~4 CPUs
+          description: "Start a container on a dedicated node with a GPU"
+          slug: "gpu"
+          kubespawner_override:
+            environment:
+              NVIDIA_DRIVER_CAPABILITIES: compute,utility
+            mem_limit: null
+            mem_guarantee: 14G
+            node_selector:
+              node.kubernetes.io/instance-type: g4dn.xlarge
+            extra_resource_limits:
+              nvidia.com/gpu: "1"
+          profile_options: *profile_options
       defaultUrl: /lab
     scheduling:
       userScheduler:
diff --git a/config/clusters/victor/staging.values.yaml b/config/clusters/victor/staging.values.yaml
index c0e824090a..ec5dfae195 100644
--- a/config/clusters/victor/staging.values.yaml
+++ b/config/clusters/victor/staging.values.yaml
@@ -15,26 +15,12 @@ basehub:
           oauth_callback_url: https://staging.victor.2i2c.cloud/hub/oauth_callback
     singleuser:
       profileList:
-        # Create a small instance that can launch a custom image
-        - display_name: "Bring your own image - Small: m5.large"
-          description: "Specific your own image (must have python and jupyterhub installed in it) - ~2 CPU, ~8G RAM"
-          slug: custom
-          profile_options:
-            image:
-              display_name: Image
-              unlisted_choice:
-                enabled: true
-                display_name: "Custom image"
-                validation_regex: "^.+:.+$"
-                validation_message: "Must be a publicly available docker image, of form <image-name>:<tag>"
-                kubespawner_override:
-                  image: "{value}"
-                  mem_limit: 8G
-                  mem_guarantee: 6.5G
-                  node_selector:
-                    node.kubernetes.io/instance-type: m5.large
-              choices: {}
         #=== Below are copied from common file ===#
+        #
+        # However, they have been adjusted to include an unlisted_choice so
+        # that users can pick a custom image.
+        #
+
         # The mem-guarantees are here so k8s doesn't schedule other pods
         # on these nodes.
         - display_name: "Small: m5.large"
@@ -47,6 +33,30 @@ basehub:
             mem_guarantee: 6.5G
             node_selector:
               node.kubernetes.io/instance-type: m5.large
+          profile_options: &profile_options
+            image:
+              display_name: Image
+              choices:
+                a-victor-notebook:
+                  display_name: Victor Notebook
+                  default: true
+                  kubespawner_override:
+                    image: quay.io/volcanocyber/victor-notebook:a045ad3616d1
+                b-pytorch-notebook:
+                  display_name: Pangeo ML Notebook (Pytorch)
+                  kubespawner_override:
+                    image: "quay.io/pangeo/pytorch-notebook:2024.09.11"
+                c-ml-notebook:
+                  display_name: Pangeo ML Notebook (Tensorflow)
+                  kubespawner_override:
+                    image: "quay.io/pangeo/ml-notebook:2024.09.11"
+              unlisted_choice:
+                enabled: true
+                display_name: "Custom image"
+                validation_regex: "^.+:.+$"
+                validation_message: "Must be a publicly available docker image, of form <image-name>:<tag>"
+                kubespawner_override:
+                  image: "{value}"
         - display_name: "Medium: m5.xlarge"
           description: "~4 CPU, ~15G RAM"
           kubespawner_override:
@@ -54,6 +64,7 @@ basehub:
             mem_guarantee: 12G
             node_selector:
               node.kubernetes.io/instance-type: m5.xlarge
+          profile_options: *profile_options
         - display_name: "Large: m5.2xlarge"
           description: "~8 CPU, ~30G RAM"
           kubespawner_override:
@@ -61,6 +72,7 @@ basehub:
             mem_guarantee: 25G
             node_selector:
               node.kubernetes.io/instance-type: m5.2xlarge
+          profile_options: *profile_options
         - display_name: "Huge: m5.8xlarge"
           description: "~16 CPU, ~60G RAM"
           kubespawner_override:
@@ -68,3 +80,17 @@ basehub:
             mem_guarantee: 50G
             node_selector:
               node.kubernetes.io/instance-type: m5.8xlarge
+          profile_options: *profile_options
+        - display_name: NVIDIA Tesla T4, ~16 GB, ~4 CPUs
+          description: "Start a container on a dedicated node with a GPU"
+          slug: "gpu"
+          kubespawner_override:
+            environment:
+              NVIDIA_DRIVER_CAPABILITIES: compute,utility
+            mem_limit: null
+            mem_guarantee: 14G
+            node_selector:
+              node.kubernetes.io/instance-type: g4dn.xlarge
+            extra_resource_limits:
+              nvidia.com/gpu: "1"
+          profile_options: *profile_options
diff --git a/eksctl/victor.jsonnet b/eksctl/victor.jsonnet
index 55de52dc02..106bcb587e 100644
--- a/eksctl/victor.jsonnet
+++ b/eksctl/victor.jsonnet
@@ -32,6 +32,18 @@ local notebookNodes = [
     { instanceType: "r5.xlarge" },
     { instanceType: "r5.4xlarge" },
     { instanceType: "r5.16xlarge" },
+    {
+        instanceType: "g4dn.xlarge",
+        tags+: {
+            "k8s.io/cluster-autoscaler/node-template/resources/nvidia.com/gpu": "1"
+        },
+        taints+: {
+            "nvidia.com/gpu": "present:NoSchedule"
+        },
+        // Allow provisioning GPUs across all AZs, to prevent a situation where
+        // all GPUs in a single AZ are in use and no new nodes can be spawned.
+        availabilityZones: masterAzs,
+    },
 ];
 
 local daskNodes = [