From 1932fd1cfa11af8ee91953e9815389d1096e1ddc Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Tue, 24 Sep 2024 14:56:44 +0200 Subject: [PATCH] victor: image choices and gpu machine --- config/clusters/victor/common.values.yaml | 33 +++++++++++ config/clusters/victor/staging.values.yaml | 64 +++++++++++++++------- eksctl/victor.jsonnet | 12 ++++ 3 files changed, 90 insertions(+), 19 deletions(-) diff --git a/config/clusters/victor/common.values.yaml b/config/clusters/victor/common.values.yaml index f88b79c5d4..eb2bcb9615 100644 --- a/config/clusters/victor/common.values.yaml +++ b/config/clusters/victor/common.values.yaml @@ -63,6 +63,23 @@ basehub: mem_guarantee: 6.5G node_selector: node.kubernetes.io/instance-type: m5.large + profile_options: &profile_options + image: + display_name: Image + choices: + a-victor-notebook: + display_name: Victor Notebook + default: true + kubespawner_override: + image: quay.io/volcanocyber/victor-notebook:a045ad3616d1 + b-pytorch-notebook: + display_name: Pangeo ML Notebook (Pytorch) + kubespawner_override: + image: "quay.io/pangeo/pytorch-notebook:2024.09.11" + c-ml-notebook: + display_name: Pangeo ML Notebook (Tensorflow) + kubespawner_override: + image: "quay.io/pangeo/ml-notebook:2024.09.11" - display_name: "Medium: m5.xlarge" description: "~4 CPU, ~15G RAM" kubespawner_override: @@ -70,6 +87,7 @@ basehub: mem_guarantee: 12G node_selector: node.kubernetes.io/instance-type: m5.xlarge + profile_options: *profile_options - display_name: "Large: m5.2xlarge" description: "~8 CPU, ~30G RAM" kubespawner_override: @@ -77,6 +95,7 @@ basehub: mem_guarantee: 25G node_selector: node.kubernetes.io/instance-type: m5.2xlarge + profile_options: *profile_options - display_name: "Huge: m5.8xlarge" description: "~16 CPU, ~60G RAM" kubespawner_override: @@ -84,6 +103,20 @@ basehub: mem_guarantee: 50G node_selector: node.kubernetes.io/instance-type: m5.8xlarge + profile_options: *profile_options + - display_name: NVIDIA Tesla T4, ~16 GB, ~4 CPUs + 
description: "Start a container on a dedicated node with a GPU" + slug: "gpu" + kubespawner_override: + environment: + NVIDIA_DRIVER_CAPABILITIES: compute,utility + mem_limit: null + mem_guarantee: 14G + node_selector: + node.kubernetes.io/instance-type: g4dn.xlarge + extra_resource_limits: + nvidia.com/gpu: "1" + profile_options: *profile_options defaultUrl: /lab scheduling: userScheduler: diff --git a/config/clusters/victor/staging.values.yaml b/config/clusters/victor/staging.values.yaml index c0e824090a..ec5dfae195 100644 --- a/config/clusters/victor/staging.values.yaml +++ b/config/clusters/victor/staging.values.yaml @@ -15,26 +15,12 @@ basehub: oauth_callback_url: https://staging.victor.2i2c.cloud/hub/oauth_callback singleuser: profileList: - # Create a small instance that can launch a custom image - - display_name: "Bring your own image - Small: m5.large" - description: "Specific your own image (must have python and jupyterhub installed in it) - ~2 CPU, ~8G RAM" - slug: custom - profile_options: - image: - display_name: Image - unlisted_choice: - enabled: true - display_name: "Custom image" - validation_regex: "^.+:.+$" - validation_message: "Must be a publicly available docker image, of form <image-name>:<tag>" - kubespawner_override: - image: "{value}" - mem_limit: 8G - mem_guarantee: 6.5G - node_selector: - node.kubernetes.io/instance-type: m5.large - choices: {} #=== Below are copied from common file ===# + # + # But, they have been adjusted to include unlisted_choice to pick a + # custom image. + # + # The mem-guarantees are here so k8s doesn't schedule other pods # on these nodes. 
- display_name: "Small: m5.large" @@ -47,6 +33,30 @@ basehub: mem_guarantee: 6.5G node_selector: node.kubernetes.io/instance-type: m5.large + profile_options: &profile_options + image: + display_name: Image + choices: + a-victor-notebook: + display_name: Victor Notebook + default: true + kubespawner_override: + image: quay.io/volcanocyber/victor-notebook:a045ad3616d1 + b-pytorch-notebook: + display_name: Pangeo ML Notebook (Pytorch) + kubespawner_override: + image: "quay.io/pangeo/pytorch-notebook:2024.09.11" + c-ml-notebook: + display_name: Pangeo ML Notebook (Tensorflow) + kubespawner_override: + image: "quay.io/pangeo/ml-notebook:2024.09.11" + unlisted_choice: + enabled: true + display_name: "Custom image" + validation_regex: "^.+:.+$" + validation_message: "Must be a publicly available docker image, of form <image-name>:<tag>" + kubespawner_override: + image: "{value}" - display_name: "Medium: m5.xlarge" description: "~4 CPU, ~15G RAM" kubespawner_override: @@ -54,6 +64,7 @@ basehub: mem_guarantee: 12G node_selector: node.kubernetes.io/instance-type: m5.xlarge + profile_options: *profile_options - display_name: "Large: m5.2xlarge" description: "~8 CPU, ~30G RAM" kubespawner_override: @@ -61,6 +72,7 @@ basehub: mem_guarantee: 25G node_selector: node.kubernetes.io/instance-type: m5.2xlarge + profile_options: *profile_options - display_name: "Huge: m5.8xlarge" description: "~16 CPU, ~60G RAM" kubespawner_override: @@ -68,3 +80,17 @@ basehub: mem_guarantee: 50G node_selector: node.kubernetes.io/instance-type: m5.8xlarge + profile_options: *profile_options + - display_name: NVIDIA Tesla T4, ~16 GB, ~4 CPUs + description: "Start a container on a dedicated node with a GPU" + slug: "gpu" + kubespawner_override: + environment: + NVIDIA_DRIVER_CAPABILITIES: compute,utility + mem_limit: null + mem_guarantee: 14G + node_selector: + node.kubernetes.io/instance-type: g4dn.xlarge + extra_resource_limits: + nvidia.com/gpu: "1" + profile_options: *profile_options diff --git 
a/eksctl/victor.jsonnet b/eksctl/victor.jsonnet index 55de52dc02..106bcb587e 100644 --- a/eksctl/victor.jsonnet +++ b/eksctl/victor.jsonnet @@ -32,6 +32,18 @@ local notebookNodes = [ { instanceType: "r5.xlarge" }, { instanceType: "r5.4xlarge" }, { instanceType: "r5.16xlarge" }, + { + instanceType: "g4dn.xlarge", + tags+: { + "k8s.io/cluster-autoscaler/node-template/resources/nvidia.com/gpu": "1" + }, + taints+: { + "nvidia.com/gpu": "present:NoSchedule" + }, + // Allow provisioning GPUs across all AZs, to prevent situation where all + // GPUs in a single AZ are in use and no new nodes can be spawned + availabilityZones: masterAzs, + }, ]; local daskNodes = [