Skip to content

Commit

Permalink
victor: image choices and gpu machine
Browse files Browse the repository at this point in the history
  • Loading branch information
consideRatio committed Sep 24, 2024
1 parent fc611c5 commit 1932fd1
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 19 deletions.
33 changes: 33 additions & 0 deletions config/clusters/victor/common.values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,27 +63,60 @@ basehub:
mem_guarantee: 6.5G
node_selector:
node.kubernetes.io/instance-type: m5.large
profile_options: &profile_options
image:
display_name: Image
choices:
a-victor-notebook:
display_name: Victor Notebook
default: true
kubespawner_override:
image: quay.io/volcanocyber/victor-notebook:a045ad3616d1
b-pytorch-notebook:
display_name: Pangeo ML Notebook (Pytorch)
kubespawner_override:
image: "quay.io/pangeo/pytorch-notebook:2024.09.11"
c-ml-notebook:
display_name: Pangeo ML Notebook (Tensorflow)
kubespawner_override:
image: "quay.io/pangeo/ml-notebook:2024.09.11"
- display_name: "Medium: m5.xlarge"
description: "~4 CPU, ~15G RAM"
kubespawner_override:
mem_limit: 15G
mem_guarantee: 12G
node_selector:
node.kubernetes.io/instance-type: m5.xlarge
profile_options: *profile_options
- display_name: "Large: m5.2xlarge"
description: "~8 CPU, ~30G RAM"
kubespawner_override:
mem_limit: 30G
mem_guarantee: 25G
node_selector:
node.kubernetes.io/instance-type: m5.2xlarge
profile_options: *profile_options
- display_name: "Huge: m5.8xlarge"
description: "~16 CPU, ~60G RAM"
kubespawner_override:
mem_limit: 60G
mem_guarantee: 50G
node_selector:
node.kubernetes.io/instance-type: m5.8xlarge
profile_options: *profile_options
- display_name: NVIDIA Tesla T4, ~16 GB, ~4 CPUs
description: "Start a container on a dedicated node with a GPU"
slug: "gpu"
kubespawner_override:
environment:
NVIDIA_DRIVER_CAPABILITIES: compute,utility
mem_limit: null
mem_guarantee: 14G
node_selector:
node.kubernetes.io/instance-type: g4dn.xlarge
extra_resource_limits:
nvidia.com/gpu: "1"
profile_options: *profile_options
defaultUrl: /lab
scheduling:
userScheduler:
Expand Down
64 changes: 45 additions & 19 deletions config/clusters/victor/staging.values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,26 +15,12 @@ basehub:
oauth_callback_url: https://staging.victor.2i2c.cloud/hub/oauth_callback
singleuser:
profileList:
# Create a small instance that can launch a custom image
- display_name: "Bring your own image - Small: m5.large"
description: "Specific your own image (must have python and jupyterhub installed in it) - ~2 CPU, ~8G RAM"
slug: custom
profile_options:
image:
display_name: Image
unlisted_choice:
enabled: true
display_name: "Custom image"
validation_regex: "^.+:.+$"
validation_message: "Must be a publicly available docker image, of form <image-name>:<tag>"
kubespawner_override:
image: "{value}"
mem_limit: 8G
mem_guarantee: 6.5G
node_selector:
node.kubernetes.io/instance-type: m5.large
choices: {}
#=== The profiles below are copied from common.values.yaml ===#
#
# However, they have been adjusted to include unlisted_choice so that users
# can pick a custom image.
#

# The mem-guarantees are here so k8s doesn't schedule other pods
# on these nodes.
- display_name: "Small: m5.large"
Expand All @@ -47,24 +33,64 @@ basehub:
mem_guarantee: 6.5G
node_selector:
node.kubernetes.io/instance-type: m5.large
profile_options: &profile_options
image:
display_name: Image
choices:
a-victor-notebook:
display_name: Victor Notebook
default: true
kubespawner_override:
image: quay.io/volcanocyber/victor-notebook:a045ad3616d1
b-pytorch-notebook:
display_name: Pangeo ML Notebook (Pytorch)
kubespawner_override:
image: "quay.io/pangeo/pytorch-notebook:2024.09.11"
c-ml-notebook:
display_name: Pangeo ML Notebook (Tensorflow)
kubespawner_override:
image: "quay.io/pangeo/ml-notebook:2024.09.11"
unlisted_choice:
enabled: true
display_name: "Custom image"
validation_regex: "^.+:.+$"
validation_message: "Must be a publicly available docker image, of form <image-name>:<tag>"
kubespawner_override:
image: "{value}"
- display_name: "Medium: m5.xlarge"
description: "~4 CPU, ~15G RAM"
kubespawner_override:
mem_limit: 15G
mem_guarantee: 12G
node_selector:
node.kubernetes.io/instance-type: m5.xlarge
profile_options: *profile_options
- display_name: "Large: m5.2xlarge"
description: "~8 CPU, ~30G RAM"
kubespawner_override:
mem_limit: 30G
mem_guarantee: 25G
node_selector:
node.kubernetes.io/instance-type: m5.2xlarge
profile_options: *profile_options
- display_name: "Huge: m5.8xlarge"
description: "~16 CPU, ~60G RAM"
kubespawner_override:
mem_limit: 60G
mem_guarantee: 50G
node_selector:
node.kubernetes.io/instance-type: m5.8xlarge
profile_options: *profile_options
- display_name: NVIDIA Tesla T4, ~16 GB, ~4 CPUs
description: "Start a container on a dedicated node with a GPU"
slug: "gpu"
kubespawner_override:
environment:
NVIDIA_DRIVER_CAPABILITIES: compute,utility
mem_limit: null
mem_guarantee: 14G
node_selector:
node.kubernetes.io/instance-type: g4dn.xlarge
extra_resource_limits:
nvidia.com/gpu: "1"
profile_options: *profile_options
12 changes: 12 additions & 0 deletions eksctl/victor.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,18 @@ local notebookNodes = [
{ instanceType: "r5.xlarge" },
{ instanceType: "r5.4xlarge" },
{ instanceType: "r5.16xlarge" },
{
instanceType: "g4dn.xlarge",
tags+: {
"k8s.io/cluster-autoscaler/node-template/resources/nvidia.com/gpu": "1"
},
taints+: {
"nvidia.com/gpu": "present:NoSchedule"
},
// Allow provisioning GPUs across all AZs, to prevent a situation where all
// GPUs in a single AZ are in use and no new nodes can be spawned.
availabilityZones: masterAzs,
},
];

local daskNodes = [
Expand Down

0 comments on commit 1932fd1

Please sign in to comment.