From 27504565c1462dc3101cdc02aa550a26177109d7 Mon Sep 17 00:00:00 2001 From: Samuel Larkin Date: Thu, 18 Jul 2024 10:19:43 -0400 Subject: [PATCH] [WIP]feat: Trixie - Be a Good Cluster-Citizen --- .github/workflows/doc.yaml | 43 ++ static/BeAGoodClusterCitizen.html | 981 ++++++++++++++++++++++++++++++ 2 files changed, 1024 insertions(+) create mode 100644 .github/workflows/doc.yaml create mode 100644 static/BeAGoodClusterCitizen.html diff --git a/.github/workflows/doc.yaml b/.github/workflows/doc.yaml new file mode 100644 index 0000000..4006b00 --- /dev/null +++ b/.github/workflows/doc.yaml @@ -0,0 +1,43 @@ +# Simple workflow for deploying static content to GitHub Pages +name: Deploy static content to Pages + +on: + # Runs on pushes targeting the default branch + push: + branches: [$default-branch, dev.sl/presentation] + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages +permissions: + contents: read + pages: write + id-token: write + +# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. +# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. +concurrency: + group: "pages" + cancel-in-progress: false + +jobs: + # Single deploy job since we're just deploying + deploy: + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Setup Pages + uses: actions/configure-pages@v5 + - name: Upload artifact + uses: actions/upload-pages-artifact@v3 + with: + # Upload entire repository + path: "." + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 diff --git a/static/BeAGoodClusterCitizen.html b/static/BeAGoodClusterCitizen.html new file mode 100644 index 0000000..67d3b73 --- /dev/null +++ b/static/BeAGoodClusterCitizen.html @@ -0,0 +1,981 @@ + + + Trixie + + + + + + + + +
Trixie

Be a Good Cluster-Citizen

Samuel Larkin
+ +
+

Trixie

+ +
+

+ trixie.res.nrc.gc.ca from the + black & + orange networks +

+ +

+ > ./onboarding.sh +

+ +

+ /gpfs/projects will remain as-is and should be + in-sync with the previous storage appliance, as such there should + be no apparent change +

+ +

+ /home/${USER} will now have a quota of 25GB + and will have snapshots dsabled. +

+ +

+ /gpfs/work/${USER} will be the primary + work-space for users and will have a quota of 500GB and 1M inodes + with snapshots enabled +

+ +

+ We request that users not create conda, mamba, venvs, etc + within the /gpfs/work space, and instead use their /home and create symlinks to + their projects. The snapshot feature of GPFS does not perform + optimally when many small files are created and destroyed as with + such utilities. We understand that some workflows on Trixie have + this behaviour inherently, but an effort to reduce this behaviour + is appreciated +

+
+ + +
+ +
+
+

Getting some Software

+ +

+ > module avail +

+ +
+

+ /usr/share/Modules/modulefiles +

+
+
    +
  • dot
  • +
  • module-git
  • +
  • module-info
  • +
  • modules
  • +
  • null
  • +
  • use.own
  • +
+
+ +

+ /usr/share/modulefiles +

+
+
    +
  • mpi/openmpi-x86_64
  • +
+
+ +

+ + /gpfs/share/Modules/modulefiles + +

+
+
    +
  • mathematica/14.0
  • +
+
+ +

+ + /gpfs/share/rhel9/opt/spack/share/spack/modules/linux-rhel9-skylake_avx512 + +

+
  • anaconda3/2023.09-0-gcc-11.3.1-4njg4u3
  • autoconf-archive/2023.02.20-gcc-11.3.1-x3v3e3k
  • autoconf/2.72-gcc-11.3.1-zigeprh
  • automake/1.16.5-gcc-11.3.1-fiei3qm
  • bdftopcf/1.1-gcc-11.3.1-pjei4lg
  • berkeley-db/18.1.40-gcc-11.3.1-imacxi5
  • binutils/2.42-gcc-11.3.1-k2azs5r
  • bison/3.8.2-gcc-11.3.1-5ymaspt
  • bzip2/1.0.8-gcc-11.3.1-jftbk52
  • ca-certificates-mozilla/2023-05-30-gcc-11.3.1-3q7ngqu
  • cmake/3.27.9-gcc-11.3.1-ddiy5al
  • cpio/2.15-gcc-11.3.1-gysadjq
  • curl/8.7.1-gcc-11.3.1-ct7pjm2
  • diffutils/3.10-gcc-11.3.1-i2xxusk
  • expat/2.6.2-gcc-11.3.1-i6fawb2
  • findutils/4.9.0-gcc-11.3.1-vrwwaoy
  • fixesproto/5.0-gcc-11.3.1-rzgygqz
  • flex/2.6.3-gcc-11.3.1-lvm5eg4
  • font-util/1.4.0-gcc-11.3.1-kdzp4kf
  • fontconfig/2.15.0-gcc-11.3.1-oknygy4
  • fontsproto/2.1.3-gcc-11.3.1-hnnw5wr
  • freeglut/3.2.2-gcc-11.3.1-bj2wnkh
  • freetype/2.13.2-gcc-11.3.1-mukuuf4
  • gawk/5.3.0-gcc-11.3.1-uuiqxd2
  • gcc-runtime/11.3.1-gcc-11.3.1-ts54e2r
  • gcc/13.2.0-gcc-11.3.1-5cmgvey
  • gdbm/1.23-gcc-11.3.1-hf42icl
  • gettext/0.22.5-gcc-11.3.1-acdclxk
  • glibc/2.34-gcc-11.3.1-rv4ofgg
  • glproto/1.4.17-gcc-11.3.1-xwh2ngh
  • glx/1.4-gcc-11.3.1-exlgy5j
  • gmake/4.4.1-gcc-11.3.1-jpbz7dw
  • gmp/6.2.1-gcc-11.3.1-j2qp7w7
  • gperf/3.1-gcc-11.3.1-umkvi7d
  • htop/3.2.2-gcc-11.3.1-oa7ttvf
  • hwloc/2.9.1-gcc-11.3.1-weybm2e
  • inputproto/2.3.2-gcc-11.3.1-bo7qinx
  • intel-mpi/2019.7.217-gcc-11.3.1-o2z3fjd
  • intel-oneapi-compilers-classic/2021.10.0-gcc-11.3.1-nocpr5m
  • intel-oneapi-compilers/2023.2.4-gcc-11.3.1-ddcm4wi
  • intel-oneapi-mpi/2021.12.1-gcc-11.3.1-4kgjxio
  • kbproto/1.0.7-gcc-11.3.1-6uzvk2c
  • libbsd/0.12.1-gcc-11.3.1-tca36ex
  • libedit/3.1-20230828-gcc-11.3.1-ysouu3l
  • libffi/3.4.6-gcc-11.3.1-clmfd4j
  • libfontenc/1.1.8-gcc-11.3.1-w44jsoi
  • libice/1.1.1-gcc-11.3.1-76rcd55
  • libiconv/1.17-gcc-11.3.1-sqkuf4t
  • libmd/1.0.4-gcc-11.3.1-2yooftr
  • libpciaccess/0.17-gcc-11.3.1-niczctf
  • libpng/1.2.57-gcc-11.3.1-oqex7om
  • libpthread-stubs/0.5-gcc-11.3.1-xfbnr2b
  • libsigsegv/2.14-gcc-11.3.1-5kireuc
  • libsm/1.2.4-gcc-11.3.1-sgdfk4e
  • libtool/2.4.7-gcc-11.3.1-gtphuga
  • libunwind/1.6.2-gcc-11.3.1-6mtzyno
  • libx11/1.8.7-gcc-11.3.1-d5xeazl
  • libxau/1.0.11-gcc-11.3.1-dkkx74b
  • libxcb/1.16-gcc-11.3.1-fulboi2
  • libxcrypt/4.4.35-gcc-11.3.1-7kd52bn
  • libxdmcp/1.1.4-gcc-11.3.1-7fe4723
  • libxext/1.3.5-gcc-11.3.1-7d2ci6d
  • libxfixes/5.0.3-gcc-11.3.1-k53yjzd
  • libxfont/1.5.4-gcc-11.3.1-etmwjjy
  • libxft/2.3.8-gcc-11.3.1-vwyuhmz
  • libxi/1.7.10-gcc-11.3.1-rgytoua
  • libxml2/2.10.3-gcc-11.3.1-e5zt4m2
  • libxrandr/1.5.4-gcc-11.3.1-qxmrns4
  • libxrender/0.9.11-gcc-11.3.1-ao2tic2
  • libxscrnsaver/1.2.4-gcc-11.3.1-zs6aqfi
  • libxt/1.3.0-gcc-11.3.1-lwme3qs
  • libxxf86vm/1.1.5-gcc-11.3.1-6pze4ei
  • llvm/17.0.6-gcc-11.3.1-xlwz53w
  • lua/5.3.6-gcc-11.3.1-hn2ac7j
  • lumerical/2019b-r2-gcc-11.3.1-ejm6mo6
  • lumerical/2021-R1.1-2599-gcc-11.3.1-vvyw6z6
  • m4/1.4.19-gcc-11.3.1-tncckyq
  • mesa-glu/9.0.1-gcc-11.3.1-glwxfkj
  • mesa-glu/9.0.2-gcc-11.3.1-bd2lm7d
  • mesa/23.3.6-gcc-11.3.1-mx6glm4
  • meson/1.3.2-gcc-11.3.1-w7cv5bi
  • mkfontdir/1.0.7-gcc-11.3.1-3xvst7e
  • mkfontscale/1.2.3-gcc-11.3.1-4vmnuo2
  • mpc/1.3.1-gcc-11.3.1-ifuk5gu
  • mpfr/4.2.1-gcc-11.3.1-i6gtxh6
  • ncurses/6.5-gcc-11.3.1-z54b6d4
  • nghttp2/1.57.0-gcc-11.3.1-7c6bk73
  • ninja/1.11.1-gcc-11.3.1-wcew5xr
  • openssl/3.3.0-gcc-11.3.1-y2icle5
  • parallel/20220522-gcc-11.3.1-juzq7oy
  • patchelf/0.17.2-gcc-11.3.1-t6bdsvg
  • pcre2/10.43-gcc-11.3.1-3y4lcyt
  • perl-data-dumper/2.173-gcc-11.3.1-xwk47wz
  • perl/5.38.0-gcc-11.3.1-4wtgagw
  • pigz/2.8-gcc-11.3.1-bswu4yx
  • pkgconf/2.2.0-gcc-11.3.1-reshpid
  • py-mako/1.2.4-gcc-11.3.1-qzty4ic
  • py-markupsafe/2.1.3-gcc-11.3.1-xwc652n
  • py-pip/23.1.2-gcc-11.3.1-bmelf66
  • py-setuptools/69.2.0-gcc-11.3.1-vjtezkl
  • py-wheel/0.41.2-gcc-11.3.1-k5uxoxo
  • python-venv/1.0-gcc-11.3.1-jtnjye4
  • python/3.10.13-gcc-11.3.1-73qnjae
  • python/3.11.7-gcc-11.3.1-hsfwjwr
  • python/3.12.1-gcc-11.3.1-7tuhjhr
  • python/3.9.18-gcc-11.3.1-tc7dyca
  • randrproto/1.5.0-gcc-11.3.1-b6ssnsk
  • re2c/2.2-gcc-11.3.1-vxkdcjq
  • readline/8.2-gcc-11.3.1-qu4mv62
  • renderproto/0.11.1-gcc-11.3.1-uhoybj4
  • scrnsaverproto/1.2.2-gcc-11.3.1-cw3khfj
  • sqlite/3.43.2-gcc-11.3.1-bodsqyx
  • swig/4.1.1-gcc-11.3.1-zaf4wit
  • tar/1.34-gcc-11.3.1-wcuczap
  • tcl/8.6.12-gcc-11.3.1-o3atwx3
  • texinfo/7.0.3-gcc-11.3.1-bhmkpv4
  • tk/8.6.11-gcc-11.3.1-wz3kgh7
  • unzip/6.0-gcc-11.3.1-kptkztv
  • util-linux-uuid/2.38.1-gcc-11.3.1-ekl7sit
  • util-macros/1.19.3-gcc-11.3.1-3ocj656
  • xcb-proto/1.16.0-gcc-11.3.1-scf7ljj
  • xextproto/7.3.0-gcc-11.3.1-6mytvtp
  • xf86vidmodeproto/2.3.1-gcc-11.3.1-gc624cq
  • xproto/7.0.31-gcc-11.3.1-hhddgfu
  • xrandr/1.5.2-gcc-11.3.1-nbhkevq
  • xtrans/1.5.0-gcc-11.3.1-67qr5rx
  • xz/5.4.6-gcc-11.3.1-pswkf4y
  • zlib-ng/2.1.6-gcc-11.3.1-snhalql
  • zlib/1.3.1-gcc-11.3.1-fk7flnb
  • zstd/1.5.6-gcc-11.3.1-mldt4gi
+ +
+

Key:

+

+ loaded + auto-loaded + modulepath +

+
+
+ +

+ > module load MODULE +

+
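For example, to pick up one of the Spack-provided Pythons listed above and confirm it is on your PATH (the exact hash suffix may differ over time):

> module load python/3.11.7-gcc-11.3.1-hsfwjwr
> which python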
+ + +
+ +
+
+

Partitions, JobTesting

+ +

+ > sinfo +

PARTITION    AVAIL  TIMELIMIT   NODES  STATE  NODELIST
TrixieMain*  up     12:00:00        4  drain  cn[131-134]
TrixieMain*  up     12:00:00        2  mix    cn[108-109]
TrixieMain*  up     12:00:00       22  idle   cn[107,110-130]
TrixieLong   up     2-00:00:00      1  drain  cn131
TrixieLong   up     2-00:00:00      2  mix    cn[108-109]
TrixieLong   up     2-00:00:00     22  idle   cn[107,110-130]
JobTesting   up     6:00:00         2  idle   cn[135-136]
+

+ > sbatch --partition=JobTesting ... +

+
+ + +
+ +
+

Account Name

+ +
+

+ See + Account-Codes + for a list of codes +

+

DT Digital Technologies / Technologies Numériques

+
    +
  • dt-dac
  • +
  • dt-dscs
  • +
  • dt-mtp
  • +
  • dt-ta
  • +
+ +

+ > sbatch --account=account_code ... +

+
+
+ +
+

Node's Resources

+ +
+ > sinfo --Node --responding --long + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NODELISTNODESPARTITIONSTATECPUSS:C:TMEMORYTMP_DISKWEIGHTAVAIL_FEREASON
cn1061DevTestidle642:16:219277701(null)none
cn1071TrixieLongidle642:16:219277701(null)none
cn1071TrixieMain*idle642:16:219277701(null)none
cn1081TrixieLongidle642:16:219277701(null)none
...
+
+ + +
+ +
+
+

What Do Nodes Have to Offer?

+ +

+ > scontrol show nodes +

+ +
+            
+NodeName=cn136 Arch=x86_64 CoresPerSocket=16
+  CPUAlloc=0 CPUTot=64 CPULoad=0.01
+  AvailableFeatures=(null)
+  ActiveFeatures=(null)
+  Gres=gpu:4
+  NodeAddr=cn136 NodeHostName=cn136
+  OS=Linux 3.10.0-1160.62.1.el7.x86_64 #1 SMP Tue Apr 5 16:57:59 UTC 2022
+  RealMemory=192777 AllocMem=0 FreeMem=183181 Sockets=2 Boards=1
+  State=IDLE ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
+  Partitions=JobTesting
+  BootTime=2024-05-29T14:23:15 SlurmdStartTime=2024-05-29T14:23:36
+  CfgTRES=cpu=64,mem=192777M,billing=64,gres/gpu=4
+  AllocTRES=
+  CapWatts=n/a
+  CurrentWatts=0 AveWatts=0
+  ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
+            
+          
+
+ + +
+ +
+
+

The GPUs

+
+

Tesla V100-SXM2-32GB

+

May 10, 2017

+

NVIDIA-SMI 550.54.15

+

Driver Version: 550.54.15

+

CUDA Version: 12.4

+

CUDA Compute Capabilities 7.0

+

Single Precision: 15 TFLOPS

+

Tensor Performance (Deep Learning): 120 TFLOPS

+
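A quick way to confirm a job actually sees these GPUs (assumes a PyTorch environment is active; the check itself is illustrative):

> python -c "import torch; print(torch.cuda.is_available(), torch.cuda.get_device_name(0))"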
+
+ + +
+ +
+

CPUs

+
+
    +
  • + processor_type = Intel Xeon Gold 6130 CPU clocked at 2.1GHZ 16 + cores / CPU +
  • +
  • processors_per_node = 2
  • +
  • cores_per_socket = 16
  • +
  • threads_per_core = 2 (hyper-threading on)
  • +
  • RAM = 192 GB memory
  • +
+
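With 2 sockets × 16 cores × 2 threads, each node exposes 64 logical CPUs. A sketch of how a request maps onto that (figures borrowed from the header example later in this deck):

#SBATCH --ntasks-per-node=4  # 4 tasks...
#SBATCH --cpus-per-task=6    # ...× 6 CPUs each = 24 of the 64 logical CPUs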
+
+ +
+
+

Slurm Header Example

+ + + + + + +
+

+#!/bin/bash
+# vim:nowrap:
+
+#SBATCH --job-name=MSLC24.training
+#SBATCH --comment="Metric Score Landscape Challenge 24 NMT training"
+
+# On Trixie
+#SBATCH --partition=TrixieMain
+#SBATCH --account=dt-mtp
+
+#SBATCH --gres=gpu:4
+#SBATCH --time=12:00:00
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=4
+#SBATCH --cpus-per-task=6
+#SBATCH --mem=96G
+# To reserve a whole node for yourself
+##SBATCH --exclusive
+#SBATCH --open-mode=append
+#SBATCH --requeue
+#SBATCH --signal=B:USR1@30
+#SBATCH --output=%x-%j.out
+
+# Fix SLURM environment variables.
+SLURM_JOB_CPUS_PER_NODE=${SLURM_JOB_CPUS_PER_NODE%%(*)} # '24(x2)' => 24
+SLURM_TASKS_PER_NODE=${SLURM_TASKS_PER_NODE%%(*)} # '4(x2)' => '4'
+
+# NOTE: We set OMP_NUM_THREADS or else we get the following Warning:
+# WARNING:torch.distributed.run:
+# *****************************************
+# Setting OMP_NUM_THREADS environment variable for each process to be 1 in
+# default, to avoid your system being overloaded, please further tune the
+# variable for optimal performance in your application as needed.
+# *****************************************
+export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK:-$(nproc)}
+                  
+
+

+# Requeueing on Trixie
+# [source](https://www.sherlock.stanford.edu/docs/user-guide/running-jobs/)
+#
+[source](https://hpc-uit.readthedocs.io/en/latest/jobs/examples.html#how-to-recover-files-before-a-job-times-out)
+function _requeue {
+  echo "BASH - trapping signal 10 - requeueing $SLURM_JOBID"
+  date
+  scontrol requeue $SLURM_JOBID
+}
+
+if [[ -n "$SLURM_JOBID" ]]; then
+  SACC_FORMAT="JobID,Submit,Start,End,Elapsed,ExitCode,State,CPUTime,MaxRSS,"
+  SACC_FORMAT+="MaxVMSize,MaxDiskRead,MaxDiskWrite,AllocCPUs,AllocGRES"
+  SACC_FORMAT+=",AllocTRES%-50,NodeList,JobName%-30,Comment%-80"
+  trap "sacct --jobs $SLURM_JOBID --format=$SACC_FORMAT" 0
+  trap _requeue USR1
+fi
+                  
+
+ + > sbatch my_wonderful.sh +
+ + +
+ +
+
+

Do I Really Need a Script?

+
+            
+> sbatch \
+    --job-name=MSLC24.training \
+    --comment="Metric Score Landscape Challenge 24 NMT training" \
+    --partition=TrixieMain \
+    --account=dt-mtp \
+    --gres=gpu:4 \
+    --time=12:00:00 \
+    --nodes=1 \
+    --ntasks-per-node=4 \
+    --cpus-per-task=6 \
+    --mem=96G \
+    --open-mode=append \
+    --requeue \
+    --signal=B:USR1@30 \
+    --output=%x-%j.out \
+    my_wonderful.sh args ...
+            
+          
+ > sbatch --job-name=MSLC24.training my_wonderful.sh +
+
+ +
+

Limit RAM to Allow no GPU Jobs to Also Run

+ +
+ #SBATCH --mem=96G + +

+ Otherwise the scheduler assumes that you want all of the memory + preventing other jobs to run. +

+
+ + +
+ +
+
+

Make your Job Resumable

+ +
+            
+#SBATCH --requeue
+#SBATCH --signal=B:USR1@30
+
+# Requeueing on Trixie
+# [source](https://www.sherlock.stanford.edu/docs/user-guide/running-jobs/)
+#
+[source](https://hpc-uit.readthedocs.io/en/latest/jobs/examples.html#how-to-recover-files-before-a-job-times-out)
+function _requeue {
+  echo "BASH - trapping signal 10 - requeueing $SLURM_JOBID"
+  date
+  scontrol requeue $SLURM_JOBID
+}
+
+if [[ -n "$SLURM_JOBID" ]]; then
+  SACCT_FORMAT="JobID,Submit,Start,End,Elapsed,ExitCode,State,CPUTime,MaxRSS,MaxVMSize"
+  SACCT_FORMAT+=",MaxDiskRead,MaxDiskWrite,AllocCPUs,AllocGRES,AllocTRES%-50,NodeList"
+  SACCT_FORMAT+=",JobName%-30,Comment%-80"
+  trap "sacct --jobs $SLURM_JOBID --format=$ACCT_FORMAT" 0
+  trap _requeue USR1
+fi
+            
+          
How it works: --signal=B:USR1@30 has Slurm send USR1 to the batch script 30 seconds before the time limit; the trap catches it and requeues the job, which --requeue permits.
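One caveat worth knowing: with --signal=B:USR1@30 the signal is delivered to the batch shell, so for the trap to run while work is in flight, launch the payload in the background and wait on it (the training command here is illustrative):

python train.py &
wait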
+ + +
+ +
+
+

Submit my Job

+ +

queue your job

+

+ > sbatch my_wonderful.sh +

+ +

check that your job is running

+

+ > squeue +

+ + + + + + + + + + + + + + + + + + + + + + + + + +
JOBIDNAMEUSERSTTIMENODESNODELIST(REASON)SUBMIT_TIMECOMMENT
733MSLC24larkinsR7:43:441trixie-cn1012024-07-17T02:26:0MSLC24Training
+
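To list only your own jobs (a standard squeue flag):

> squeue --user=$USER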
+
+ +
+
+

Check nvidia-smi -l for Good GPU Usage

+ > ssh -t cn101 nvidia-smi -l +
+            
++-----------------------------------------------------------------------------+
+| NVIDIA-SMI 510.47.03 Driver Version: 510.47.03 CUDA Version: 11.6           |
+|-------------------------------+----------------------+----------------------+
+| GPU Name Persistence-M        | Bus-Id Disp.A        | Volatile Uncorr. ECC |
+| Fan Temp Perf Pwr:Usage/Cap   | Memory-Usage         | GPU-Util Compute M.  |
+|                               |                      | MIG M.               |
+|===============================+======================+======================|
+| 0 Tesla V100-SXM2... Off      | 00000000:89:00.0 Off | 0                    |
+| N/A 27C P0 53W / 300W         | 0MiB / 32768MiB      | 0% Default           |
+|                               |                      | N/A                  |
++-------------------------------+----------------------+----------------------+
+| 1 Tesla V100-SXM2... Off      | 00000000:8A:00.0 Off | 0                    |
+| N/A 31C P0 54W / 300W         | 0MiB / 32768MiB      | 0% Default           |
+|                               |                      | N/A                  |
++-------------------------------+----------------------+----------------------+
+| 2 Tesla V100-SXM2... Off      | 00000000:B2:00.0 Off | 0                    |
+| N/A 31C P0 55W / 300W         | 0MiB / 32768MiB      | 0% Default           |
+|                               |                      | N/A                  |
++-------------------------------+----------------------+----------------------+
+| 3 Tesla V100-SXM2... Off      | 00000000:B3:00.0 Off | 0                    |
+| N/A 32C P0 54W / 300W         | 0MiB / 32768MiB      | 2% Default           |
+|                               |                      | N/A                  |
++-------------------------------+----------------------+----------------------+
+
++-----------------------------------------------------------------------------+
+| Processes:                                                                  |
+| GPU GI CI PID Type Process name GPU Memory                                  |
+| ID ID Usage                                                                 |
+|=============================================================================|
+| No running processes found                                                  |
++-----------------------------------------------------------------------------+
+            
+          
Here all four GPUs sit near 0% utilization with no running processes; if your job looks like this, the allocation is being wasted.
+
+ +
+

Asking For Too Much

+ +
+

+ > sbatch --mem=400G my_wonderful.sh +

+ + sbatch: error: Batch job submission failed: Memory required by + task is not available + +
+
+ +
+

Jupyter Notebook

+ +

+ Please refer to + Jobs Conda JupyterLab + as it is a bit more involved +

+ +

+ WARNING: Don't let your worker node run if you are not using it +

+ + +
+ +
+

Conclusion

+ +
    +
  • + This good-citizen principle doesn't apply only to Trixie but to + all clusters +
  • +
  • We want to maximize the cluster usage
  • +
  • We want to maximize everyone enjoyment of the cluster
  • +
  • https://github.com/ai4d-iasc/trixie
  • +
  • https://github.com/ai4d-iasc/trixie/wiki
  • +
  • https://github.com/ai4d-iasc/trixie/issues
  • +
+ + +
+
+
+ + + + + +