diff --git a/.github/workflows/doc.yaml b/.github/workflows/doc.yaml
new file mode 100644
index 0000000..4006b00
--- /dev/null
+++ b/.github/workflows/doc.yaml
@@ -0,0 +1,43 @@
+# Simple workflow for deploying static content to GitHub Pages
+name: Deploy static content to Pages
+
+on:
+  # Runs on pushes targeting the default branch
+  push:
+    branches: [$default-branch, dev.sl/presentation]
+
+  # Allows you to run this workflow manually from the Actions tab
+  workflow_dispatch:
+
+# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
+permissions:
+  contents: read
+  pages: write
+  id-token: write
+
+# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued.
+# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete.
+concurrency:
+  group: "pages"
+  cancel-in-progress: false
+
+jobs:
+  # Single deploy job since we're just deploying
+  deploy:
+    environment:
+      name: github-pages
+      url: ${{ steps.deployment.outputs.page_url }}
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Setup Pages
+        uses: actions/configure-pages@v5
+      - name: Upload artifact
+        uses: actions/upload-pages-artifact@v3
+        with:
+          # Upload entire repository
+          path: "."
+      - name: Deploy to GitHub Pages
+        id: deployment
+        uses: actions/deploy-pages@v4
diff --git a/static/BeAGoodClusterCitizen.html b/static/BeAGoodClusterCitizen.html
new file mode 100644
index 0000000..67d3b73
--- /dev/null
+++ b/static/BeAGoodClusterCitizen.html
@@ -0,0 +1,981 @@
trixie.res.nrc.gc.ca from the black & orange networks

> ./onboarding.sh
/gpfs/projects will remain as-is and should be in sync with the previous storage appliance, so there should be no apparent change.

/home/${USER} will now have a quota of 25GB and will have snapshots disabled.

/gpfs/work/${USER} will be the primary workspace for users and will have a quota of 500GB and 1M inodes, with snapshots enabled.
We request that users not create conda, mamba, venv, etc. environments within the /gpfs/work space, and instead create them under their /home and symlink them from their projects. The snapshot feature of GPFS does not perform well when many small files are created and destroyed, as such utilities do. We understand that some workflows on Trixie have this behaviour inherently, but an effort to reduce it is appreciated. A sketch of the symlink approach follows.
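A minimal sketch of that pattern, assuming a hypothetical project named myproject:

# Create the virtual environment under /home, where snapshots are disabled
# and small-file churn is not an issue.
python3 -m venv ~/venvs/myproject

# Symlink it into the project tree on /gpfs/work.
ln -s ~/venvs/myproject /gpfs/work/${USER}/myproject/venv

# Activate it through the symlink as usual.
source /gpfs/work/${USER}/myproject/venv/bin/activate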
> module avail

Key:
  loaded   auto-loaded   modulepath

> module load MODULE
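For example (the module name here is illustrative; pick one actually listed by module avail):

# Load a module and confirm it took effect.
> module load cuda
> module list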
> sinfo
| PARTITION   | AVAIL | TIMELIMIT  | NODES | STATE | NODELIST        |
|-------------|-------|------------|-------|-------|-----------------|
| TrixieMain* | up    | 12:00:00   | 4     | drain | cn[131-134]     |
| TrixieMain* | up    | 12:00:00   | 2     | mix   | cn[108-109]     |
| TrixieMain* | up    | 12:00:00   | 22    | idle  | cn[107,110-130] |
| TrixieLong  | up    | 2-00:00:00 | 1     | drain | cn131           |
| TrixieLong  | up    | 2-00:00:00 | 2     | mix   | cn[108-109]     |
| TrixieLong  | up    | 2-00:00:00 | 22    | idle  | cn[107,110-130] |
| JobTesting  | up    | 6:00:00    | 2     | idle  | cn[135-136]     |
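A couple of useful filters when looking for room to run (standard sinfo flags):

# Only idle nodes in the main partition.
> sinfo --partition=TrixieMain --states=idle

# One summary line per partition.
> sinfo --summarize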
> sbatch --partition=JobTesting ...

See Account-Codes for a list of codes:
+DT Digital Technologies / Technologies Numériques
> sbatch --account=account_code ...
> sinfo --Node --responding --long
| NODELIST | NODES | PARTITION   | STATE | CPUS | S:C:T  | MEMORY | TMP_DISK | WEIGHT | AVAIL_FE | REASON |
|----------|-------|-------------|-------|------|--------|--------|----------|--------|----------|--------|
| cn106    | 1     | DevTest     | idle  | 64   | 2:16:2 | 192777 | 0        | 1      | (null)   | none   |
| cn107    | 1     | TrixieLong  | idle  | 64   | 2:16:2 | 192777 | 0        | 1      | (null)   | none   |
| cn107    | 1     | TrixieMain* | idle  | 64   | 2:16:2 | 192777 | 0        | 1      | (null)   | none   |
| cn108    | 1     | TrixieLong  | idle  | 64   | 2:16:2 | 192777 | 0        | 1      | (null)   | none   |
| ...      |       |             |       |      |        |        |          |        |          |        |
> scontrol show nodes
NodeName=cn136 Arch=x86_64 CoresPerSocket=16
   CPUAlloc=0 CPUTot=64 CPULoad=0.01
   AvailableFeatures=(null)
   ActiveFeatures=(null)
   Gres=gpu:4
   NodeAddr=cn136 NodeHostName=cn136
   OS=Linux 3.10.0-1160.62.1.el7.x86_64 #1 SMP Tue Apr 5 16:57:59 UTC 2022
   RealMemory=192777 AllocMem=0 FreeMem=183181 Sockets=2 Boards=1
   State=IDLE ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
   Partitions=JobTesting
   BootTime=2024-05-29T14:23:15 SlurmdStartTime=2024-05-29T14:23:36
   CfgTRES=cpu=64,mem=192777M,billing=64,gres/gpu=4
   AllocTRES=
   CapWatts=n/a
   CurrentWatts=0 AveWatts=0
   ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
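The same subcommand takes a node name, which is easier to read than the full dump:

# Inspect a single node, e.g. the JobTesting node shown above.
> scontrol show node cn136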
Tesla V100-SXM2-32GB
Released: May 10, 2017
NVIDIA-SMI 550.54.15
Driver Version: 550.54.15
CUDA Version: 12.4
CUDA Compute Capability: 7.0
Single Precision: 15 TFLOPS
Tensor Performance (Deep Learning): 120 TFLOPS
An example job script (my_wonderful.sh):

#!/bin/bash
# vim:nowrap:

#SBATCH --job-name=MSLC24.training
#SBATCH --comment="Metric Score Landscape Challenge 24 NMT training"

# On Trixie
#SBATCH --partition=TrixieMain
#SBATCH --account=dt-mtp

#SBATCH --gres=gpu:4
#SBATCH --time=12:00:00
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=6
#SBATCH --mem=96G
# To reserve a whole node for yourself
##SBATCH --exclusive
#SBATCH --open-mode=append
#SBATCH --requeue
#SBATCH --signal=B:USR1@30
#SBATCH --output=%x-%j.out

# Fix SLURM environment variables.
SLURM_JOB_CPUS_PER_NODE=${SLURM_JOB_CPUS_PER_NODE%%(*)}   # '24(x2)' => '24'
SLURM_TASKS_PER_NODE=${SLURM_TASKS_PER_NODE%%(*)}   # '4(x2)' => '4'

# NOTE: We set OMP_NUM_THREADS or else we get the following warning:
# WARNING:torch.distributed.run:
# *****************************************
# Setting OMP_NUM_THREADS environment variable for each process to be 1 in
# default, to avoid your system being overloaded, please further tune the
# variable for optimal performance in your application as needed.
# *****************************************
export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK:-$(nproc)}
# Requeueing on Trixie
# [source](https://www.sherlock.stanford.edu/docs/user-guide/running-jobs/)
# [source](https://hpc-uit.readthedocs.io/en/latest/jobs/examples.html#how-to-recover-files-before-a-job-times-out)
function _requeue {
   echo "BASH - trapping signal 10 (USR1) - requeueing $SLURM_JOBID"
   date
   scontrol requeue $SLURM_JOBID
}

if [[ -n "$SLURM_JOBID" ]]; then
   SACCT_FORMAT="JobID,Submit,Start,End,Elapsed,ExitCode,State,CPUTime,MaxRSS,"
   SACCT_FORMAT+="MaxVMSize,MaxDiskRead,MaxDiskWrite,AllocCPUs,AllocGRES"
   SACCT_FORMAT+=",AllocTRES%-50,NodeList,JobName%-30,Comment%-80"
   # On exit (trap 0), print this job's accounting summary.
   trap "sacct --jobs $SLURM_JOBID --format=$SACCT_FORMAT" 0
   # On USR1 (sent 30 seconds before timeout, per --signal=B:USR1@30), requeue.
   trap _requeue USR1
fi
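One subtlety: bash only runs a trap handler once it regains control, so if the training command runs in the foreground the USR1 handler may not fire until the command finishes. A common pattern (a sketch; train.py and its arguments are hypothetical) is to launch the payload in the background and wait:

# Run the real work in the background so the shell can handle USR1 promptly.
srun python train.py "$@" &

# wait returns when the child exits or when a trapped signal (USR1) arrives,
# at which point _requeue runs and the job is requeued.
wait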
> sbatch my_wonderful.sh

All of these options can instead be passed on the command line:
> sbatch \
+ --job-name=MSLC24.training \
+ --comment="Metric Score Landscape Challenge 24 NMT training" \
+ --partition=TrixieMain \
+ --account=dt-mtp \
+ --gres=gpu:4 \
+ --time=12:00:00 \
+ --nodes=1 \
+ --ntasks-per-node=4 \
+ --cpus-per-task=6 \
+ --mem=96G \
+ --open-mode=append \
+ --requeue \
+ --signal=B:USR1@30 \
+ --output=%x-%j.out \
+ my_wonderful.sh args ...
Command-line options combine with, and override, those in the script:

> sbatch --job-name=MSLC24.training my_wonderful.sh

Always specify how much memory your job needs. Otherwise the scheduler assumes that you want all of the memory, preventing other jobs from running on the node.
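To right-size future requests, compare what a finished job asked for with what it actually used; the job ID here is a placeholder:

# ReqMem = memory requested; MaxRSS = peak resident memory actually used.
> sacct --jobs 733 --format=JobID,JobName,ReqMem,MaxRSS,Elapsed,State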
Automatically Resuming / Requeueing

#SBATCH --requeue
#SBATCH --signal=B:USR1@30

# Requeueing on Trixie
# [source](https://www.sherlock.stanford.edu/docs/user-guide/running-jobs/)
# [source](https://hpc-uit.readthedocs.io/en/latest/jobs/examples.html#how-to-recover-files-before-a-job-times-out)
function _requeue {
   echo "BASH - trapping signal 10 (USR1) - requeueing $SLURM_JOBID"
   date
   scontrol requeue $SLURM_JOBID
}

if [[ -n "$SLURM_JOBID" ]]; then
   SACCT_FORMAT="JobID,Submit,Start,End,Elapsed,ExitCode,State,CPUTime,MaxRSS,MaxVMSize"
   SACCT_FORMAT+=",MaxDiskRead,MaxDiskWrite,AllocCPUs,AllocGRES,AllocTRES%-50,NodeList"
   SACCT_FORMAT+=",JobName%-30,Comment%-80"
   trap "sacct --jobs $SLURM_JOBID --format=$SACCT_FORMAT" 0
   trap _requeue USR1
fi
> sbatch my_wonderful.sh

> squeue
| JOBID | NAME   | USER    | ST | TIME    | NODES | NODELIST(REASON) | SUBMIT_TIME        | COMMENT         |
|-------|--------|---------|----|---------|-------|------------------|--------------------|-----------------|
| 733   | MSLC24 | larkins | R  | 7:43:44 | 1     | trixie-cn101     | 2024-07-17T02:26:0 | MSLC24 Training |
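SUBMIT_TIME and COMMENT are not in squeue's default columns; an output format along these lines (all standard squeue field names) reproduces the view above:

# Add submit time and comment to the job listing.
> squeue -O JobID,Name,UserName,State,TimeUsed,NumNodes,NodeList,SubmitTime,Comment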
nvidia-smi -l for Good GPU Usage

> ssh -t cn101 nvidia-smi -l
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  Tesla V100-SXM2...  Off  | 00000000:89:00.0 Off |                    0 |
| N/A   27C    P0    53W / 300W |      0MiB / 32768MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  Off  | 00000000:8A:00.0 Off |                    0 |
| N/A   31C    P0    54W / 300W |      0MiB / 32768MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   2  Tesla V100-SXM2...  Off  | 00000000:B2:00.0 Off |                    0 |
| N/A   31C    P0    55W / 300W |      0MiB / 32768MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   3  Tesla V100-SXM2...  Off  | 00000000:B3:00.0 Off |                    0 |
| N/A   32C    P0    54W / 300W |      0MiB / 32768MiB |      2%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+
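If the full nvidia-smi panel is more than you need, the dmon subcommand streams one utilization line per GPU per interval:

# -s u selects utilization metrics; refreshes every second by default.
> ssh -t cn101 nvidia-smi dmon -s u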
> sbatch --mem=400G my_wonderful.sh

Please refer to Jobs Conda JupyterLab, as it is a bit more involved.

WARNING: Don't leave your worker node running if you are not using it.