From 27504565c1462dc3101cdc02aa550a26177109d7 Mon Sep 17 00:00:00 2001 From: Samuel Larkin Date: Thu, 18 Jul 2024 10:19:43 -0400 Subject: [PATCH] [WIP]feat: Trixie - Be a Good Cluster-Citizen --- .github/workflows/doc.yaml | 43 ++ static/BeAGoodClusterCitizen.html | 981 ++++++++++++++++++++++++++++++ 2 files changed, 1024 insertions(+) create mode 100644 .github/workflows/doc.yaml create mode 100644 static/BeAGoodClusterCitizen.html diff --git a/.github/workflows/doc.yaml b/.github/workflows/doc.yaml new file mode 100644 index 0000000..4006b00 --- /dev/null +++ b/.github/workflows/doc.yaml @@ -0,0 +1,43 @@ +# Simple workflow for deploying static content to GitHub Pages +name: Deploy static content to Pages + +on: + # Runs on pushes targeting the default branch + push: + branches: [$default-branch, dev.sl/presentation] + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages +permissions: + contents: read + pages: write + id-token: write + +# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. +# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. +concurrency: + group: "pages" + cancel-in-progress: false + +jobs: + # Single deploy job since we're just deploying + deploy: + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Setup Pages + uses: actions/configure-pages@v5 + - name: Upload artifact + uses: actions/upload-pages-artifact@v3 + with: + # Upload entire repository + path: "." + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 diff --git a/static/BeAGoodClusterCitizen.html b/static/BeAGoodClusterCitizen.html new file mode 100644 index 0000000..67d3b73 --- /dev/null +++ b/static/BeAGoodClusterCitizen.html @@ -0,0 +1,981 @@ + + + Trixie + + + + + + + + +
Trixie

Be a Good Cluster-Citizen

Samuel Larkin
+ +
+

Trixie

+ +
+

+ trixie.res.nrc.gc.ca from the + black & + orange networks +

+ +

+ > ./onboarding.sh +

+ +

+ /gpfs/projects will remain as-is and should be + in-sync with the previous storage appliance, as such there should + be no apparent change +

+ +

+ /home/${USER} will now have a quota of 25GB + and will have snapshots dsabled. +

+ +

+ /gpfs/work/${USER} will be the primary + work-space for users and will have a quota of 500GB and 1M inodes + with snapshots enabled +

+ +

+ We request that users not create conda, mamba, venvs, etc + within the /gpfs/work space, and instead use their /home and create symlinks to + their projects. The snapshot feature of GPFS does not perform + optimally when many small files are created and destroyed as with + such utilities. We understand that some workflows on Trixie have + this behaviour inherently, but an effort to reduce this behaviour + is appreciated +

+
+ + +
+ +
+
+

Getting some Software

+ +

+ > module avail +

+ +
+

+ /usr/share/Modules/modulefiles +

+
+
    +
  • dot
  • +
  • module-git
  • +
  • module-info
  • +
  • modules
  • +
  • null
  • +
  • use.own
  • +
+
+ +

+ /usr/share/modulefiles +

+
+
    +
  • mpi/openmpi-x86_64
  • +
+
+ +

+ + /gpfs/share/Modules/modulefiles + +

+
+
    +
  • mathematica/14.0
  • +
+
+ +

+ + /gpfs/share/rhel9/opt/spack/share/spack/modules/linux-rhel9-skylake_avx512 + +

+
  • anaconda3/2023.09-0-gcc-11.3.1-4njg4u3
  • autoconf-archive/2023.02.20-gcc-11.3.1-x3v3e3k
  • autoconf/2.72-gcc-11.3.1-zigeprh
  • automake/1.16.5-gcc-11.3.1-fiei3qm
  • bdftopcf/1.1-gcc-11.3.1-pjei4lg
  • berkeley-db/18.1.40-gcc-11.3.1-imacxi5
  • binutils/2.42-gcc-11.3.1-k2azs5r
  • bison/3.8.2-gcc-11.3.1-5ymaspt
  • bzip2/1.0.8-gcc-11.3.1-jftbk52
  • ca-certificates-mozilla/2023-05-30-gcc-11.3.1-3q7ngqu
  • cmake/3.27.9-gcc-11.3.1-ddiy5al
  • cpio/2.15-gcc-11.3.1-gysadjq
  • curl/8.7.1-gcc-11.3.1-ct7pjm2
  • diffutils/3.10-gcc-11.3.1-i2xxusk
  • expat/2.6.2-gcc-11.3.1-i6fawb2
  • findutils/4.9.0-gcc-11.3.1-vrwwaoy
  • fixesproto/5.0-gcc-11.3.1-rzgygqz
  • flex/2.6.3-gcc-11.3.1-lvm5eg4
  • font-util/1.4.0-gcc-11.3.1-kdzp4kf
  • fontconfig/2.15.0-gcc-11.3.1-oknygy4
  • fontsproto/2.1.3-gcc-11.3.1-hnnw5wr
  • freeglut/3.2.2-gcc-11.3.1-bj2wnkh
  • freetype/2.13.2-gcc-11.3.1-mukuuf4
  • gawk/5.3.0-gcc-11.3.1-uuiqxd2
  • gcc-runtime/11.3.1-gcc-11.3.1-ts54e2r
  • gcc/13.2.0-gcc-11.3.1-5cmgvey
  • gdbm/1.23-gcc-11.3.1-hf42icl
  • gettext/0.22.5-gcc-11.3.1-acdclxk
  • glibc/2.34-gcc-11.3.1-rv4ofgg
  • glproto/1.4.17-gcc-11.3.1-xwh2ngh
  • glx/1.4-gcc-11.3.1-exlgy5j
  • gmake/4.4.1-gcc-11.3.1-jpbz7dw
  • gmp/6.2.1-gcc-11.3.1-j2qp7w7
  • gperf/3.1-gcc-11.3.1-umkvi7d
  • htop/3.2.2-gcc-11.3.1-oa7ttvf
  • hwloc/2.9.1-gcc-11.3.1-weybm2e
  • inputproto/2.3.2-gcc-11.3.1-bo7qinx
  • intel-mpi/2019.7.217-gcc-11.3.1-o2z3fjd
  • intel-oneapi-compilers-classic/2021.10.0-gcc-11.3.1-nocpr5m
  • intel-oneapi-compilers/2023.2.4-gcc-11.3.1-ddcm4wi
  • intel-oneapi-mpi/2021.12.1-gcc-11.3.1-4kgjxio
  • kbproto/1.0.7-gcc-11.3.1-6uzvk2c
  • libbsd/0.12.1-gcc-11.3.1-tca36ex
  • libedit/3.1-20230828-gcc-11.3.1-ysouu3l
  • libffi/3.4.6-gcc-11.3.1-clmfd4j
  • libfontenc/1.1.8-gcc-11.3.1-w44jsoi
  • libice/1.1.1-gcc-11.3.1-76rcd55
  • libiconv/1.17-gcc-11.3.1-sqkuf4t
  • libmd/1.0.4-gcc-11.3.1-2yooftr
  • libpciaccess/0.17-gcc-11.3.1-niczctf
  • libpng/1.2.57-gcc-11.3.1-oqex7om
  • libpthread-stubs/0.5-gcc-11.3.1-xfbnr2b
  • libsigsegv/2.14-gcc-11.3.1-5kireuc
  • libsm/1.2.4-gcc-11.3.1-sgdfk4e
  • libtool/2.4.7-gcc-11.3.1-gtphuga
  • libunwind/1.6.2-gcc-11.3.1-6mtzyno
  • libx11/1.8.7-gcc-11.3.1-d5xeazl
  • libxau/1.0.11-gcc-11.3.1-dkkx74b
  • libxcb/1.16-gcc-11.3.1-fulboi2
  • libxcrypt/4.4.35-gcc-11.3.1-7kd52bn
  • libxdmcp/1.1.4-gcc-11.3.1-7fe4723
  • libxext/1.3.5-gcc-11.3.1-7d2ci6d
  • libxfixes/5.0.3-gcc-11.3.1-k53yjzd
  • libxfont/1.5.4-gcc-11.3.1-etmwjjy
  • libxft/2.3.8-gcc-11.3.1-vwyuhmz
  • libxi/1.7.10-gcc-11.3.1-rgytoua
  • libxml2/2.10.3-gcc-11.3.1-e5zt4m2
  • libxrandr/1.5.4-gcc-11.3.1-qxmrns4
  • libxrender/0.9.11-gcc-11.3.1-ao2tic2
  • libxscrnsaver/1.2.4-gcc-11.3.1-zs6aqfi
  • libxt/1.3.0-gcc-11.3.1-lwme3qs
  • libxxf86vm/1.1.5-gcc-11.3.1-6pze4ei
  • llvm/17.0.6-gcc-11.3.1-xlwz53w
  • lua/5.3.6-gcc-11.3.1-hn2ac7j
  • lumerical/2019b-r2-gcc-11.3.1-ejm6mo6
  • lumerical/2021-R1.1-2599-gcc-11.3.1-vvyw6z6
  • m4/1.4.19-gcc-11.3.1-tncckyq
  • mesa-glu/9.0.1-gcc-11.3.1-glwxfkj
  • mesa-glu/9.0.2-gcc-11.3.1-bd2lm7d
  • mesa/23.3.6-gcc-11.3.1-mx6glm4
  • meson/1.3.2-gcc-11.3.1-w7cv5bi
  • mkfontdir/1.0.7-gcc-11.3.1-3xvst7e
  • mkfontscale/1.2.3-gcc-11.3.1-4vmnuo2
  • mpc/1.3.1-gcc-11.3.1-ifuk5gu
  • mpfr/4.2.1-gcc-11.3.1-i6gtxh6
  • ncurses/6.5-gcc-11.3.1-z54b6d4
  • nghttp2/1.57.0-gcc-11.3.1-7c6bk73
  • ninja/1.11.1-gcc-11.3.1-wcew5xr
  • openssl/3.3.0-gcc-11.3.1-y2icle5
  • parallel/20220522-gcc-11.3.1-juzq7oy
  • patchelf/0.17.2-gcc-11.3.1-t6bdsvg
  • pcre2/10.43-gcc-11.3.1-3y4lcyt
  • perl-data-dumper/2.173-gcc-11.3.1-xwk47wz
  • perl/5.38.0-gcc-11.3.1-4wtgagw
  • pigz/2.8-gcc-11.3.1-bswu4yx
  • pkgconf/2.2.0-gcc-11.3.1-reshpid
  • py-mako/1.2.4-gcc-11.3.1-qzty4ic
  • py-markupsafe/2.1.3-gcc-11.3.1-xwc652n
  • py-pip/23.1.2-gcc-11.3.1-bmelf66
  • py-setuptools/69.2.0-gcc-11.3.1-vjtezkl
  • py-wheel/0.41.2-gcc-11.3.1-k5uxoxo
  • python-venv/1.0-gcc-11.3.1-jtnjye4
  • python/3.10.13-gcc-11.3.1-73qnjae
  • python/3.11.7-gcc-11.3.1-hsfwjwr
  • python/3.12.1-gcc-11.3.1-7tuhjhr
  • python/3.9.18-gcc-11.3.1-tc7dyca
  • randrproto/1.5.0-gcc-11.3.1-b6ssnsk
  • re2c/2.2-gcc-11.3.1-vxkdcjq
  • readline/8.2-gcc-11.3.1-qu4mv62
  • renderproto/0.11.1-gcc-11.3.1-uhoybj4
  • scrnsaverproto/1.2.2-gcc-11.3.1-cw3khfj
  • sqlite/3.43.2-gcc-11.3.1-bodsqyx
  • swig/4.1.1-gcc-11.3.1-zaf4wit
  • tar/1.34-gcc-11.3.1-wcuczap
  • tcl/8.6.12-gcc-11.3.1-o3atwx3
  • texinfo/7.0.3-gcc-11.3.1-bhmkpv4
  • tk/8.6.11-gcc-11.3.1-wz3kgh7
  • unzip/6.0-gcc-11.3.1-kptkztv
  • util-linux-uuid/2.38.1-gcc-11.3.1-ekl7sit
  • util-macros/1.19.3-gcc-11.3.1-3ocj656
  • xcb-proto/1.16.0-gcc-11.3.1-scf7ljj
  • xextproto/7.3.0-gcc-11.3.1-6mytvtp
  • xf86vidmodeproto/2.3.1-gcc-11.3.1-gc624cq
  • xproto/7.0.31-gcc-11.3.1-hhddgfu
  • xrandr/1.5.2-gcc-11.3.1-nbhkevq
  • xtrans/1.5.0-gcc-11.3.1-67qr5rx
  • xz/5.4.6-gcc-11.3.1-pswkf4y
  • zlib-ng/2.1.6-gcc-11.3.1-snhalql
  • zlib/1.3.1-gcc-11.3.1-fk7flnb
  • zstd/1.5.6-gcc-11.3.1-mldt4gi
+ +
+

Key:

+

+ loaded + auto-loaded + modulepath +

+
+
+ +

+ > module load MODULE +

+
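For example, to pick up one of the Spack-provided Pythons listed above and confirm it is on your PATH (the exact hash suffix may differ over time):

> module load python/3.11.7-gcc-11.3.1-hsfwjwr
> which python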
+ + +
+ +
+
+

Partitions, JobTesting

+ +

+ > sinfo +

PARTITION    AVAIL  TIMELIMIT   NODES  STATE  NODELIST
TrixieMain*  up     12:00:00        4  drain  cn[131-134]
TrixieMain*  up     12:00:00        2  mix    cn[108-109]
TrixieMain*  up     12:00:00       22  idle   cn[107,110-130]
TrixieLong   up     2-00:00:00      1  drain  cn131
TrixieLong   up     2-00:00:00      2  mix    cn[108-109]
TrixieLong   up     2-00:00:00     22  idle   cn[107,110-130]
JobTesting   up     6:00:00         2  idle   cn[135-136]
+

+ > sbatch --partition=JobTesting ... +

+
+ + +
+ +
+

Account Name

+ +
+

+ See + Account-Codes + for a list of codes +

+

DT Digital Technologies / Technologies Numériques

+
    +
  • dt-dac
  • +
  • dt-dscs
  • +
  • dt-mtp
  • +
  • dt-ta
  • +
+ +

+ > sbatch --account=account_code ... +

+
+
+ +
+

Node's Resources

+ +
+ > sinfo --Node --responding --long + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NODELISTNODESPARTITIONSTATECPUSS:C:TMEMORYTMP_DISKWEIGHTAVAIL_FEREASON
cn1061DevTestidle642:16:219277701(null)none
cn1071TrixieLongidle642:16:219277701(null)none
cn1071TrixieMain*idle642:16:219277701(null)none
cn1081TrixieLongidle642:16:219277701(null)none
...
+
+ + +
+ +
+
+

What Do Nodes Have to Offer?

+ +

+ > scontrol show nodes +

+ +
+            
+NodeName=cn136 Arch=x86_64 CoresPerSocket=16
+  CPUAlloc=0 CPUTot=64 CPULoad=0.01
+  AvailableFeatures=(null)
+  ActiveFeatures=(null)
+  Gres=gpu:4
+  NodeAddr=cn136 NodeHostName=cn136
+  OS=Linux 3.10.0-1160.62.1.el7.x86_64 #1 SMP Tue Apr 5 16:57:59 UTC 2022
+  RealMemory=192777 AllocMem=0 FreeMem=183181 Sockets=2 Boards=1
+  State=IDLE ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
+  Partitions=JobTesting
+  BootTime=2024-05-29T14:23:15 SlurmdStartTime=2024-05-29T14:23:36
+  CfgTRES=cpu=64,mem=192777M,billing=64,gres/gpu=4
+  AllocTRES=
+  CapWatts=n/a
+  CurrentWatts=0 AveWatts=0
+  ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
+            
+          
+
+ + +
+ +
+
+

The GPUs

+
+

Tesla V100-SXM2-32GB

+

May 10, 2017

+

NVIDIA-SMI 550.54.15

+

Driver Version: 550.54.15

+

CUDA Version: 12.4

+

CUDA Compute Capabilities 7.0

+

Single Precision: 15 TFLOPS

+

Tensor Performance (Deep Learning): 120 TFLOPS

+
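A quick way to confirm a job actually sees these GPUs (assumes a PyTorch environment is active; the check itself is illustrative):

> python -c "import torch; print(torch.cuda.is_available(), torch.cuda.get_device_name(0))"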
+
+ + +
+ +
+

CPUs

+
+
    +
  • + processor_type = Intel Xeon Gold 6130 CPU clocked at 2.1GHZ 16 + cores / CPU +
  • +
  • processors_per_node = 2
  • +
  • cores_per_socket = 16
  • +
  • threads_per_core = 2 (hyper-threading on)
  • +
  • RAM = 192 GB memory
  • +
+
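With 2 sockets × 16 cores × 2 threads, each node exposes 64 logical CPUs. A sketch of how a request maps onto that (figures borrowed from the header example later in this deck):

#SBATCH --ntasks-per-node=4  # 4 tasks...
#SBATCH --cpus-per-task=6    # ...× 6 CPUs each = 24 of the 64 logical CPUs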
+
+ +
+
+

Slurm Header Example

+ + + + + + +
+

+#!/bin/bash
+# vim:nowrap:
+
+#SBATCH --job-name=MSLC24.training
+#SBATCH --comment="Metric Score Landscape Challenge 24 NMT training"
+
+# On Trixie
+#SBATCH --partition=TrixieMain
+#SBATCH --account=dt-mtp
+
+#SBATCH --gres=gpu:4
+#SBATCH --time=12:00:00
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=4
+#SBATCH --cpus-per-task=6
+#SBATCH --mem=96G
+# To reserve a whole node for yourself
+##SBATCH --exclusive
+#SBATCH --open-mode=append
+#SBATCH --requeue
+#SBATCH --signal=B:USR1@30
+#SBATCH --output=%x-%j.out
+
+# Fix SLURM environment variables.
+SLURM_JOB_CPUS_PER_NODE=${SLURM_JOB_CPUS_PER_NODE%%(*)} # '24(x2)' => 24
+SLURM_TASKS_PER_NODE=${SLURM_TASKS_PER_NODE%%(*)} # '4(x2)' => '4'
+
+# NOTE: We set OMP_NUM_THREADS or else we get the following Warning:
+# WARNING:torch.distributed.run:
+# *****************************************
+# Setting OMP_NUM_THREADS environment variable for each process to be 1 in
+# default, to avoid your system being overloaded, please further tune the
+# variable for optimal performance in your application as needed.
+# *****************************************
+export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK:-$(nproc)}
+                  
+
+

+# Requeueing on Trixie
+# [source](https://www.sherlock.stanford.edu/docs/user-guide/running-jobs/)
+#
+[source](https://hpc-uit.readthedocs.io/en/latest/jobs/examples.html#how-to-recover-files-before-a-job-times-out)
+function _requeue {
+  echo "BASH - trapping signal 10 - requeueing $SLURM_JOBID"
+  date
+  scontrol requeue $SLURM_JOBID
+}
+
+if [[ -n "$SLURM_JOBID" ]]; then
+  SACC_FORMAT="JobID,Submit,Start,End,Elapsed,ExitCode,State,CPUTime,MaxRSS,"
+  SACC_FORMAT+="MaxVMSize,MaxDiskRead,MaxDiskWrite,AllocCPUs,AllocGRES"
+  SACC_FORMAT+=",AllocTRES%-50,NodeList,JobName%-30,Comment%-80"
+  trap "sacct --jobs $SLURM_JOBID --format=$SACC_FORMAT" 0
+  trap _requeue USR1
+fi
+                  
+
+ + > sbatch my_wonderful.sh +
+ + +
+ +
+
+

Do I Really Need a Script?

+
+            
+> sbatch \
+    --job-name=MSLC24.training \
+    --comment="Metric Score Landscape Challenge 24 NMT training" \
+    --partition=TrixieMain \
+    --account=dt-mtp \
+    --gres=gpu:4 \
+    --time=12:00:00 \
+    --nodes=1 \
+    --ntasks-per-node=4 \
+    --cpus-per-task=6 \
+    --mem=96G \
+    --open-mode=append \
+    --requeue \
+    --signal=B:USR1@30 \
+    --output=%x-%j.out \
+    my_wonderful.sh args ...
+            
+          
+ > sbatch --job-name=MSLC24.training my_wonderful.sh +
+
+ +
+

Limit RAM to Allow no GPU Jobs to Also Run

+ +
+ #SBATCH --mem=96G + +

+ Otherwise the scheduler assumes that you want all of the memory + preventing other jobs to run. +

+
+ + +
+ +
+
+

Make your Job Resumable

+ +
+            
+#SBATCH --requeue
+#SBATCH --signal=B:USR1@30
+
+# Requeueing on Trixie
+# [source](https://www.sherlock.stanford.edu/docs/user-guide/running-jobs/)
+#
+[source](https://hpc-uit.readthedocs.io/en/latest/jobs/examples.html#how-to-recover-files-before-a-job-times-out)
+function _requeue {
+  echo "BASH - trapping signal 10 - requeueing $SLURM_JOBID"
+  date
+  scontrol requeue $SLURM_JOBID
+}
+
+if [[ -n "$SLURM_JOBID" ]]; then
+  SACCT_FORMAT="JobID,Submit,Start,End,Elapsed,ExitCode,State,CPUTime,MaxRSS,MaxVMSize"
+  SACCT_FORMAT+=",MaxDiskRead,MaxDiskWrite,AllocCPUs,AllocGRES,AllocTRES%-50,NodeList"
+  SACCT_FORMAT+=",JobName%-30,Comment%-80"
+  trap "sacct --jobs $SLURM_JOBID --format=$ACCT_FORMAT" 0
+  trap _requeue USR1
+fi
+            
+          
How it works: --signal=B:USR1@30 has Slurm send USR1 to the batch script 30 seconds before the time limit; the trap catches it and requeues the job, which --requeue permits.
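One caveat worth knowing: with --signal=B:USR1@30 the signal is delivered to the batch shell, so for the trap to run while work is in flight, launch the payload in the background and wait on it (the training command here is illustrative):

python train.py &
wait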
+ + +
+ +
+
+

Submit my Job

+ +

queue your job

+

+ > sbatch my_wonderful.sh +

+ +

check that your job is running

+

+ > squeue +

+ + + + + + + + + + + + + + + + + + + + + + + + + +
JOBIDNAMEUSERSTTIMENODESNODELIST(REASON)SUBMIT_TIMECOMMENT
733MSLC24larkinsR7:43:441trixie-cn1012024-07-17T02:26:0MSLC24Training
+
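To list only your own jobs (a standard squeue flag):

> squeue --user=$USER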
+
+ +
+
+

Check nvidia-smi -l for Good GPU Usage

+ > ssh -t cn101 nvidia-smi -l +
+            
++-----------------------------------------------------------------------------+
+| NVIDIA-SMI 510.47.03 Driver Version: 510.47.03 CUDA Version: 11.6           |
+|-------------------------------+----------------------+----------------------+
+| GPU Name Persistence-M        | Bus-Id Disp.A        | Volatile Uncorr. ECC |
+| Fan Temp Perf Pwr:Usage/Cap   | Memory-Usage         | GPU-Util Compute M.  |
+|                               |                      | MIG M.               |
+|===============================+======================+======================|
+| 0 Tesla V100-SXM2... Off      | 00000000:89:00.0 Off | 0                    |
+| N/A 27C P0 53W / 300W         | 0MiB / 32768MiB      | 0% Default           |
+|                               |                      | N/A                  |
++-------------------------------+----------------------+----------------------+
+| 1 Tesla V100-SXM2... Off      | 00000000:8A:00.0 Off | 0                    |
+| N/A 31C P0 54W / 300W         | 0MiB / 32768MiB      | 0% Default           |
+|                               |                      | N/A                  |
++-------------------------------+----------------------+----------------------+
+| 2 Tesla V100-SXM2... Off      | 00000000:B2:00.0 Off | 0                    |
+| N/A 31C P0 55W / 300W         | 0MiB / 32768MiB      | 0% Default           |
+|                               |                      | N/A                  |
++-------------------------------+----------------------+----------------------+
+| 3 Tesla V100-SXM2... Off      | 00000000:B3:00.0 Off | 0                    |
+| N/A 32C P0 54W / 300W         | 0MiB / 32768MiB      | 2% Default           |
+|                               |                      | N/A                  |
++-------------------------------+----------------------+----------------------+
+
++-----------------------------------------------------------------------------+
+| Processes:                                                                  |
+| GPU GI CI PID Type Process name GPU Memory                                  |
+| ID ID Usage                                                                 |
+|=============================================================================|
+| No running processes found                                                  |
++-----------------------------------------------------------------------------+
+            
+          
Here all four GPUs sit near 0% utilization with no running processes; if your job looks like this, the allocation is being wasted.
+
+ +
+

Asking For Too Much

+ +
+

+ > sbatch --mem=400G my_wonderful.sh +

+ + sbatch: error: Batch job submission failed: Memory required by + task is not available + +
+
+ +
+

Jupyter Notebook

+ +

+ Please refer to + Jobs Conda JupyterLab + as it is a bit more involved +

+ +

+ WARNING: Don't let your worker node run if you are not using it +

+ + +
+ +
+

Conclusion

+ +
    +
  • + This good-citizen principle doesn't apply only to Trixie but to + all clusters +
  • +
  • We want to maximize the cluster usage
  • +
  • We want to maximize everyone enjoyment of the cluster
  • +
  • https://github.com/ai4d-iasc/trixie
  • +
  • https://github.com/ai4d-iasc/trixie/wiki
  • +
  • https://github.com/ai4d-iasc/trixie/issues
  • +
+ + +
+
+
+ + + + + +