Skip to content

Commit

Permalink
fix too many leading spaces
Browse files Browse the repository at this point in the history
  • Loading branch information
SamuelLarkin committed Jul 18, 2024
1 parent e1491c3 commit 4634069
Showing 1 changed file with 135 additions and 135 deletions.
270 changes: 135 additions & 135 deletions web/BeAGoodClusterCitizen.html
Original file line number Diff line number Diff line change
Expand Up @@ -546,22 +546,22 @@ <h1>What Do Nodes Have to Offer?</h1>

<pre>
<code>
NodeName=cn136 Arch=x86_64 CoresPerSocket=16
CPUAlloc=0 CPUTot=64 CPULoad=0.01
AvailableFeatures=(null)
ActiveFeatures=(null)
Gres=gpu:4
NodeAddr=cn136 NodeHostName=cn136
OS=Linux 3.10.0-1160.62.1.el7.x86_64 #1 SMP Tue Apr 5 16:57:59 UTC 2022
RealMemory=192777 AllocMem=0 FreeMem=183181 Sockets=2 Boards=1
State=IDLE ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
Partitions=JobTesting
BootTime=2024-05-29T14:23:15 SlurmdStartTime=2024-05-29T14:23:36
CfgTRES=cpu=64,mem=192777M,billing=64,gres/gpu=4
AllocTRES=
CapWatts=n/a
CurrentWatts=0 AveWatts=0
ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
NodeName=cn136 Arch=x86_64 CoresPerSocket=16
CPUAlloc=0 CPUTot=64 CPULoad=0.01
AvailableFeatures=(null)
ActiveFeatures=(null)
Gres=gpu:4
NodeAddr=cn136 NodeHostName=cn136
OS=Linux 3.10.0-1160.62.1.el7.x86_64 #1 SMP Tue Apr 5 16:57:59 UTC 2022
RealMemory=192777 AllocMem=0 FreeMem=183181 Sockets=2 Boards=1
State=IDLE ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
Partitions=JobTesting
BootTime=2024-05-29T14:23:15 SlurmdStartTime=2024-05-29T14:23:36
CfgTRES=cpu=64,mem=192777M,billing=64,gres/gpu=4
AllocTRES=
CapWatts=n/a
CurrentWatts=0 AveWatts=0
ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
</code>
</pre>
</div>
Expand Down Expand Up @@ -627,62 +627,62 @@ <h1>Slurm Header Example</h1>
<tr>
<td>
<pre><code data-trim data-noescape>
#!/bin/bash
# vim:nowrap:

#SBATCH --job-name=MSLC24.training
#SBATCH --comment="Metric Score Landscape Challenge 24 NMT training"

# On Trixie
#SBATCH --partition=TrixieMain
#SBATCH --account=dt-mtp

#SBATCH --gres=gpu:4
#SBATCH --time=12:00:00
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=6
#SBATCH --mem=96G
# To reserve a whole node for yourself
##SBATCH --exclusive
#SBATCH --open-mode=append
#SBATCH --requeue
#SBATCH --signal=B:USR1@30
#SBATCH --output=%x-%j.out

# Fix SLURM environment variables.
SLURM_JOB_CPUS_PER_NODE=${SLURM_JOB_CPUS_PER_NODE%%(*)} # '24(x2)' => 24
SLURM_TASKS_PER_NODE=${SLURM_TASKS_PER_NODE%%(*)} # '4(x2)' => '4'

# NOTE: We set OMP_NUM_THREADS or else we get the following Warning:
# WARNING:torch.distributed.run:
# *****************************************
# Setting OMP_NUM_THREADS environment variable for each process to be 1 in
# default, to avoid your system being overloaded, please further tune the
# variable for optimal performance in your application as needed.
# *****************************************
export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK:-$(nproc)}
#!/bin/bash
# vim:nowrap:

#SBATCH --job-name=MSLC24.training
#SBATCH --comment="Metric Score Landscape Challenge 24 NMT training"

# On Trixie
#SBATCH --partition=TrixieMain
#SBATCH --account=dt-mtp

#SBATCH --gres=gpu:4
#SBATCH --time=12:00:00
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=6
#SBATCH --mem=96G
# To reserve a whole node for yourself
##SBATCH --exclusive
#SBATCH --open-mode=append
#SBATCH --requeue
#SBATCH --signal=B:USR1@30
#SBATCH --output=%x-%j.out

# Fix SLURM environment variables.
SLURM_JOB_CPUS_PER_NODE=${SLURM_JOB_CPUS_PER_NODE%%(*)} # '24(x2)' => 24
SLURM_TASKS_PER_NODE=${SLURM_TASKS_PER_NODE%%(*)} # '4(x2)' => '4'

# NOTE: We set OMP_NUM_THREADS or else we get the following Warning:
# WARNING:torch.distributed.run:
# *****************************************
# Setting OMP_NUM_THREADS environment variable for each process to be 1 in
# default, to avoid your system being overloaded, please further tune the
# variable for optimal performance in your application as needed.
# *****************************************
export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK:-$(nproc)}
</code></pre>
</td>
<td>
<pre><code data-trim data-noescape>
# Requeueing on Trixie
# [source](https://www.sherlock.stanford.edu/docs/user-guide/running-jobs/)
#
# [source](https://hpc-uit.readthedocs.io/en/latest/jobs/examples.html#how-to-recover-files-before-a-job-times-out)
function _requeue {
echo "BASH - trapping signal 10 - requeueing $SLURM_JOBID"
date
scontrol requeue $SLURM_JOBID
}

if [[ -n "$SLURM_JOBID" ]]; then
SACC_FORMAT="JobID,Submit,Start,End,Elapsed,ExitCode,State,CPUTime,MaxRSS,"
SACC_FORMAT+="MaxVMSize,MaxDiskRead,MaxDiskWrite,AllocCPUs,AllocGRES"
SACC_FORMAT+=",AllocTRES%-50,NodeList,JobName%-30,Comment%-80"
trap "sacct --jobs $SLURM_JOBID --format=$SACC_FORMAT" 0
trap _requeue USR1
fi
# Requeueing on Trixie
# [source](https://www.sherlock.stanford.edu/docs/user-guide/running-jobs/)
#
# [source](https://hpc-uit.readthedocs.io/en/latest/jobs/examples.html#how-to-recover-files-before-a-job-times-out)
function _requeue {
echo "BASH - trapping signal 10 - requeueing $SLURM_JOBID"
date
scontrol requeue $SLURM_JOBID
}

if [[ -n "$SLURM_JOBID" ]]; then
SACC_FORMAT="JobID,Submit,Start,End,Elapsed,ExitCode,State,CPUTime,MaxRSS,"
SACC_FORMAT+="MaxVMSize,MaxDiskRead,MaxDiskWrite,AllocCPUs,AllocGRES"
SACC_FORMAT+=",AllocTRES%-50,NodeList,JobName%-30,Comment%-80"
trap "sacct --jobs $SLURM_JOBID --format=$SACC_FORMAT" 0
trap _requeue USR1
fi
</code></pre>
</td>
</tr>
Expand All @@ -705,22 +705,22 @@ <h1>Slurm Header Example</h1>
<h1>Do I Really Need a Script?</h1>
<pre>
<code style="color: green;">
> sbatch \
--job-name=MSLC24.training \
--comment="Metric Score Landscape Challenge 24 NMT training" \
--partition=TrixieMain \
--account=dt-mtp \
--gres=gpu:4 \
--time=12:00:00 \
--nodes=1 \
--ntasks-per-node=4 \
--cpus-per-task=6 \
--mem=96G \
--open-mode=append \
--requeue \
--signal=B:USR1@30 \
--output=%x-%j.out \
my_wonderful.sh args ...
> sbatch \
--job-name=MSLC24.training \
--comment="Metric Score Landscape Challenge 24 NMT training" \
--partition=TrixieMain \
--account=dt-mtp \
--gres=gpu:4 \
--time=12:00:00 \
--nodes=1 \
--ntasks-per-node=4 \
--cpus-per-task=6 \
--mem=96G \
--open-mode=append \
--requeue \
--signal=B:USR1@30 \
--output=%x-%j.out \
my_wonderful.sh args ...
</code>
</pre>
<code style="color: green"
Expand Down Expand Up @@ -756,26 +756,26 @@ <h1>Make your Job Resumable</h1>

<pre>
<code style="color: green;">
#SBATCH --requeue
#SBATCH --signal=B:USR1@30

# Requeueing on Trixie
# [source](https://www.sherlock.stanford.edu/docs/user-guide/running-jobs/)
#
# [source](https://hpc-uit.readthedocs.io/en/latest/jobs/examples.html#how-to-recover-files-before-a-job-times-out)
function _requeue {
echo "BASH - trapping signal 10 - requeueing $SLURM_JOBID"
date
scontrol requeue $SLURM_JOBID
}

if [[ -n "$SLURM_JOBID" ]]; then
SACCT_FORMAT="JobID,Submit,Start,End,Elapsed,ExitCode,State,CPUTime,MaxRSS,MaxVMSize"
SACCT_FORMAT+=",MaxDiskRead,MaxDiskWrite,AllocCPUs,AllocGRES,AllocTRES%-50,NodeList"
SACCT_FORMAT+=",JobName%-30,Comment%-80"
trap "sacct --jobs $SLURM_JOBID --format=$SACCT_FORMAT" 0
trap _requeue USR1
fi
#SBATCH --requeue
#SBATCH --signal=B:USR1@30

# Requeueing on Trixie
# [source](https://www.sherlock.stanford.edu/docs/user-guide/running-jobs/)
#
# [source](https://hpc-uit.readthedocs.io/en/latest/jobs/examples.html#how-to-recover-files-before-a-job-times-out)
function _requeue {
echo "BASH - trapping signal 10 - requeueing $SLURM_JOBID"
date
scontrol requeue $SLURM_JOBID
}

if [[ -n "$SLURM_JOBID" ]]; then
SACCT_FORMAT="JobID,Submit,Start,End,Elapsed,ExitCode,State,CPUTime,MaxRSS,MaxVMSize"
SACCT_FORMAT+=",MaxDiskRead,MaxDiskWrite,AllocCPUs,AllocGRES,AllocTRES%-50,NodeList"
SACCT_FORMAT+=",JobName%-30,Comment%-80"
trap "sacct --jobs $SLURM_JOBID --format=$SACCT_FORMAT" 0
trap _requeue USR1
fi
</code>
</pre>
<a
Expand Down Expand Up @@ -843,37 +843,37 @@ <h3>
<code style="color: green">&gt; ssh -t cn101 nvidia-smi -l </code>
<pre style="font-size: 0.35em">
<samp>
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03 Driver Version: 510.47.03 CUDA Version: 11.6 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 Tesla V100-SXM2... Off | 00000000:89:00.0 Off | 0 |
| N/A 27C P0 53W / 300W | 0MiB / 32768MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
| 1 Tesla V100-SXM2... Off | 00000000:8A:00.0 Off | 0 |
| N/A 31C P0 54W / 300W | 0MiB / 32768MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
| 2 Tesla V100-SXM2... Off | 00000000:B2:00.0 Off | 0 |
| N/A 31C P0 55W / 300W | 0MiB / 32768MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
| 3 Tesla V100-SXM2... Off | 00000000:B3:00.0 Off | 0 |
| N/A 32C P0 54W / 300W | 0MiB / 32768MiB | 2% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03 Driver Version: 510.47.03 CUDA Version: 11.6 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 Tesla V100-SXM2... Off | 00000000:89:00.0 Off | 0 |
| N/A 27C P0 53W / 300W | 0MiB / 32768MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
| 1 Tesla V100-SXM2... Off | 00000000:8A:00.0 Off | 0 |
| N/A 31C P0 54W / 300W | 0MiB / 32768MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
| 2 Tesla V100-SXM2... Off | 00000000:B2:00.0 Off | 0 |
| N/A 31C P0 55W / 300W | 0MiB / 32768MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
| 3 Tesla V100-SXM2... Off | 00000000:B3:00.0 Off | 0 |
| N/A 32C P0 54W / 300W | 0MiB / 32768MiB | 2% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
</samp>
</pre>
</div>
Expand Down

0 comments on commit 4634069

Please sign in to comment.