#!/bin/bash
#SBATCH --job-name=bench-nccl
#SBATCH --nodes=2
#SBATCH --gpus=8
#SBATCH --time=00:10:00
#SBATCH --output=%x-%j.out
#SBATCH --error=%x-%j.err

# Launch the NCCL all_reduce_perf benchmark inside a Singularity container.
# One task runs on each of the 2 nodes and each task drives 4 GPUs (-g 4),
# which accounts for the 8 GPUs requested from Slurm.
#
# The command line is assembled from arrays and every $HOME expansion is
# quoted, so a home directory containing whitespace or glob characters cannot
# split or alter the arguments handed to srun/singularity.

# Container image, and the host directory that provides the benchmark binary
# (it is mounted at /opt/slingshot inside the container).
sif_image="${HOME}/sif-images/pytorch.sif"
nccl_build_dir="${HOME}/nccl_build"

# Job-step geometry and launcher settings.
srun_opts=(
  -N 2
  --gpus 8
  --cpus-per-task 72
  --tasks-per-node 1
  # NOTE(review): fabric/site-specific setting; confirm it is still required.
  --network=disable_rdzv_get
  --mpi=pmi2
)

# Container runtime options.
container_opts=(
  # Enable NVIDIA GPU support inside the container.
  --nv
  # Host libfabric and system libraries, mounted read-only under /host.
  # NOTE(review): presumably consumed by the NCCL network plugin in the
  # container; confirm against the image's library search path.
  --bind /opt/cray/libfabric/1.22.0:/host/opt/cray/libfabric/1.22.0:ro
  --bind /usr/lib64:/host/usr/lib64:ro
  --bind "${nccl_build_dir}:/opt/slingshot"
)

# Benchmark invocation: sweep message sizes from 32KB to 8GB, multiplying the
# size by 2 at each step, using 4 GPUs per task.
bench_cmd=(
  /opt/slingshot/nccl-tests/all_reduce_perf
  -b 32KB
  -e 8GB
  -f 2
  -g 4
)

srun "${srun_opts[@]}" \
  singularity exec "${container_opts[@]}" "${sif_image}" \
  "${bench_cmd[@]}"
