#!/bin/bash
#SBATCH --job-name=build-nccl
#SBATCH --nodes=1
#SBATCH --gpus=1
#SBATCH --ntasks=1
#SBATCH --time=00:30:00
#SBATCH --output=%x-%j.out
#SBATCH --error=%x-%j.err

# Directory on host to store built libraries and binaries.
# -p keeps a re-submitted job from failing when the directory already exists.
mkdir -p -- "$HOME/nccl_build" || exit 1

# Run the whole build inside the PyTorch container image.
#   * The host libfabric 1.22.0 tree and the host /usr/lib64 are bound
#     read-only under /host.
#   * $HOME/nccl_build is bound at /opt/slingshot and receives every artefact
#     produced below (nccl, hwloc, aws-ofi-nccl, nccl-tests).
# The inner script is wrapped in single quotes so that every $... expansion is
# evaluated inside the container rather than on the host. Because of that
# quoting, the inner script must never contain a single-quote character.
# singularity returns the exit status of the inner script, which in turn
# becomes the exit status of this job.
singularity exec --nv \
    --bind /opt/cray/libfabric/1.22.0:/host/opt/cray/libfabric/1.22.0:ro \
    --bind /usr/lib64:/host/usr/lib64:ro \
    --bind "$HOME/nccl_build:/opt/slingshot" \
    "$HOME/sif-images/pytorch.sif" bash -c '
        # Print a message on stderr and abort the build with a non-zero status.
        die() { echo "build-nccl: $*" >&2; exit 1; }

        export CUDA_HOME=/usr/local/cuda
        export NCCL_HOME=/lib/aarch64-linux-gnu
        export LIBFABRIC_HOME=/host/opt/cray/libfabric/1.22.0
        export MPI_HOME=/usr/local/mpi
        export TMPDIR=/tmp

        # Every git clone below runs with LD_LIBRARY_PATH restricted to
        # /lib/aarch64-linux-gnu (presumably so git only loads libraries that
        # ship with the image - TODO confirm). A failed clone is not fatal on
        # its own: an existing checkout in /tmp is reused, and a missing one
        # is caught by the cd that follows.

        # Build nccl library
        cd /tmp || die "cannot enter /tmp"
        LD_LIBRARY_PATH=/lib/aarch64-linux-gnu git clone --branch "v2.29.2-1" https://github.com/NVIDIA/nccl.git
        cd /tmp/nccl || die "NCCL sources not found in /tmp/nccl"
        mkdir -p /opt/slingshot/nccl || die "cannot create /opt/slingshot/nccl"
        # Device code is generated for compute capability 9.0 only.
        # NOTE(review): the exit status of this make is intentionally not
        # checked. The install goal may copy into a prefix outside BUILDDIR
        # that is not writable inside the container, which would make the
        # command fail even though BUILDDIR was populated - confirm, then
        # either drop the install goal or make this step fatal. A genuinely
        # broken NCCL build should still surface when nccl-tests is built.
        make -j "$(nproc)" install src.build BUILDDIR=/opt/slingshot/nccl NVCC_GENCODE="-gencode=arch=compute_90,code=sm_90"
        # From here on NCCL_HOME points at the freshly built tree.
        export NCCL_HOME=/opt/slingshot/nccl

        # Build hwloc library, required to build aws-ofi-nccl
        cd /tmp || die "cannot enter /tmp"
        LD_LIBRARY_PATH=/lib/aarch64-linux-gnu git clone --branch "v2.13" https://github.com/open-mpi/hwloc.git
        cd /tmp/hwloc || die "hwloc sources not found in /tmp/hwloc"
        ./autogen.sh || die "hwloc autogen.sh failed"
        ./configure --disable-nvml --prefix=/opt/slingshot/hwloc || die "hwloc configure failed"
        make -j "$(nproc)" install || die "hwloc build/install failed"

        # Build aws-ofi-nccl
        cd /tmp || die "cannot enter /tmp"
        LD_LIBRARY_PATH=/lib/aarch64-linux-gnu git clone --branch "v1.18.0" https://github.com/aws/aws-ofi-nccl.git
        cd /tmp/aws-ofi-nccl || die "aws-ofi-nccl sources not found in /tmp/aws-ofi-nccl"
        ./autogen.sh || die "aws-ofi-nccl autogen.sh failed"
        # The host /usr/lib64 (bound at /host/usr/lib64) is appended, not
        # prepended, so directories already on the path are searched first.
        export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/host/usr/lib64"
        ./configure --prefix=/opt/slingshot/aws-ofi-nccl \
            --with-cuda="${CUDA_HOME}" \
            --with-libfabric="${LIBFABRIC_HOME}" \
            --with-mpi="${MPI_HOME}" \
            --with-hwloc=/opt/slingshot/hwloc \
            --disable-tests || die "aws-ofi-nccl configure failed"
        make -j "$(nproc)" install || die "aws-ofi-nccl build/install failed"

        # Build nccl-tests, if required - this is optional but useful for testing purposes
        # NOTE(review): unlike the other repositories, this clone is not
        # pinned to a tag or branch.
        cd /tmp || die "cannot enter /tmp"
        LD_LIBRARY_PATH=/lib/aarch64-linux-gnu git clone https://github.com/NVIDIA/nccl-tests.git
        cd /tmp/nccl-tests || die "nccl-tests sources not found in /tmp/nccl-tests"
        make -j "$(nproc)" MPI=1 || die "nccl-tests build failed"
        # Copy the contents of build/ rather than the directory itself so a
        # re-run refreshes /opt/slingshot/nccl-tests in place instead of
        # nesting a second build/ directory inside it.
        mkdir -p /opt/slingshot/nccl-tests || die "cannot create /opt/slingshot/nccl-tests"
        cp -r /tmp/nccl-tests/build/. /opt/slingshot/nccl-tests/ || die "copying nccl-tests binaries failed"
    '
