# Apptainer/Singularity definition: NVIDIA PyTorch image adapted to use the
# host's Cray libfabric together with an aws-ofi-nccl plugin under /opt/slingshot.
Bootstrap: docker
From: nvcr.io/nvidia/pytorch:26.01-py3

%setup
    # Runs on the build host before %post.
    # Copy the NCCL environment variables script (env_vars.sh must be present
    # in the directory the build is started from) into the container's /opt.
    # The rootfs path is quoted so a build directory containing whitespace
    # does not split the cp arguments.
    cp env_vars.sh "${SINGULARITY_ROOTFS}/opt"

%post
    # Toolchain needed to build autotools-based projects inside the image.
    apt-get update && apt-get install -y --no-install-recommends \
        build-essential git autoconf automake libtool
    # Drop the apt caches to keep the image small.
    apt-get clean && rm -rf /var/lib/apt/lists/*

    # This is specific to this image
    # - Overwrite ld cache for libfabric and the OFI plugin
    # - Ensures that host libfabric is used, and compatible with our aws-ofi-nccl plugin
    # - We will build aws-ofi-nccl into the /opt/slingshot/aws-ofi-nccl folder
    # NOTE(review): the NCCL and aws-ofi-nccl build steps themselves are not
    # part of this section as seen here -- confirm they are performed elsewhere.
    # NOTE(review): the /host/... paths presumably rely on the host filesystem
    # being bind-mounted at /host at run time -- confirm against the launch command.
    sed -i 's|/opt/amazon/efa/lib|/host/opt/cray/libfabric/1.22.0/lib64|g' /etc/ld.so.conf.d/efa.conf
    sed -i 's|/opt/amazon/aws-ofi-nccl/lib|/opt/slingshot/aws-ofi-nccl/lib|g' /etc/ld.so.conf.d/aws-ofi-nccl.conf
    # Rebuild the loader cache so the rewritten search paths take effect.
    ldconfig

%environment
    # Set NCCL environment variables
    . /opt/env_vars.sh

    export CUDA_HOME=/usr/local/cuda
    export NCCL_HOME=/opt/slingshot/nccl
    export LIBFABRIC_HOME=/host/opt/cray/libfabric/1.22.0
    export MPI_HOME=/usr/local/mpi
    export TMPDIR=/tmp
    # Search order: host libfabric, NCCL, the OFI plugin, any pre-existing
    # LD_LIBRARY_PATH, then the host's /usr/lib64 as a last resort.
    # ${VAR:+...} only inserts the previous value (and its separator) when it
    # is non-empty; otherwise an empty path element ("::") would make the
    # dynamic loader search the current working directory.
    export LD_LIBRARY_PATH=$LIBFABRIC_HOME/lib64:$NCCL_HOME/lib:/opt/slingshot/aws-ofi-nccl/lib:${LD_LIBRARY_PATH:+$LD_LIBRARY_PATH:}/host/usr/lib64

%runscript
    # Run whatever command line was passed to the container, replacing the shell.
    exec "$@"