#!/bin/bash
#SBATCH --job-name=vllm-serve
#SBATCH --nodes=2
#SBATCH --gpus=8
#SBATCH --time=4:00:00
#SBATCH --exclusive
#SBATCH --output=out/%x.%j.out

# --- Runtime environment ------------------------------------------------------
# Activate the project virtualenv. The path is relative, so it is resolved
# against the directory the job runs in. A failure is reported on stderr and
# the script carries on.
source .venv/bin/activate \
    || echo "WARNING: could not activate .venv/bin/activate" >&2

# Hugging Face cache root and the pinned gpt-oss-120b snapshot inside it.
export HF_HOME=/projects/public/brics/hf
export MODEL_PATH="${HF_HOME}/hub/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a/"
export YAML_CONFIG="/projects/public/brics/distributed_vllm/GPT-OSS_Hopper.yaml"
# Fix issue https://github.com/vllm-project/vllm/issues/22525#issuecomment-3172271363
export TIKTOKEN_ENCODINGS_BASE="/projects/public/brics/distributed_vllm/etc/encodings"
export TENSOR_PARALLELISM_SIZE=8

# Resolve "<hostname>-hsn0" (presumably the high-speed network interface of
# this host; confirm with the site documentation). The value is assigned
# first and exported afterwards so that a failing lookup is not hidden behind
# the exit status of `export`.
if ! SERVER_ADDRESS=$(dig +short "${HOSTNAME}-hsn0") || [[ -z "${SERVER_ADDRESS}" ]]; then
    echo "WARNING: could not resolve ${HOSTNAME}-hsn0; SERVER_ADDRESS='${SERVER_ADDRESS}'" >&2
fi
export SERVER_ADDRESS
echo "SERVING ON $HOSTNAME with TENSOR_PARALLELISM_SIZE=$TENSOR_PARALLELISM_SIZE"

# Bring the site-provided NCCL module into the environment, then list every
# loaded module so the job log records what was active.
module load brics/nccl
module list

# Environment switches read by vLLM, exported together in one statement.
# The names indicate debug-level logging, a symmetric-memory all-reduce toggle
# (set to 0), and three Ray-related toggles (set to 1); see the vLLM
# environment-variable documentation for the exact semantics.
export \
    VLLM_LOGGING_LEVEL=DEBUG \
    VLLM_ALLREDUCE_USE_SYMM_MEM=0 \
    VLLM_USE_RAY_COMPILED_DAG=1 \
    VLLM_USE_RAY_SPMD_WORKER=1 \
    VLLM_USE_RAY_SPMD_HEAD=1

# --- Ray cluster topology -----------------------------------------------------
# Expand the Slurm node list once: the first host becomes the Ray head and any
# remaining hosts become Ray workers (kept as a newline-separated string).
allocated_hosts=$(scontrol show hostnames "${SLURM_NODELIST:-}")
HEAD_NODE=$(head -n 1 <<<"${allocated_hosts}")
WORKER_NODES=$(tail -n +2 <<<"${allocated_hosts}")
if [[ -z "${HEAD_NODE}" ]]; then
    echo "ERROR: could not determine the head node from SLURM_NODELIST='${SLURM_NODELIST:-}'" >&2
    exit 1
fi

# `dig +short` exits 0 and prints nothing for an unknown name, and it may print
# CNAME targets or diagnostic lines besides the address. Keep only the first
# IPv4-looking line and treat an empty result as fatal, because every later
# step needs this address.
HEAD_NODE_IP=$(dig +short "${HEAD_NODE}" | grep -E -m 1 '^[0-9]+(\.[0-9]+){3}$')
if [[ -z "${HEAD_NODE_IP}" ]]; then
    echo "ERROR: could not resolve an IPv4 address for head node '${HEAD_NODE}'" >&2
    exit 1
fi

RAY_PORT=6378
RAY_ADDRESS="${HEAD_NODE_IP}:${RAY_PORT}"
# Export after assignment so command failures above are not masked by `export`.
export HEAD_NODE WORKER_NODES HEAD_NODE_IP RAY_PORT RAY_ADDRESS

# Start the Ray head node in the background.
# `ray start --block` keeps the job step in the foreground of its srun, so the
# srun itself is backgrounded here and collected by the script's final `wait`.
# Abort early with a clear message if the values this step needs are missing.
: "${HEAD_NODE:?HEAD_NODE must be set before starting the Ray head}"
: "${HEAD_NODE_IP:?HEAD_NODE_IP must be set before starting the Ray head}"
: "${RAY_PORT:?RAY_PORT must be set before starting the Ray head}"
echo "Starting head node $HEAD_NODE..."
srun \
    --nodelist "$HEAD_NODE" \
    --nodes=1 \
    --gpus=4 \
    --cpus-per-task 72 \
    --ntasks-per-node 1 \
    bash -c "export VLLM_HOST_IP=$HEAD_NODE_IP; ray start --block --head --node-ip-address=$HEAD_NODE_IP --port=$RAY_PORT" &
head_step_pid=$!
# Fixed grace period for the head to come up before anything tries to join it.
sleep 20
# A blocking head step that has already exited means startup failed; stop here
# instead of launching workers against a head that is not there.
if ! kill -0 "$head_step_pid" 2>/dev/null; then
    echo "ERROR: Ray head job step on $HEAD_NODE exited during startup" >&2
    exit 1
fi

# Launch one background Ray worker job step per remaining node and have each
# one join the head at RAY_ADDRESS.
: "${RAY_ADDRESS:?RAY_ADDRESS must be set before starting Ray workers}"
echo "Starting worker nodes..."
worker_step_pids=()
# WORKER_NODES is a whitespace-separated host list; word splitting is intended.
for WORKER in $WORKER_NODES; do
    # `dig +short` prints nothing for an unknown name and may print CNAME
    # targets before the address, so keep only the first IPv4-looking line.
    WORKER_IP=$(dig +short "${WORKER}" | grep -E -m 1 '^[0-9]+(\.[0-9]+){3}$')
    if [[ -z "${WORKER_IP}" ]]; then
        echo "ERROR: could not resolve an IPv4 address for worker node '${WORKER}'" >&2
        exit 1
    fi
    echo "Starting worker node: $WORKER with IP $WORKER_IP"

    srun \
        --nodelist "$WORKER" \
        --nodes=1 \
        --gpus=4 \
        --cpus-per-task 72 \
        --ntasks-per-node 1 \
        bash -c "export VLLM_HOST_IP=$WORKER_IP; ray start --block --address=$RAY_ADDRESS --node-ip-address=$WORKER_IP" &
    worker_step_pids+=("$!")
done
# Fixed grace period for the workers to register with the head.
sleep 20
# `ray start --block` should still be running; a step that already exited
# means that worker failed to join.
for worker_pid in "${worker_step_pids[@]}"; do
    if ! kill -0 "$worker_pid" 2>/dev/null; then
        echo "ERROR: a Ray worker job step (pid $worker_pid) exited during startup" >&2
        exit 1
    fi
done

# Print the Ray cluster summary from the head node. `--overlap` lets this
# short step share the resources already held by the running head step.
: "${HEAD_NODE:?HEAD_NODE must be set before checking cluster status}"
echo "Checking cluster status..."
if ! srun \
    --overlap \
    --nodelist "$HEAD_NODE" \
    --nodes=1 \
    --gpus=4 \
    --ntasks-per-node 1 \
    ray status; then
    # Report the failure but keep going: the status check is informational.
    echo "WARNING: 'ray status' failed; the Ray cluster may not be healthy" >&2
fi

# Block until the backgrounded Ray job steps exit, which keeps the batch
# script (and with it the allocation) alive while the cluster is running.
wait