#!/usr/bin/env bash
#
# Start `vllm serve` on the head node of a Ray cluster that runs inside an
# existing Slurm job.
#
# Usage:   <script> <ray_jobid>
# Example: <script> 160852
#
#   ray_jobid  ID of the Slurm job the server is attached to.
#
# Exit status: 1 when the argument count is wrong.

echo "STARTING VLLM SERVE ON RAY CLUSTER"

# Exactly one positional argument (the job ID) is required.
if [ "$#" -ne 1 ]; then
    # Usage text is a diagnostic, so it goes to stderr rather than stdout.
    echo "Usage: $0 <ray_jobid>" >&2
    echo "Example: $0 160852" >&2
    exit 1
fi

# Slurm job ID given as the script's only argument.
RAY_JOBID=$1

# Ask Slurm for the job's node list. squeue's %R field only contains a node
# list for jobs that have nodes allocated; otherwise it is a parenthesised
# reason such as "(Priority)", and for an unknown job the output is empty.
# Both cases are rejected here instead of being handed on to scontrol.
ray_nodelist=$(squeue -j "${RAY_JOBID}" -h -o %R)
case "${ray_nodelist}" in
    '' | \(*)
        echo "Error: job '${RAY_JOBID}' has no allocated nodes (squeue reported: '${ray_nodelist}')" >&2
        exit 1
        ;;
esac

# Expand the compact node list and treat the first hostname as the head node.
HEAD_NODE=$(scontrol show hostnames "${ray_nodelist}" | head -n 1)
if [ -z "${HEAD_NODE}" ]; then
    echo "Error: could not determine the head node from node list '${ray_nodelist}'" >&2
    exit 1
fi

# `dig +short` may print nothing (lookup failed) or several lines (CNAME
# targets and/or multiple A records). Keep only the first IPv4 address so the
# value is a single usable IP.
HEAD_NODE_IP=$(dig +short "${HEAD_NODE}" \
    | grep -E '^[0-9]{1,3}(\.[0-9]{1,3}){3}$' \
    | head -n 1)
if [ -z "${HEAD_NODE_IP}" ]; then
    echo "Error: could not resolve an IPv4 address for head node '${HEAD_NODE}'" >&2
    exit 1
fi

# Reset the module environment, then load the NCCL module this script needs.
module reset
if ! module load brics/nccl; then
    # Continuing without the requested module would start the server in an
    # unintended environment, so stop here.
    echo "Error: failed to load module 'brics/nccl'" >&2
    exit 1
fi
# Print the loaded modules so they show up in the job output.
module list

# Activate the Python virtual environment.
# NOTE: the path is relative to the current working directory, so the script
# has to be started from the directory that contains `.venv`.
if ! source .venv/bin/activate; then
    echo "Error: could not activate '.venv/bin/activate' (run this script from the directory containing .venv)" >&2
    exit 1
fi

# Model and configuration locations.
# HF_HOME is assigned without `export`; within this script it is only used to
# build MODEL_PATH.
HF_HOME='/projects/public/brics/hf'
MODEL_NAME='openai/gpt-oss-120b'
MODEL_PATH="${HF_HOME}/hub/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a/"
YAML_CONFIG='/projects/public/brics/distributed_vllm/GPT-OSS_Hopper.yaml'

# Settings passed to child processes through the environment.
# VLLM_HOST_IP takes whatever HEAD_NODE_IP holds at this point.
TIKTOKEN_ENCODINGS_BASE='/projects/public/brics/distributed_vllm/etc/encodings'
VLLM_HOST_IP="${HEAD_NODE_IP}"
VLLM_LOGGING_LEVEL='DEBUG'
VLLM_ALLREDUCE_USE_SYMM_MEM='0'
VLLM_NCCL_USE_SYMM_MEM='0'
export TIKTOKEN_ENCODINGS_BASE \
    VLLM_HOST_IP \
    VLLM_LOGGING_LEVEL \
    VLLM_ALLREDUCE_USE_SYMM_MEM \
    VLLM_NCCL_USE_SYMM_MEM

# Run `vllm serve` as a single-task step of the existing Slurm job, pinned to
# the head node. Every expansion is quoted so that an unexpected space or glob
# character in a value cannot split or rewrite an argument.
# NOTE(review): the step requests 4 GPUs on one node while the tensor parallel
# size is 8 -- presumably the remaining GPUs are provided by other nodes of the
# Ray cluster; confirm against the cluster layout.
srun \
    --overlap \
    --jobid="${RAY_JOBID}" \
    --nodelist="${HEAD_NODE}" \
    --nodes=1 \
    --gpus=4 \
    --ntasks-per-node=1 \
    vllm serve \
    "${MODEL_PATH}" \
    --served-model-name "${MODEL_NAME}" \
    --distributed-executor-backend ray \
    --port 8000 \
    --max-num-seqs 512 \
    --config "${YAML_CONFIG}" \
    --tensor_parallel_size=8
