#!/bin/bash

# Environment for the distributed run. nproc_per_node is not assigned in this
# script, so it must already be present in the caller's environment.
GPUS_PER_NODE=$nproc_per_node

# NCCL settings. The interface name and HCA name below are host-specific
# values — NOTE(review): confirm they match the target cluster.
NCCL_DEBUG=WARN
# export NCCL_DEBUG_SUBSYS=ALL
# export NCCL_DEBUG_FILE=nccl-log.%h.%p
NCCL_IB_DISABLE=0
NCCL_SOCKET_IFNAME=eth0
NCCL_IB_HCA=ib7s

# Export everything in one statement so child processes inherit the values.
export GPUS_PER_NODE NCCL_DEBUG NCCL_IB_DISABLE NCCL_SOCKET_IFNAME NCCL_IB_HCA

# Activate the "lf" conda environment for the rest of this script.
#
# `conda init` only rewrites shell startup files, and ~/.bashrc commonly
# returns early when the shell is not interactive, so relying on the two of
# them can leave `conda activate` undefined in a script. ~/.bashrc is still
# sourced (when present) for any other settings it provides, but the conda
# shell hook is loaded explicitly so activation does not depend on it.
if [[ -f ~/.bashrc ]]; then
    # shellcheck disable=SC1090
    source ~/.bashrc
fi

if ! command -v conda >/dev/null 2>&1; then
    echo "$(date '+%Y-%m-%d %H:%M:%S') - 错误：未找到 conda 命令" >&2
    exit 1
fi

# Defines the `conda` shell function (including `activate`) in this shell.
eval "$(conda shell.bash hook)"

# Stop here on failure: continuing would install packages into, and train
# with, whatever Python happens to be first on PATH.
if ! conda activate lf; then
    echo "$(date '+%Y-%m-%d %H:%M:%S') - 错误：无法激活 conda 环境 'lf'" >&2
    exit 1
fi


# Install the pinned training dependencies and the TensorBoard logging stack
# from the Tsinghua PyPI mirror. Abort if either install fails so that training
# never starts with missing or mismatched package versions.
pip_index_url="https://pypi.tuna.tsinghua.edu.cn/simple/"

if ! pip install transformers==4.50.0 datasets==3.2.0 accelerate==1.6.0 peft==0.15.1 -i "$pip_index_url"; then
    echo "$(date '+%Y-%m-%d %H:%M:%S') - 错误：安装训练依赖失败" >&2
    exit 1
fi

if ! pip install tensorflow tensorboard tf-keras -i "$pip_index_url"; then
    echo "$(date '+%Y-%m-%d %H:%M:%S') - 错误：安装 tensorflow/tensorboard 依赖失败" >&2
    exit 1
fi

# cd /ts-lf/ts-llamafactory/LLaMA-Factory
# Enter the working directory. The path is kept in one variable so the error
# message always reports the directory that was actually attempted.
llamafactory_dir="/workspace/LLaMA-Factory"
echo "$(date '+%Y-%m-%d %H:%M:%S') - 正在进入工作目录..."
if ! cd "$llamafactory_dir"; then
    echo "$(date '+%Y-%m-%d %H:%M:%S') - 错误：无法进入目录 '$llamafactory_dir'" >&2
    exit 1
fi

# Copy the model into /dev/shm to speed up loading of the model files during
# training.
# Command form: rclone copy <source dir> <destination dir>. When running this
# yourself, change the source directory to your own path.
rclone copy \
    /workspace/model/Qwen2.5-VL-7B-Instruct \
    /dev/shm/llamafactory/model/Qwen2.5-VL-7B-Instruct \
    --transfers 8 --progress

# Copy the dataset images into /dev/shm as well.
rclone copy \
    /workspace/LLaMA-Factory/data/images \
    /dev/shm/llamafactory/dataset/qa_images/images \
    --transfers 8 --progress

# Print the launch parameters taken from the environment, one "name:value"
# line each. An unset variable prints as an empty value.
for param_name in master_addr nnodes master_port node_rank; do
    printf '%s:%s\n' "$param_name" "${!param_name}"
done
# The last line uses a full-width colon, exactly as the existing output does.
printf '%s：%s\n' "nproc_per_node" "$nproc_per_node"

# Launch training through accelerate.
#
# master_addr, master_port and node_rank must come from the environment.
# They are checked first because an empty value would otherwise drop out of
# the command line and shift the remaining arguments, producing a misleading
# argument-parsing error.
for required_var in master_addr master_port node_rank; do
    if [[ -z "${!required_var}" ]]; then
        echo "$(date '+%Y-%m-%d %H:%M:%S') - 错误：环境变量 '$required_var' 未设置" >&2
        exit 1
    fi
done

# The training YAML path is relative, so it resolves against the current
# working directory at launch time.
accelerate launch \
    --config_file /workspace/LLaMA-Factory/examples/accelerate/accelerate_mmmc_fsdp.yaml \
    --main_process_ip "$master_addr" \
    --main_process_port "$master_port" \
    --machine_rank "$node_rank" \
    /workspace/LLaMA-Factory/src/train.py examples/train_lora/qwen2.5VL_lora_mmmc_FSDP_ACC.yaml

