#!/bin/bash

# Optional: put the environment's bin directory on PATH manually (only affects
# this shell session). PATH entries must be directories, not executables.
# export PATH=/opt/conda/envs/lf/bin:$PATH

# Kept from the original flow: register conda in ~/.bashrc and load the user's
# shell configuration (it may also export other settings this script needs).
conda init
# shellcheck source=/dev/null
source ~/.bashrc

# Sourcing ~/.bashrc is not enough in a script: many default bashrc files
# return early when the shell is non-interactive, leaving the `conda` shell
# function undefined. Load conda's bash hook explicitly so `conda activate`
# works regardless of how this script is started.
eval "$(conda shell.bash hook)"

# Activate the "lf" environment; abort instead of silently continuing with the
# wrong Python environment.
if ! conda activate lf; then
    echo "$(date '+%Y-%m-%d %H:%M:%S') - ERROR: failed to activate conda environment 'lf'" >&2
    exit 1
fi

# MLflow: install the tracking UI plus the packages MLflow uses to collect
# system metrics (psutil, and pynvml for NVIDIA GPUs) from the Tsinghua mirror.
# A failed install is only a warning, because the packages may already be
# present in the environment (e.g. when running offline).
if ! pip install mlflow psutil pynvml -i https://pypi.tuna.tsinghua.edu.cn/simple/; then
    echo "$(date '+%Y-%m-%d %H:%M:%S') - WARNING: pip install failed; continuing with already-installed packages" >&2
fi
# System metrics on AMD/HIP GPUs additionally need pyrsmi:
# pip install pyrsmi

# Enter the LLaMA-Factory working directory. `mlflow ui` is started from here,
# so abort rather than launch it from an unexpected directory.
if ! cd /workspace/LLaMA-Factory; then
    echo "$(date '+%Y-%m-%d %H:%M:%S') - ERROR: cannot enter directory '/workspace/LLaMA-Factory'" >&2
    exit 1
fi

# Turn on MLflow system resource (CPU/memory/GPU) metrics logging.
export MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING=true
echo "$(date '+%Y-%m-%d %H:%M:%S') - MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING: $MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING"

# Start the MLflow UI in the background, listening on all interfaces, port 5000.
# stdout/stderr go to the log file below; replace the path with your own log
# location if needed (e.g. /ts-lf/ts-llamafactory/LLaMA-Factory/start-log/mlflow.log).
nohup mlflow ui --port 5000 --host 0.0.0.0 > /workspace/LLaMA-Factory/mlflow.log 2>&1 &
echo "$(date '+%Y-%m-%d %H:%M:%S') - mlflow Server started in background"


# Stage the model under /dev/shm to speed up loading of model files during
# training. Command format: rclone copy SOURCE_DIR DEST_DIR — change the source
# path to your own model location before use.
# Abort on failure: training against a missing or partially copied model would
# only fail later with a less obvious error.
if ! rclone copy /workspace/model/Qwen2.5-VL-7B-Instruct /dev/shm/llamafactory/model/Qwen2.5-VL-7B-Instruct --transfers 8 -P; then
    echo "$(date '+%Y-%m-%d %H:%M:%S') - ERROR: failed to copy model to /dev/shm/llamafactory/model/Qwen2.5-VL-7B-Instruct" >&2
    exit 1
fi

# Stage the dataset images under /dev/shm as well.
if ! rclone copy /workspace/LLaMA-Factory/data/images /dev/shm/llamafactory/dataset/qa_images/images --transfers 8 -P; then
    echo "$(date '+%Y-%m-%d %H:%M:%S') - ERROR: failed to copy dataset images to /dev/shm/llamafactory/dataset/qa_images/images" >&2
    exit 1
fi

# Enter the working directory. The training command that follows uses paths
# relative to it, so stop here if the directory is unavailable.
# The path is held in one variable so the error message always names the
# directory that was actually attempted.
workdir="/workspace/LLaMA-Factory"
echo "$(date '+%Y-%m-%d %H:%M:%S') - 正在进入工作目录..."
if ! cd "$workdir"; then
    echo "$(date '+%Y-%m-%d %H:%M:%S') - 错误：无法进入目录 '$workdir'" >&2
    exit 1
fi

# ---------------------- accelerate ----------------------
# Launch training via `accelerate launch`. The training script and its YAML
# config are given as relative paths, so they resolve against the current
# working directory at the time of the call.
launch_args=(
    --config_file /workspace/LLaMA-Factory/examples/accelerate/smmc_fsdp_config.yaml
    src/train.py
    examples/train_lora/qwen2.5VL_lora_sccm_FSDP.yaml
)
accelerate launch "${launch_args[@]}"

