#!/bin/bash

# Activate the "lf" conda environment for the remainder of this script.
#
# Alternative (temporary, current shell only): prepend the environment's bin
# directory to PATH instead of activating it. PATH entries must be directories,
# not the path of an executable:
# export PATH=/opt/conda/envs/lf/bin:$PATH

# `conda init` only edits shell startup files; kept so that interactive shells
# opened later can use `conda activate`.
conda init
# Kept for any other settings ~/.bashrc may provide. It cannot be relied on to
# load conda here: many ~/.bashrc files return early for non-interactive
# shells, before the hook appended by `conda init` is reached.
source ~/.bashrc

# Load conda's shell functions directly so `conda activate` works in a script.
if ! conda_base="$(conda info --base)"; then
    echo "$(date '+%Y-%m-%d %H:%M:%S') - error: 'conda info --base' failed; is conda installed and on PATH?" >&2
    exit 1
fi
# shellcheck source=/dev/null
source "${conda_base}/etc/profile.d/conda.sh"

# Abort instead of silently continuing with the wrong Python environment.
if ! conda activate lf; then
    echo "$(date '+%Y-%m-%d %H:%M:%S') - error: failed to activate conda environment 'lf'" >&2
    exit 1
fi

# Install rclone, which this script uses to copy the model and dataset into
# /dev/shm. Skipped when rclone is already available.
if ! command -v rclone >/dev/null 2>&1; then
    # apt-get is the script-stable front end; a failed index refresh is not
    # fatal by itself, the install below decides.
    apt-get update
    # -y answers the confirmation prompt; without it an unattended run stops
    # at the prompt and rclone is never installed.
    if ! DEBIAN_FRONTEND=noninteractive apt-get install -y rclone; then
        echo "$(date '+%Y-%m-%d %H:%M:%S') - error: failed to install rclone" >&2
        exit 1
    fi
fi


# ---- MLflow ----
# Install MLflow together with psutil and pynvml (packages related to system
# metrics collection), using the Tsinghua PyPI mirror.
pip install mlflow psutil pynvml -i https://pypi.tuna.tsinghua.edu.cn/simple/
# For AMD/HIP GPUs, install pyrsmi as well:
#pip install pyrsmi

# Enter the LLaMA-Factory working directory; `mlflow ui` is started from here.
if ! cd /workspace/LLaMA-Factory; then
    echo "$(date '+%Y-%m-%d %H:%M:%S') - error: cannot enter directory '/workspace/LLaMA-Factory'" >&2
    exit 1
fi

# Turn on MLflow system resource metrics logging (exported so that child
# processes started by this script inherit it).
export MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING=true
echo "$(date '+%Y-%m-%d %H:%M:%S') - MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING: $MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING"

# Start the MLflow UI in the background on port 5000, listening on all
# interfaces, with stdout and stderr redirected to a log file. Replace the log
# path below with your own log directory if needed.
# The directory is created first: if it is missing, the redirection fails and
# mlflow is never started.
mkdir -p /workspace/LLaMA-Factory/start-log
nohup mlflow ui --port 5000 --host 0.0.0.0 > /workspace/LLaMA-Factory/start-log/mlflow.log 2>&1 &
echo "$(date '+%Y-%m-%d %H:%M:%S') - mlflow Server started in background"

# DeepSpeed must be installed before it can be used for training.
# The version is pinned to 0.16.4 and fetched from the Tsinghua PyPI mirror.
pip install \
    --index-url https://pypi.tuna.tsinghua.edu.cn/simple/ \
    deepspeed==0.16.4


# Copy the model and the dataset images into /dev/shm to speed up loading of
# these files during training.
# Command format: rclone copy <source dir> <destination dir>. Change the source
# paths to your own locations before running.
# Each copy is checked: continuing after a failed or partial copy would start
# training against missing or incomplete files.

# Copy the model to /dev/shm.
if ! rclone copy /workspace/model/Qwen2.5-VL-7B-Instruct /dev/shm/llamafactory/model/Qwen2.5-VL-7B-Instruct --transfers 8 -P; then
    echo "$(date '+%Y-%m-%d %H:%M:%S') - error: failed to copy model to /dev/shm" >&2
    exit 1
fi

# Copy the dataset images to /dev/shm.
if ! rclone copy /workspace/LLaMA-Factory/data/images /dev/shm/llamafactory/dataset/qa_images/images --transfers 8 -P; then
    echo "$(date '+%Y-%m-%d %H:%M:%S') - error: failed to copy dataset images to /dev/shm" >&2
    exit 1
fi


# Enter the working directory; abort if it is not accessible.
# The path is held in one variable so the error message always names the
# directory that was actually attempted.
workdir="/workspace/LLaMA-Factory"
echo "$(date '+%Y-%m-%d %H:%M:%S') - 正在进入工作目录..."
if ! cd "$workdir"; then
    echo "$(date '+%Y-%m-%d %H:%M:%S') - 错误：无法进入目录 '${workdir}'" >&2
    exit 1
fi

# ======================== llamafactory-cli ========================
# Launch training with llamafactory-cli. The config path is relative, so it is
# resolved against the current directory.
# FORCE_TORCHRUN=1 is set for this one command only.
# NOTE(review): presumably this makes llamafactory-cli launch via torchrun;
# confirm against the LLaMA-Factory documentation.
train_config=examples/train_lora/qwen2.5VL_lora_sccm_ds.yaml
FORCE_TORCHRUN=1 llamafactory-cli train "$train_config"


# ===== Alternative (disabled): launch training directly with deepspeed instead of llamafactory-cli =====
# deepspeed  --num_gpus 4 src/train.py \
# --deepspeed examples/deepspeed/ds_z2_config.json \
# --stage sft \
# --model_name_or_path /dev/shm/llamafactory/model/Qwen2.5-VL-7B-Instruct  \
# --do_train \
# --dataset QA_from_CoVLA_zh \
# --template qwen2_vl \
# --finetuning_type lora \
# --lora_rank 8 \
# --lora_alpha 16 \
# --output_dir  saves/qwen2.5vl-7B/lora/sccm_ds/train_2025-07-16-19-38-34 \
# --overwrite_cache \
# --per_device_train_batch_size 2 \
# --gradient_accumulation_steps 8 \
# --lr_scheduler_type cosine \
# --logging_steps 5 \
# --save_steps 100 \
# --learning_rate 1e-4 \
# --num_train_epochs 10.0 \
# --plot_loss \
# --bf16 \
# --cutoff_len 2048 \
# --preprocessing_num_workers 16 \
# --dataloader_num_workers 12 \
# --report_to mlflow
