-
Notifications
You must be signed in to change notification settings - Fork 276
Expand file tree
/
Copy pathsft_config.yaml
More file actions
70 lines (60 loc) · 1.47 KB
/
sft_config.yaml
File metadata and controls
70 lines (60 loc) · 1.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# Supervised fine-tuning (SFT) configuration for Qwen/Qwen2.5-7B
# (Hydra-managed; trained with the megatron_train strategy on a single 8-GPU node).

hydra:
  run:
    dir: .            # run from the current working directory
  output_subdir: null # do not create Hydra's .hydra/ output subdirectory

exp_name: "qwen2.5-7B-sft-config"
seed: 42
logging_dir: ./output/logs
output_dir: ./output_sft

system_envs:
  USE_MODELSCOPE: '1' # kept as a string on purpose — env var values must be strings

# Alternative tracker: uncomment to log to Weights & Biases instead of TensorBoard.
#track_with: wandb
#tracker_kwargs:
#  api_key:
#  project: roll_examples
#  notes: roll_examples
#  tags:
#    - sft
#    - baseline
track_with: tensorboard
tracker_kwargs:
  # NOTE(review): this path says "rlvr" but the config is SFT — looks copied from an
  # RLVR example; confirm the intended TensorBoard directory.
  log_dir: ./rl_examples/llm/tensorboard/roll_exp/rlvr

num_gpus_per_node: 8

# Checkpoint / logging / evaluation cadence (in training steps).
save_steps: 100
logging_steps: 1
eval_steps: 10
resume_from_checkpoint: false

sequence_length: 2048
pretrain: Qwen/Qwen2.5-7B

# sft related — dataset column mapping.
# system_key: system_prompt # use the default system prompt in the tokenizer tmplate if not provided
prompt_key: instruction
query_key: input
response_key: output

validation:
  data_args:
    file_name: data/code_alpaca_20k.json # same dataset as training — presumably split internally; verify
    template: qwen2_5

sft_train:
  model_args:
    dtype: bf16
  training_args:
    num_train_epochs: 1
    per_device_train_batch_size: 2
    gradient_accumulation_steps: 16 # effective batch = 2 * 16 * data-parallel ranks
    learning_rate: 5.0e-6
  data_args:
    file_name: data/code_alpaca_20k.json # https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k
    template: qwen2_5
    preprocessing_num_workers: 4
  strategy_args:
    strategy_name: megatron_train
    strategy_config:
      # TP=2 x PP=2 x CP=2 = 8 ranks, matching num_gpus_per_node.
      tensor_model_parallel_size: 2
      sequence_parallel: true
      pipeline_model_parallel_size: 2
      use_distributed_optimizer: true
      context_parallel_size: 2
      use_sequence_packing: true # was `True`; canonical lowercase for consistency with the other booleans
  device_mapping: list(range(0,8)) # string evaluated by the framework, not YAML
  infer_batch_size: 2