Skip to content
GitHub
View on GitHub

Qwen3_6_35b_Recipe

Qwen3.6-35B-A3B (MoE) on 1×8×H100 with TP2/PP2/CP2/EP4.

from modal_training_gym.train_recipes.slime_recipe.qwen3_6_35b import Qwen3_6_35b_Recipe

Qwen3.6-35B-A3B (MoE) on 1×8×H100 with TP2/PP2/CP2/EP4.

Inherits from: SlimeRecipe, BaseTrainRecipe

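| Field | Type | Default | Description |
| --- | --- | --- | --- |
| `gpu_type` | `str` | `"H100"` | |
| `colocate` | `bool` | `True` | |
| `tensor_model_parallel_size` | `int` | `2` | |
| `sequence_parallel` | `bool` | `True` | |
| `rollout_num_gpus_per_engine` | `int` | `4` | |
| `num_rollout` | `int` | `3000` | |
| `rollout_batch_size` | `int` | `16` | |
| `rollout_max_response_len` | `int` | `16384` | |
| `rollout_temperature` | `float` | `1.0` | |
| `save_interval` | `int` | `20` | |
| `recipe_type` | `RecipeType` | `slime` | |
| `name` | `str` | `""` | |
| `app_tags` | `dict` | `{}` | |
| `environment` | `dict` | `{'PYTHONPATH': '/root/Megatron-LM/', 'CUDA_DEVICE_MAX_CONNECTIONS': '1', 'NCCL_NVLS_ENABLE': '1'}` | |
| `async_mode` | `bool` | `False` | |
| `wandb` | `WandbConfig \| None` | `None` | |
| `image_overlay` | `collections.abc.Callable[[modal.image.Image], modal.image.Image] \| None` | `None` | |
| `local_slime` | `str \| None` | `None` | |
| `memory` | `int \| tuple[int, int] \| None` | `None` | |
| `cloud` | `str \| None` | `None` | |
| `region` | `str \| None` | `None` | |
| `slime_model_script` | `str` | `"scripts/models/qwen3.5-35B-A3B.sh"` | |
| `source_hf_checkpoint` | `str \| None` | `None` | |
| `megatron_conversion_hf_checkpoint` | `str \| None` | `None` | |
| `patch_files` | `list[str]` | `[]` | |
| `image_run_commands` | `list[str]` | `[]` | |
| `image_env` | `dict[str, str]` | `{}` | |
| `train_function_kwargs` | `dict[str, int]` | `{'ephemeral_disk': 1048576}` | |
| `actor_num_nodes` | `int` | `1` | |
| `actor_num_gpus_per_node` | `int` | `8` | |
| `rollout_num_gpus` | `int \| None` | `None` | |
| `use_critic` | `bool` | `False` | |
| `critic_num_nodes` | `int \| None` | `None` | |
| `critic_num_gpus_per_node` | `int \| None` | `None` | |
| `advantage_estimator` | `str` | `"grpo"` | |
| `n_samples_per_prompt` | `int` | `8` | |
| `eps_clip` | `float` | `0.2` | |
| `eps_clip_high` | `float` | `0.28` | |
| `use_kl_loss` | `bool` | `True` | |
| `kl_loss_type` | `str` | `"low_var_kl"` | |
| `kl_loss_coef` | `float` | `0.0` | |
| `kl_coef` | `float` | `0.0` | |
| `entropy_coef` | `float` | `0.0` | |
| `calculate_per_token_loss` | `bool` | `False` | |
| `ref_load` | `str` | `"/checkpoints/Qwen3.6-35B-A3B_torch_dist_tp2pp2"` | |
| `over_sampling_batch_size` | `int \| None` | `None` | |
| `dynamic_sampling_filter_path` | `str \| None` | `None` | |
| `balance_data` | `bool` | `True` | |
| `rollout_shuffle` | `bool` | `True` | |
| `rollout_top_p` | `float` | `1.0` | |
| `rollout_stop_token_ids` | `list[int] \| None` | `None` | |
| `sglang_mem_fraction_static` | `float` | `0.75` | |
| `global_batch_size` | `int` | `128` | |
| `lr` | `float` | `1e-06` | |
| `lr_decay_style` | `str` | `"constant"` | |
| `weight_decay` | `float` | `0.1` | |
| `adam_beta1` | `float` | `0.9` | |
| `adam_beta2` | `float` | `0.98` | |
| `optimizer` | `str` | `"adam"` | |
| `attention_dropout` | `float` | `0.0` | |
| `hidden_dropout` | `float` | `0.0` | |
| `attention_softmax_in_fp32` | `bool` | `True` | |
| `accumulate_allreduce_grads_in_fp32` | `bool` | `True` | |
| `use_distributed_optimizer` | `bool` | `False` | |
| `recompute_granularity` | `str` | `"full"` | |
| `recompute_method` | `str` | `"uniform"` | |
| `recompute_num_layers` | `int` | `1` | |
| `use_dynamic_batch_size` | `bool` | `True` | |
| `max_tokens_per_gpu` | `int` | `8192` | |
| `eval_interval` | `int \| None` | `None` | |
| `n_samples_per_eval_prompt` | `int` | `4` | |
| `eval_max_response_len` | `int` | `16384` | |
| `eval_top_p` | `float` | `1.0` | |
| `eval_config` | `dict \| None` | `None` | |
| `save` | `str` | `"/checkpoints"` | |
| `load` | `str` | `""` | |
| `no_save_optim` | `bool` | `True` | |
| `megatron_to_hf_mode` | `str` | `""` | |
| `use_fault_tolerance` | `bool` | `True` | |
| `update_weight_mode` | `str` | `"full"` | |
| `update_weight_transport` | `str` | `"nccl"` | |
| `update_weight_encoding` | `str` | `"indices"` | |
| `update_weight_disk_dir` | `str` | `""` | |
| `rm_type` | `str \| None` | `None` | |
| `custom_rm_function` | `collections.abc.Callable \| None` | `None` | |
| `custom_generate_function` | `collections.abc.Callable \| None` | `None` | |
| `custom_rollout_log_function` | `collections.abc.Callable \| str \| None` | `None` | |
| `custom_eval_rollout_log_function` | `collections.abc.Callable \| str \| None` | `None` | |
| `rollout_function` | `collections.abc.Callable \| str \| None` | `None` | |
| `custom_megatron_before_log_prob_hook` | `collections.abc.Callable \| str \| None` | `None` | |
| `custom_megatron_before_train_step_hook` | `collections.abc.Callable \| str \| None` | `None` | |
| `sglang_enable_dp_attention` | `bool` | `True` | |
| `sglang_dp_size` | `int \| None` | `4` | |
| `sglang_ep_size` | `int \| None` | `4` | |
| `sglang_enable_dp_lm_head` | `bool` | `True` | |
| `sglang_disable_custom_all_reduce` | `bool` | `False` | |
| `sglang_cuda_graph_bs` | `list[int] \| None` | `[1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256]` | |
| `sglang_max_running_requests` | `int \| None` | `512` | |
| `extra_config` | `dict \| None` | `None` | |
| `sglang_config` | `dict \| None` | `None` | |
| `sglang_request_params` | `dict \| None` | `None` | |
| `apply_chat_template_kwargs` | `dict \| str` | `""` | |
| `train_env_vars` | `dict \| str \| None` | `None` | |
| `multimodal_keys` | `dict \| str \| None` | `None` | |
| `hf_checkpoint` | `str` | `"Qwen/Qwen3.6-35B-A3B"` | |
| `pipeline_model_parallel_size` | `int` | `2` | |
| `context_parallel_size` | `int` | `2` | |
| `expert_model_parallel_size` | `int` | `4` | |
| `expert_tensor_parallel_size` | `int` | `1` | |
| `sglang_speculative_algorithm` | `str` | `"EAGLE"` | |
| `sglang_speculative_num_steps` | `int` | `3` | |
| `sglang_speculative_eagle_topk` | `int` | `1` | |
| `sglang_speculative_num_draft_tokens` | `int` | `4` | |
| `sglang_mamba_scheduler_strategy` | `str` | `"extra_buffer"` | |
| `moe_token_dispatcher_type` | `str` | `"flex"` | |
| `moe_enable_deepep` | `bool` | `True` | |
| `optimizer_cpu_offload` | `bool` | `True` | |
| `overlap_cpu_optimizer_d2h_h2d` | `bool` | `True` | |
| `use_precision_aware_optimizer` | `bool` | `True` | |
| `attention_backend` | `str` | `"flash"` | |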

cli_args(self, dataset: 'DatasetConfig | None' = None, model: 'ModelConfig | None' = None) -> list[str]

Section titled “cli_args(self, dataset: 'DatasetConfig | None' = None, model: 'ModelConfig | None' = None) -> list[str]”

get_base_recipe(model_config: modal_training_gym.common.models.base.ModelConfig) -> 'SlimeRecipe | None'

Section titled “get_base_recipe(model_config: modal_training_gym.common.models.base.ModelConfig) -> 'SlimeRecipe | None'”

Source: modal_training_gym/train_recipes/slime_recipe/qwen3_6_35b.py