在 NVIDIA Tesla P40 上安装 Kohya 进行 Stable Diffusion 模型微调

+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  Tesla P40                      Off |   00000000:00:0D.0 Off |                    0 |
| N/A   30C    P0             48W /  250W |     362MiB /  23040MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla P40                      Off |   00000000:00:0E.0 Off |                    0 |
| N/A   23C    P8              8W /  250W |       2MiB /  23040MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   2  Tesla P40                      Off |   00000000:00:0F.0 Off |                    0 |
| N/A   22C    P8              9W /  250W |       2MiB /  23040MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   3  Tesla P40                      Off |   00000000:00:10.0 Off |                    0 |
| N/A   19C    P8              9W /  250W |       2MiB /  23040MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

由于众所周知的原因，我只有一台 4 卡 Tesla P40 的老旧机器。P40 不支持 `bf16`，因此在用 SD 训练 LoRA 时无法直接使用系统预设的训练参数。

经过整理，以下是基于预设 `sd15 - LoKr v2.0.json` 调整后的训练参数，在 P40 上可以正常训练。其中 `mixed_precision` 与 `save_precision` 均设为 `fp16`，`full_bf16` 为 `false`。

{
    "LoRA_type": "LyCORIS/LoKr",
    "LyCORIS_preset": "full",
    "adaptive_noise_scale": 0,
    "additional_parameters": "--lr_scheduler_type \"CosineAnnealingLR\" --lr_scheduler_args \"T_max=1000\" \"eta_min=0e-0\"",
    "block_alphas": "",
    "block_dims": "",
    "block_lr_zero_threshold": "",
    "bucket_no_upscale": true,
    "bucket_reso_steps": 1,
    "cache_latents": true,
    "cache_latents_to_disk": true,
    "caption_dropout_every_n_epochs": 0.0,
    "caption_dropout_rate": 0.1,
    "caption_extension": ".txt",
    "clip_skip": "1",
    "color_aug": false,
    "constrain": 0.0,
    "conv_alpha": 1,
    "conv_block_alphas": "",
    "conv_block_dims": "",
    "conv_dim": 100000,
    "debiased_estimation_loss": false,
    "decompose_both": false,
    "dim_from_weights": false,
    "down_lr_weight": "",
    "enable_bucket": true,
    "epoch": 50,
    "factor": 6,
    "flip_aug": false,
    "full_bf16": false,
    "full_fp16": false,
    "gradient_accumulation_steps": 1,
    "gradient_checkpointing": false,
    "keep_tokens": 1,
    "learning_rate": 1.0,
    "lora_network_weights": "",
    "lr_scheduler": "cosine",
    "lr_scheduler_args": "",
    "lr_scheduler_num_cycles": "",
    "lr_scheduler_power": "",
    "lr_warmup": 0,
    "max_bucket_reso": 2048,
    "max_data_loader_n_workers": "0",
    "max_grad_norm": 1,
    "max_resolution": "512,512",
    "max_timestep": 1000,
    "max_token_length": "75",
    "max_train_epochs": "",
    "max_train_steps": "",
    "mem_eff_attn": false,
    "mid_lr_weight": "",
    "min_bucket_reso": 256,
    "min_snr_gamma": 5,
    "min_timestep": 0,
    "mixed_precision": "fp16",
    "module_dropout": 0,
    "multires_noise_discount": 0.1,
    "multires_noise_iterations": 6,
    "network_alpha": 1,
    "network_dim": 100000,
    "network_dropout": 0,
    "no_token_padding": false,
    "noise_offset": 0,
    "noise_offset_type": "Multires",
    "num_cpu_threads_per_process": 2,
    "optimizer": "Prodigy",
    "optimizer_args": "\"d0=1e-5\" \"d_coef=1.0\" \"weight_decay=0.4\" \"decouple=True\" \"safeguard_warmup=True\" \"use_bias_correction=True\"",
    "persistent_data_loader_workers": false,
    "prior_loss_weight": 1.0,
    "random_crop": false,
    "rank_dropout": 0,
    "rank_dropout_scale": false,
    "rescaled": false,
    "save_every_n_epochs": 15,
    "save_every_n_steps": 0,
    "save_last_n_steps": 0,
    "save_last_n_steps_state": 0,
    "save_precision": "fp16",
    "scale_v_pred_loss_like_noise_pred": false,
    "scale_weight_norms": 0,
    "sdxl": false,
    "sdxl_cache_text_encoder_outputs": false,
    "sdxl_no_half_vae": true,
    "seed": "",
    "shuffle_caption": true,
    "stop_text_encoder_training": 0,
    "text_encoder_lr": 1.0,
    "train_batch_size": 2,
    "train_norm": false,
    "train_on_input": false,
    "training_comment": "KoopaTroopa",
    "unet_lr": 1.0,
    "unit": 1,
    "up_lr_weight": "",
    "use_cp": false,
    "use_scalar": false,
    "use_tucker": false,
    "log_with": "",
    "v2": false,
    "v_parameterization": false,
    "v_pred_like_loss": 0,
    "vae": "",
    "vae_batch_size": 0,
    "weighted_captions": false,
    "xformers": "xformers"
}

测试环境
SD 1.5
Kohya_ss GUI v24.1.3