HKUSTAudio
/

AudioX

Stable Audio Tools

Model card · Files and versions

AudioX / config.json

Zeyue7's picture

AudioX

80fc359 about 1 year ago

history blame contribute delete

5.39 kB

	{
		"model_type": "diffusion_cond",
		"sample_size": 485100,
		"sample_rate": 44100,
		"video_fps": 5,
		"audio_channels": 2,
		"model": {
			"pretransform": {
				"type": "autoencoder",
				"iterate_batch": true,
				"config": {
					"encoder": {
						"type": "oobleck",
						"requires_grad": false,
						"config": {
							"in_channels": 2,
							"channels": 128,
							"c_mults": [1, 2, 4, 8, 16],
							"strides": [2, 4, 4, 8, 8],
							"latent_dim": 128,
							"use_snake": true
						}
					},
					"decoder": {
						"type": "oobleck",
						"config": {
							"out_channels": 2,
							"channels": 128,
							"c_mults": [1, 2, 4, 8, 16],
							"strides": [2, 4, 4, 8, 8],
							"latent_dim": 64,
							"use_snake": true,
							"final_tanh": false
						}
					},
					"bottleneck": {
						"type": "vae"
					},
					"latent_dim": 64,
					"downsampling_ratio": 2048,
					"io_channels": 2
				}
			},
			"conditioning": {
				"configs": [
					{
						"id": "video_prompt",
						"type": "clip",
						"config": {
							"clip_model_name": "clip-vit-base-patch32"
						}
					},
					{
						"id": "text_prompt",
						"type": "t5",
						"config": {
							"t5_model_name": "t5-base",
							"max_length": 128
						}
					},
					{
						"id": "audio_prompt",
						"type": "audio_autoencoder",
						"config": {
							"sample_rate": 44100,
							"pretransform_config": {
								"type": "autoencoder",
								"iterate_batch": true,
								"config": {
									"encoder": {
										"type": "oobleck",
										"requires_grad": false,
										"config": {
											"in_channels": 2,
											"channels": 128,
											"c_mults": [1, 2, 4, 8, 16],
											"strides": [2, 4, 4, 8, 8],
											"latent_dim": 128,
											"use_snake": true
										}
									},
									"decoder": {
										"type": "oobleck",
										"config": {
											"out_channels": 2,
											"channels": 128,
											"c_mults": [1, 2, 4, 8, 16],
											"strides": [2, 4, 4, 8, 8],
											"latent_dim": 64,
											"use_snake": true,
											"final_tanh": false
										}
									},
									"bottleneck": {
										"type": "vae"
									},
									"latent_dim": 64,
									"downsampling_ratio": 2048,
									"io_channels": 2
								}
							}
						}
					}
				],
				"cond_dim": 768
			},
			"diffusion": {
				"cross_attention_cond_ids": ["video_prompt", "text_prompt", "audio_prompt"],
				"global_cond_ids": [],
				"type": "dit",
				"config": {
					"io_channels": 64,
					"embed_dim": 1536,
					"depth": 24,
					"num_heads": 24,
					"cond_token_dim": 768,
					"global_cond_dim": 1536,
					"project_cond_tokens": false,
					"transformer_type": "continuous_transformer",
					"video_fps": 5
				}
			},
			"io_channels": 64
		},
		"training": {
			"use_ema": true,
			"log_loss_info": false,
			"optimizer_configs": {
				"diffusion": {
					"optimizer": {
						"type": "AdamW",
						"config": {
							"lr": 5e-5,
							"betas": [0.9, 0.999],
							"weight_decay": 1e-3
						}
					},
					"scheduler": {
						"type": "InverseLR",
						"config": {
							"inv_gamma": 1000000,
							"power": 0.5,
							"warmup": 0.99
						}
					}
				}
			}
		}
	}