Instructions to use wikeeyang/Hunyuan-Image-30-Qint4 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use wikeeyang/Hunyuan-Image-30-Qint4 with Transformers:
```python
# Load model directly
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("wikeeyang/Hunyuan-Image-30-Qint4", trust_remote_code=True, dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
| # load_quantized_model.py | |
| import json | |
| import torch | |
| from safetensors.torch import load_file | |
| from optimum.quanto import requantize, quantize, qint4 | |
| from hunyuan_image_3.hunyuan import HunyuanImage3ForCausalMM | |
| from transformers import AutoConfig, QuantoConfig | |
| from transformers.generation.utils import GenerationConfig | |
def load_quantized_hi3_m1(model_path, device="cuda"):
    """Load the int4 model by quantizing a freshly built model, then loading weights.

    Steps, in order:
      1. Build the model with ``HunyuanImage3ForCausalMM.from_pretrained`` in
         bfloat16 with ``device_map=None`` (no automatic device placement).
      2. Call ``quantize(..., weights=qint4)`` so the modules have the int4
         structure before the quantized checkpoint is loaded.
      3. Load ``model.safetensors`` from ``model_path`` with
         ``strict=False, assign=True``.
      4. Move the model to ``device``.

    Args:
        model_path: Local directory containing the model files, including
            ``model.safetensors``.
        device: Target device for the final ``.to(...)`` call. Defaults to
            ``"cuda"``, which matches the previous hard-coded behavior.

    Returns:
        The quantized model, moved to ``device``.
    """
    # Function-scope import keeps this block self-contained.
    import os

    print(f"Loading model architecture from {model_path} to CPU...")
    Qmodel = HunyuanImage3ForCausalMM.from_pretrained(
        model_path,
        dtype=torch.bfloat16,
        device_map=None,
        attn_implementation="sdpa",
        moe_impl="eager",
        moe_drop_tokens=True,
        trust_remote_code=True,
        low_cpu_mem_usage=False,
    )

    print("Applying int4 quantization structure...")
    quantize(Qmodel, weights=qint4)

    print("Loading quantized weights...")
    state_dict = load_file(os.path.join(model_path, "model.safetensors"))
    # strict=False never raises on key mismatches, so surface the report
    # instead of discarding it; otherwise a wrong checkpoint loads silently.
    load_result = Qmodel.load_state_dict(state_dict, strict=False, assign=True)
    if load_result.missing_keys or load_result.unexpected_keys:
        print(
            "Warning: state dict mismatch - "
            f"{len(load_result.missing_keys)} missing key(s), "
            f"{len(load_result.unexpected_keys)} unexpected key(s)."
        )

    print("Moving quantized model to GPU...")
    Qmodel = Qmodel.to(device)
    return Qmodel
def load_quantized_hi3_m2(model_path, device="cuda"):
    """Load the int4 model by requantizing a meta-device skeleton.

    Steps, in order:
      1. Read the config, ``model.safetensors`` and ``quantization_map.json``
         from ``model_path``.
      2. Instantiate ``HunyuanImage3ForCausalMM`` under ``torch.device('meta')``
         and cast it to bfloat16.
      3. Call ``requantize`` with the state dict and quantization map,
         targeting the CPU device.
      4. Attach the ``GenerationConfig`` loaded from ``model_path``.
      5. Move the model to ``device``.

    Args:
        model_path: Local directory containing the config,
            ``model.safetensors``, ``quantization_map.json`` and the
            generation config.
        device: Target device for the final ``.to(...)`` call. Defaults to
            ``"cuda"``, which matches the previous hard-coded behavior.

    Returns:
        The quantized model with ``generation_config`` set, moved to ``device``.
    """
    # Function-scope import keeps this block self-contained.
    import os

    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
    state_dict = load_file(os.path.join(model_path, "model.safetensors"))
    # JSON is UTF-8 by specification; do not depend on the platform default.
    map_path = os.path.join(model_path, "quantization_map.json")
    with open(map_path, "r", encoding="utf-8") as f:
        quantization_map = json.load(f)

    print("Create Meta model and Loading quantized weights to CPU...")
    # Building under the meta device skips allocating real weight storage
    # for the skeleton; requantize() below supplies the actual tensors.
    with torch.device('meta'):
        Qmodel = HunyuanImage3ForCausalMM(config)
    Qmodel = Qmodel.to(torch.bfloat16)
    requantize(Qmodel, state_dict, quantization_map, device=torch.device('cpu'))

    # The model was built directly from its config rather than through
    # from_pretrained, so the generation config is attached explicitly.
    generation_config = GenerationConfig.from_pretrained(model_path)
    Qmodel.generation_config = generation_config

    print("Moving quantized model to GPU...")
    Qmodel = Qmodel.to(torch.device(device))
    return Qmodel
| # modify your "app/pipeline.py" script as below: | |
| # from load_quantized_model import load_quantized_hi3_m1, load_quantized_hi3_m2 | |
| # replace: | |
| # self.model = HunyuanImage3ForCausalMM.from_pretrained(args.model_id, **kwargs) | |
| # with: | |
| # self.model = load_quantized_hi3_m1(args.model_id) | |
| # or with: | |
| # self.model = load_quantized_hi3_m2(args.model_id) | |