"""Quantize a LLaVA-style vision-language model to FP8 and save it.

Weights are quantized to fp8 per-channel via PTQ; activations to fp8 with
dynamic per-token scales. Output is written in compressed-tensors format.
"""
from transformers import AutoProcessor, LlavaForConditionalGeneration

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.utils import dispatch_for_generation  # NOTE(review): unused here — confirm and drop if not needed

MODEL_ID = "llama-joycaption-beta-one-hf-llava"

# Load model and processor.
model = LlavaForConditionalGeneration.from_pretrained(
    MODEL_ID, device_map="auto", torch_dtype="auto"
)
processor = AutoProcessor.from_pretrained(MODEL_ID)

# Configure the quantization algorithm and scheme.
# In this case, we:
# * quantize the weights to fp8 with per channel via ptq
# * quantize the activations to fp8 with dynamic per token
# lm_head and the vision components (projector + tower) are excluded from
# quantization via the ignore regexes.
recipe = QuantizationModifier(
    targets="Linear",
    scheme="FP8_DYNAMIC",
    ignore=["re:.*lm_head", "re:.*multi_modal_projector.*", "re:.*vision_tower.*"],
)

# Apply quantization and save to disk in compressed-tensors format.
SAVE_DIR = MODEL_ID + "-FP8-Dynamic"
oneshot(model=model, recipe=recipe, output_dir=SAVE_DIR, save_compressed=True)
processor.save_pretrained(SAVE_DIR)