Update README.md
README.md
CHANGED
@@ -47,7 +47,9 @@ Only the weights of the linear operators within transformers blocks are quantize

### Use with vLLM

This model can be deployed efficiently using the [vLLM](https://docs.vllm.ai/en/latest/) backend, as shown in the example below.

```python
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer
```

@@ -73,6 +75,7 @@ outputs = llm.generate(prompts, sampling_params)

```python
generated_text = outputs[0].outputs[0].text
print(generated_text)
```

vLLM also supports OpenAI-compatible serving. See the [documentation](https://docs.vllm.ai/en/latest/) for more details.

@@ -80,101 +83,182 @@ vLLM aslo supports OpenAI-compatible serving. See the [documentation](https://do

This model was created by applying [LLM Compressor](https://github.com/vllm-project/llm-compressor/blob/main/examples/multimodal_vision/llama4_example.py) with calibration samples from the neuralmagic/calibration dataset, as shown in the code snippet below.

```python
import torch
from datasets import load_dataset
from transformers import Llama4ForConditionalGeneration, Llama4Processor

from llmcompressor import oneshot
from llmcompressor.modeling import prepare_for_calibration
from llmcompressor.modifiers.quantization import GPTQModifier

model_id = "meta-llama/Llama-4-Scout-17B-16E"
model = Llama4ForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto")
processor = Llama4Processor.from_pretrained(model_id)

# We update `Llama4TextMoe` modules with custom `SequentialLlama4TextMoe`.
# This change allows compatibility with vLLM.
# To apply your own custom module for experimentation, consider updating
# `SequentialLlama4TextMoe` under llmcompressor/modeling/llama4.py
model = prepare_for_calibration(model)

DATASET_ID = "neuralmagic/calibration"
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 8192

ds = load_dataset(DATASET_ID, name="LLM", split=f"train[:{NUM_CALIBRATION_SAMPLES}]")


def preprocess_function(example):
    messages = []
    for message in example["messages"]:
        messages.append(
            {
                "role": message["role"],
                "content": [{"type": "text", "text": message["content"]}],
            }
        )

    return processor.apply_chat_template(
        messages,
        return_tensors="pt",
        padding=False,
        truncation=True,
        max_length=MAX_SEQUENCE_LENGTH,
        tokenize=True,
        add_special_tokens=False,
        return_dict=True,
        add_generation_prompt=False,
    )


ds = ds.map(preprocess_function, batched=False, remove_columns=ds.column_names)


def data_collator(batch):
    assert len(batch) == 1
    return {
        key: torch.tensor(value)
        if key != "pixel_values"
        else torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
        for key, value in batch[0].items()
    }


recipe = GPTQModifier(
    targets="Linear",
    scheme="W4A16",
    ignore=[
        "re:.*lm_head",
        "re:.*self_attn",
        "re:.*router",
        "re:vision_model.*",
        "re:multi_modal_projector.*",
        "Llama4TextAttention",
    ],
)

# Due to the large size of Llama4, we specify sequential targets such that
# only one MLP is loaded into GPU memory at a time.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    sequential_targets=["Llama4TextMLP"],
    data_collator=data_collator,
)

# Save to disk compressed.
SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128"
model.save_pretrained(SAVE_DIR, save_compressed=True)
processor.save_pretrained(SAVE_DIR)
```

## Evaluation

@@ -328,6 +412,9 @@ This model was evaluated on the well-known OpenLLM v1, OpenLLM v2, HumanEval, an

The results were obtained using the following commands:


#### MMLU_LLAMA
```
lm_eval \
```

@@ -434,3 +521,4 @@ lm_eval \

```
--tasks humaneval_64_instruct \
--batch_size auto
```

The updated sections read as follows:

### Use with vLLM

This model can be deployed efficiently using the [vLLM](https://docs.vllm.ai/en/latest/) backend, as shown in the example below.

<details>
<summary>Model Usage Code</summary>

```python
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

# ... (unchanged middle of the example not shown in this diff) ...

generated_text = outputs[0].outputs[0].text
print(generated_text)
```

</details>
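
The collapsed block above shows only the first and last lines of the bundled example, since its middle was unchanged in this edit. For reference, a minimal end-to-end offline-inference sketch follows the same pattern; the checkpoint id, parallelism, and sampling settings below are illustrative placeholders rather than values taken from this repository.

```python
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

# Placeholder id: substitute the actual quantized checkpoint for this repository.
MODEL_ID = "<quantized-model-id>"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Build a chat-formatted prompt with the model's chat template.
messages = [{"role": "user", "content": "Give me a short introduction to large language models."}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Illustrative sampling settings.
sampling_params = SamplingParams(temperature=0.6, top_p=0.9, max_tokens=256)

# tensor_parallel_size is illustrative; set it to the number of available GPUs.
llm = LLM(model=MODEL_ID, tensor_parallel_size=4, max_model_len=8192)

outputs = llm.generate([prompt], sampling_params)
print(outputs[0].outputs[0].text)
```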

vLLM also supports OpenAI-compatible serving. See the [documentation](https://docs.vllm.ai/en/latest/) for more details.
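
For example, once a server has been launched with `vllm serve <quantized-model-id>`, the endpoint can be queried with the standard OpenAI client; the model name and base URL below are placeholders for vLLM's defaults.

```python
# Query a vLLM OpenAI-compatible server started with:  vllm serve <quantized-model-id>
# The base_url assumes vLLM's default port; the model name must match the served id.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="<quantized-model-id>",
    messages=[{"role": "user", "content": "Give me a one-sentence summary of weight quantization."}],
    max_tokens=128,
)
print(response.choices[0].message.content)
```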

This model was created by applying [LLM Compressor](https://github.com/vllm-project/llm-compressor/blob/main/examples/multimodal_vision/llama4_example.py) with calibration samples from the neuralmagic/calibration dataset, as shown in the code snippet below.

<details>
<summary>Model Creation Code</summary>

```python
import gc

import torch
from datasets import load_dataset
from transformers import Llama4ForConditionalGeneration, Llama4Processor
from transformers.models.llama4.modeling_llama4 import Llama4TextMLP
from transformers.quantizers.quantizers_utils import get_module_from_name

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.utils.dev import skip_weights_initialize


def convert_model_for_quantization(model):
    # Replace every `Llama4TextMoe` module with a sequential implementation so that
    # the experts can be calibrated and quantized one linear layer at a time.
    to_delete = []
    for name, module in model.named_modules():
        module_class_name = module.__class__.__name__
        if module_class_name == "Llama4TextMoe":
            parent_module, module_name = get_module_from_name(model, name)
            parent_module._modules[module_name] = SequentialLlama4TextMoe(
                model.config.get_text_config(),
                module,
            )
            to_delete.append(module)
            print(f"Patched {name} with SequentialLlama4TextMoe", flush=True)

    for module in to_delete:
        del module
    gc.collect()
    torch.cuda.empty_cache()


class SequentialLlama4TextMoe(torch.nn.Module):
    def __init__(self, config, original_moe):
        super().__init__()
        self.top_k = config.num_experts_per_tok
        self.hidden_dim = config.hidden_size
        self.num_experts = config.num_local_experts
        self.experts = SequentialLlama4TextExperts(config, original_moe.experts)
        self.router = original_moe.router
        self.shared_expert = original_moe.shared_expert

    def forward(self, hidden_states):
        hidden_states = hidden_states.reshape(-1, self.hidden_dim)
        router_logits = self.router(hidden_states)

        router_top_value, router_indices = torch.topk(router_logits, self.top_k, dim=1)

        router_scores = (
            torch.full_like(router_logits, float("-inf")).scatter_(1, router_indices, router_top_value).transpose(0, 1)
        )
        router_scores = torch.sigmoid(router_scores.float()).to(hidden_states.dtype)

        out = self.shared_expert(hidden_states)
        for i in range(self.num_experts):
            out += self.experts[i](hidden_states) * router_scores[i].reshape(-1, 1)

        return out, router_scores


class SequentialLlama4TextExperts(torch.nn.ModuleList):
    def __init__(self, config, original_experts):
        self.num_experts = original_experts.gate_up_proj.shape[0]
        with skip_weights_initialize():
            super().__init__([Llama4TextMLP(config) for _ in range(self.num_experts)])

        intermediate_size = original_experts.down_proj.shape[1]

        for i in range(self.num_experts):
            # Split the fused gate/up projection and copy the weights into per-expert MLPs.
            gate_up = original_experts.gate_up_proj[i]
            down = original_experts.down_proj[i]

            gate_proj = gate_up[:, :intermediate_size]
            up_proj = gate_up[:, intermediate_size:]

            self[i].gate_proj.weight.data = gate_proj.t().clone().contiguous()
            self[i].up_proj.weight.data = up_proj.t().clone().contiguous()
            self[i].down_proj.weight.data = down.t().clone().contiguous()

        # Free the original fused expert weights.
        original_experts.gate_up_proj = None
        original_experts.down_proj = None
        gc.collect()
        torch.cuda.empty_cache()


model_id = "meta-llama/Llama-4-Scout-17B-16E"

model = Llama4ForConditionalGeneration.from_pretrained(
    model_id, torch_dtype=torch.bfloat16  # load on CPU
)
processor = Llama4Processor.from_pretrained(model_id)

convert_model_for_quantization(model)

# Oneshot arguments
DATASET_ID = "neuralmagic/calibration"
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 8192

ds = load_dataset(DATASET_ID, name="LLM", split=f"train[:{NUM_CALIBRATION_SAMPLES}]")


def preprocess_function(example):
    messages = []
    for message in example["messages"]:
        messages.append(
            {
                "role": message["role"],
                "content": [{"type": "text", "text": message["content"]}],
            }
        )

    return processor.apply_chat_template(
        messages,
        return_tensors="pt",
        padding=False,
        truncation=True,
        max_length=MAX_SEQUENCE_LENGTH,
        tokenize=True,
        add_special_tokens=False,
        return_dict=True,
        add_generation_prompt=False,
    ).to("cuda:0")


ds = ds.map(
    preprocess_function,
    batched=False,
    remove_columns=ds.column_names,
)

# Define a oneshot data collator for multimodal inputs.
def data_collator(batch):
    assert len(batch) == 1
    return {
        key: torch.tensor(value)
        if key != "pixel_values"
        else torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
        for key, value in batch[0].items()
    }

# Recipe
recipe = QuantizationModifier(
    targets="Linear",
    scheme="NVFP4",
    ignore=[
        "re:.*lm_head",
        "re:.*self_attn",
        "re:.*router",
        "re:.*vision_model",
        "re:.*multi_modal_projector",
        "Llama4TextAttention",
    ],
    sequential_targets=["Llama4TextMLP"],
)

SAVE_DIR = f"{model_id.split('/')[1]}-{recipe.scheme}"

# Perform oneshot
oneshot(
    model=model,
    tokenizer=model_id,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    trust_remote_code_model=True,
    data_collator=data_collator,
    output_dir=SAVE_DIR,
)

# Save to disk compressed.
model.save_pretrained(SAVE_DIR, save_compressed=True)
processor.save_pretrained(SAVE_DIR)
```

</details>
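
As a quick sanity check after saving, the checkpoint's configuration can be inspected for the quantization metadata written by `save_compressed=True`. This is a minimal sketch assuming the `SAVE_DIR` produced by the snippet above.

```python
# Sanity check: the compressed checkpoint's config should carry quantization metadata.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("Llama-4-Scout-17B-16E-NVFP4")  # i.e. SAVE_DIR from above
print(getattr(config, "quantization_config", None))  # expected to be non-None if saving succeeded
```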

## Evaluation

The results were obtained using the following commands:

<details>
<summary>Model Evaluation Commands</summary>

#### MMLU_LLAMA
```
lm_eval \
```

...

```
--tasks humaneval_64_instruct \
--batch_size auto
```

</details>
|