Update README.md
README.md CHANGED

@@ -5,9 +5,90 @@ license_link: >-
https://developer.download.nvidia.com/licenses/nvidia-open-model-license-agreement-june-2024.pdf
---

-**This has no support for inference, yet.** All I've done is move the weights out of NVIDIA's NeMo architecture so people smarter than me can get a head start on making it work with other backends.
+FP8 weight-only quantized checkpoint of https://huggingface.co/mgoin/Nemotron-4-340B-Instruct-vllm, for use with https://github.com/vllm-project/vllm/pull/6611.
+
+The following script was used to create this model, along with adding the quantization config to config.json:
+
+```python
+import argparse
+import os
+import json
+
+import torch
+import safetensors.torch
+
+def per_tensor_quantize(tensor):
+    """Quantize a tensor to FP8 using a per-tensor static scaling factor."""
+    finfo = torch.finfo(torch.float8_e4m3fn)
+    if tensor.numel() == 0:
+        # Empty tensors have no min/max; fall back to a fixed range.
+        min_val, max_val = torch.tensor(-16.0, dtype=tensor.dtype), torch.tensor(16.0, dtype=tensor.dtype)
+    else:
+        min_val, max_val = tensor.aminmax()
+    amax = torch.maximum(min_val.abs(), max_val.abs())
+    # Scale so that the largest-magnitude value maps to the FP8 max.
+    scale = finfo.max / amax.clamp(min=1e-12)
+    qweight = (tensor * scale).clamp(min=finfo.min, max=finfo.max).to(torch.float8_e4m3fn)
+    # Store the reciprocal so that dequantization is simply qweight * scale.
+    scale = scale.float().reciprocal()
+    return qweight, scale
+
+def process_safetensors_file(file_path):
+    """Process a single safetensors file in-place, quantizing weights to FP8."""
+    print(f"Processing {file_path}")
+    tensors = safetensors.torch.load_file(file_path)
+
+    modified_tensors = {}
+    for name, tensor in tensors.items():
+        if name.endswith('_proj.weight'):
+            print("Quantizing", name)
+            qweight, scale = per_tensor_quantize(tensor)
+            modified_tensors[name] = qweight
+            modified_tensors[f"{name}_scale"] = scale
+        else:
+            modified_tensors[name] = tensor
+
+    safetensors.torch.save_file(modified_tensors, file_path)
+    print(f"Updated {file_path} with quantized tensors")
+
+def update_index_file(index_file_path):
+    """Update the index file for the quantized model."""
+    print(f"Updating index file: {index_file_path}")
+    with open(index_file_path, 'r') as f:
+        index = json.load(f)
+
+    # Add a "<name>_scale" entry next to every quantized projection weight.
+    new_weight_map = {}
+    for tensor_name, file_name in index['weight_map'].items():
+        new_weight_map[tensor_name] = file_name
+        if tensor_name.endswith('_proj.weight'):
+            new_weight_map[f"{tensor_name}_scale"] = file_name
+
+    index['weight_map'] = new_weight_map
+
+    # Recalculate total_size from the (now smaller) shard files on disk.
+    total_size = sum(os.path.getsize(os.path.join(os.path.dirname(index_file_path), file))
+                     for file in set(index['weight_map'].values()))
+    index['metadata']['total_size'] = total_size
+
+    with open(index_file_path, 'w') as f:
+        json.dump(index, f, indent=2)
+    print(f"Updated index file {index_file_path}")
+
+def process_directory(directory):
+    """Process all safetensors files in the given directory."""
+    index_file_path = None
+    for filename in os.listdir(directory):
+        file_path = os.path.join(directory, filename)
+        if filename.endswith('.safetensors'):
+            process_safetensors_file(file_path)
+        elif filename == 'model.safetensors.index.json':
+            index_file_path = file_path
+
+    # Only rewrite the index after all shards have been quantized.
+    if index_file_path is not None:
+        update_index_file(index_file_path)
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Convert safetensors model to FP8 in-place.')
+    parser.add_argument('directory', type=str, help='The directory containing the safetensors files and index file.')
+
+    args = parser.parse_args()
+    process_directory(args.directory)
+```

## Nemotron-4-340B-Instruct
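
The new README mentions adding the quantization config to config.json, but the commit above only shows the README change. As a rough sketch: FP8 checkpoints aimed at vLLM conventionally declare a `quantization_config` with `quant_method: "fp8"`, and a weight-only checkpoint (only `<name>_scale` weight scales stored) would use dynamic activation scaling. The exact entry used for this repo is an assumption, not taken from the commit:

```python
import json

# Sketch only: write an FP8 quantization_config entry into config.json.
# The key names and values below follow the convention used by FP8
# checkpoints for vLLM; the actual entry in this repo is assumed, not shown.
CONFIG_PATH = "config.json"

with open(CONFIG_PATH) as f:
    config = json.load(f)

config["quantization_config"] = {
    "quant_method": "fp8",
    # Weight-only: only per-tensor weight scales are stored in the checkpoint,
    # so activation scales would be computed dynamically at runtime.
    "activation_scheme": "dynamic",
}

with open(CONFIG_PATH, "w") as f:
    json.dump(config, f, indent=2)
```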
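
The README points to https://github.com/vllm-project/vllm/pull/6611 for inference support. A minimal loading sketch, assuming a vLLM build that contains that PR; the model path and `tensor_parallel_size` below are placeholders rather than values from this repo:

```python
from vllm import LLM, SamplingParams

# Minimal sketch, assuming a vLLM build with the changes from
# vllm-project/vllm#6611. The model path and tensor_parallel_size are
# placeholders; a 340B FP8 checkpoint still needs multiple large GPUs.
llm = LLM(
    model="path/to/Nemotron-4-340B-Instruct-FP8",  # local path or HF repo id
    tensor_parallel_size=8,
)

sampling = SamplingParams(temperature=0.7, max_tokens=128)
outputs = llm.generate(["Write a haiku about FP8 quantization."], sampling)
print(outputs[0].outputs[0].text)
```

No explicit quantization argument is passed here: vLLM should pick up the FP8 settings from the `quantization_config` in config.json.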