HaoranChu committed
Commit e9ad0c9 · verified · 1 Parent(s): f7e973d

Upload 47 files

Files changed (47)
  1. GPTQ-for-Qwen_hf/.gitignore +1 -0
  2. GPTQ-for-Qwen_hf/.style.yapf +3 -0
  3. GPTQ-for-Qwen_hf/LICENSE.txt +201 -0
  4. GPTQ-for-Qwen_hf/README.md +188 -0
  5. GPTQ-for-Qwen_hf/__pycache__/categories.cpython-310.pyc +0 -0
  6. GPTQ-for-Qwen_hf/__pycache__/datautils.cpython-310.pyc +0 -0
  7. GPTQ-for-Qwen_hf/__pycache__/evaluate_.cpython-310.pyc +0 -0
  8. GPTQ-for-Qwen_hf/__pycache__/gptq.cpython-310.pyc +0 -0
  9. GPTQ-for-Qwen_hf/__pycache__/gptq.cpython-38.pyc +0 -0
  10. GPTQ-for-Qwen_hf/pip.log +1 -0
  11. GPTQ-for-Qwen_hf/quant/__init__.py +5 -0
  12. GPTQ-for-Qwen_hf/quant/__pycache__/__init__.cpython-310.pyc +0 -0
  13. GPTQ-for-Qwen_hf/quant/__pycache__/__init__.cpython-38.pyc +0 -0
  14. GPTQ-for-Qwen_hf/quant/__pycache__/custom_autotune.cpython-310.pyc +0 -0
  15. GPTQ-for-Qwen_hf/quant/__pycache__/custom_autotune.cpython-38.pyc +0 -0
  16. GPTQ-for-Qwen_hf/quant/__pycache__/fused_attn.cpython-310.pyc +0 -0
  17. GPTQ-for-Qwen_hf/quant/__pycache__/fused_attn.cpython-38.pyc +0 -0
  18. GPTQ-for-Qwen_hf/quant/__pycache__/fused_mlp.cpython-310.pyc +0 -0
  19. GPTQ-for-Qwen_hf/quant/__pycache__/fused_mlp.cpython-38.pyc +0 -0
  20. GPTQ-for-Qwen_hf/quant/__pycache__/quant_linear.cpython-310.pyc +0 -0
  21. GPTQ-for-Qwen_hf/quant/__pycache__/quant_linear.cpython-38.pyc +0 -0
  22. GPTQ-for-Qwen_hf/quant/__pycache__/quantizer.cpython-310.pyc +0 -0
  23. GPTQ-for-Qwen_hf/quant/__pycache__/quantizer.cpython-38.pyc +0 -0
  24. GPTQ-for-Qwen_hf/quant/__pycache__/triton_norm.cpython-310.pyc +0 -0
  25. GPTQ-for-Qwen_hf/quant/__pycache__/triton_norm.cpython-38.pyc +0 -0
  26. GPTQ-for-Qwen_hf/quant/custom_autotune.py +194 -0
  27. GPTQ-for-Qwen_hf/quant/fused_attn.py +204 -0
  28. GPTQ-for-Qwen_hf/quant/fused_mlp.py +288 -0
  29. GPTQ-for-Qwen_hf/quant/quant_linear.py +423 -0
  30. GPTQ-for-Qwen_hf/quant/quantizer.py +127 -0
  31. GPTQ-for-Qwen_hf/quant/triton_norm.py +92 -0
  32. GPTQ-for-Qwen_hf/qwen.py +292 -0
  33. GPTQ-for-Qwen_hf/qwen_gptq_0.6B_loadtest.log +37 -0
  34. GPTQ-for-Qwen_hf/requirements.txt +11 -0
  35. GPTQ-for-Qwen_hf/test.sh +1 -0
  36. GPTQ-for-Qwen_hf/utils/__init__.py +3 -0
  37. GPTQ-for-Qwen_hf/utils/__pycache__/__init__.cpython-310.pyc +0 -0
  38. GPTQ-for-Qwen_hf/utils/__pycache__/__init__.cpython-38.pyc +0 -0
  39. GPTQ-for-Qwen_hf/utils/__pycache__/datautils.cpython-310.pyc +0 -0
  40. GPTQ-for-Qwen_hf/utils/__pycache__/datautils.cpython-38.pyc +0 -0
  41. GPTQ-for-Qwen_hf/utils/__pycache__/export.cpython-310.pyc +0 -0
  42. GPTQ-for-Qwen_hf/utils/__pycache__/export.cpython-38.pyc +0 -0
  43. GPTQ-for-Qwen_hf/utils/__pycache__/modelutils.cpython-310.pyc +0 -0
  44. GPTQ-for-Qwen_hf/utils/__pycache__/modelutils.cpython-38.pyc +0 -0
  45. GPTQ-for-Qwen_hf/utils/datautils.py +234 -0
  46. GPTQ-for-Qwen_hf/utils/export.py +37 -0
  47. GPTQ-for-Qwen_hf/utils/modelutils.py +83 -0
GPTQ-for-Qwen_hf/.gitignore ADDED
@@ -0,0 +1 @@
1
+ __pycache__/
GPTQ-for-Qwen_hf/.style.yapf ADDED
@@ -0,0 +1,3 @@
1
+ [style]
2
+ based_on_style = pep8
3
+ column_limit = 200
GPTQ-for-Qwen_hf/LICENSE.txt ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
GPTQ-for-Qwen_hf/README.md ADDED
@@ -0,0 +1,188 @@
1
+ # GPTQ-for-LLaMA
2
+
3
+ **I am currently focusing on [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) and recommend using [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) instead of GPTQ for Llama.**
4
+
5
+ <img src = https://user-images.githubusercontent.com/64115820/235287009-2d07bba8-9b85-4973-9e06-2a3c28777f06.png width="50%" height="50%">
6
+
7
+ 4-bit quantization of [LLaMA](https://arxiv.org/abs/2302.13971) using [GPTQ](https://arxiv.org/abs/2210.17323)
8
+
9
+ GPTQ is a SOTA one-shot weight quantization method.
10
+
11
+ **It can be used universally, but it is not the [fastest](https://github.com/qwopqwop200/GPTQ-for-LLaMa/tree/old-cuda) option, and it only supports Linux.**
12
+
13
+ **Triton only supports Linux, so if you are a Windows user, please use [WSL2](https://learn.microsoft.com/en-us/windows/wsl/install).**
14
+
15
+ ## News or Update
16
+ **AutoGPTQ-triton, a packaged version of GPTQ with triton, has been integrated into [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ).**
17
+ ## Result
18
+ <details>
19
+ <summary>LLaMA-7B(click me)</summary>
20
+
21
+ | [LLaMA-7B](https://arxiv.org/abs/2302.13971) | Bits | group-size | memory(MiB) | Wikitext2 | checkpoint size(GB) |
22
+ | -------------------------------------------------- | ---- | ---------- | ----------- | --------- | ------------------- |
23
+ | FP16 | 16 | - | 13940 | 5.68 | 12.5 |
24
+ | RTN | 4 | - | - | 6.29 | - |
25
+ | [GPTQ](https://arxiv.org/abs/2210.17323) | 4 | - | 4740 | 6.09 | 3.5 |
26
+ | [GPTQ](https://arxiv.org/abs/2210.17323) | 4 | 128 | 4891 | 5.85 | 3.6 |
27
+ | RTN | 3 | - | - | 25.54 | - |
28
+ | [GPTQ](https://arxiv.org/abs/2210.17323) | 3 | - | 3852 | 8.07 | 2.7 |
29
+ | [GPTQ](https://arxiv.org/abs/2210.17323) | 3 | 128 | 4116 | 6.61 | 3.0 |
30
+
31
+ </details>
32
+
33
+ <details>
34
+ <summary>LLaMA-13B</summary>
35
+
36
+ | [LLaMA-13B](https://arxiv.org/abs/2302.13971) | Bits | group-size | memory(MiB) | Wikitext2 | checkpoint size(GB) |
37
+ | -------------------------------------------------- | ---- | ---------- | ----------- | --------- | ------------------- |
38
+ | FP16 | 16 | - | OOM | 5.09 | 24.2 |
39
+ | RTN | 4 | - | - | 5.53 | - |
40
+ | [GPTQ](https://arxiv.org/abs/2210.17323) | 4 | - | 8410 | 5.36 | 6.5 |
41
+ | [GPTQ](https://arxiv.org/abs/2210.17323) | 4 | 128 | 8747 | 5.20 | 6.7 |
42
+ | RTN | 3 | - | - | 11.40 | - |
43
+ | [GPTQ](https://arxiv.org/abs/2210.17323) | 3 | - | 6870 | 6.63 | 5.1 |
44
+ | [GPTQ](https://arxiv.org/abs/2210.17323) | 3 | 128 | 7277 | 5.62 | 5.4 |
45
+
46
+ </details>
47
+
48
+ <details>
49
+ <summary>LLaMA-33B</summary>
50
+
51
+ | [LLaMA-33B](https://arxiv.org/abs/2302.13971) | Bits | group-size | memory(MiB) | Wikitext2 | checkpoint size(GB) |
52
+ | -------------------------------------------------- | ---- | ---------- | ----------- | --------- | ------------------- |
53
+ | FP16 | 16 | - | OOM | 4.10 | 60.5 |
54
+ | RTN | 4 | - | - | 4.54 | - |
55
+ | [GPTQ](https://arxiv.org/abs/2210.17323) | 4 | - | 19493 | 4.45 | 15.7 |
56
+ | [GPTQ](https://arxiv.org/abs/2210.17323) | 4 | 128 | 20570 | 4.23 | 16.3 |
57
+ | RTN | 3 | - | - | 14.89 | - |
58
+ | [GPTQ](https://arxiv.org/abs/2210.17323) | 3 | - | 15493 | 5.69 | 12.0 |
59
+ | [GPTQ](https://arxiv.org/abs/2210.17323) | 3 | 128 | 16566 | 4.80 | 13.0 |
60
+
61
+ </details>
62
+
63
+ <details>
64
+ <summary>LLaMA-65B</summary>
65
+
66
+ | [LLaMA-65B](https://arxiv.org/abs/2302.13971) | Bits | group-size | memory(MiB) | Wikitext2 | checkpoint size(GB) |
67
+ | -------------------------------------------------- | ---- | ---------- | ----------- | --------- | ------------------- |
68
+ | FP16 | 16 | - | OOM | 3.53 | 121.0 |
69
+ | RTN | 4 | - | - | 3.92 | - |
70
+ | [GPTQ](https://arxiv.org/abs/2210.17323) | 4 | - | OOM | 3.84 | 31.1 |
71
+ | [GPTQ](https://arxiv.org/abs/2210.17323) | 4 | 128 | OOM | 3.65 | 32.3 |
72
+ | RTN | 3 | - | - | 10.59 | - |
73
+ | [GPTQ](https://arxiv.org/abs/2210.17323) | 3 | - | OOM | 5.04 | 23.6 |
74
+ | [GPTQ](https://arxiv.org/abs/2210.17323) | 3 | 128 | OOM | 4.17 | 25.6 |
75
+ </details>
76
+
77
+ Quantization requires a large amount of CPU memory. However, the memory required can be reduced by using swap memory.
78
+
79
+ Depending on the GPUs/drivers, there may be a difference in performance, and the difference decreases as the model size increases (see https://github.com/IST-DASLab/gptq/issues/1).
80
+
81
+ According to the [GPTQ paper](https://arxiv.org/abs/2210.17323), the difference in performance between FP16 and GPTQ decreases as the model size increases.
82
+
83
+ ## GPTQ vs [bitsandbytes](https://github.com/TimDettmers/bitsandbytes)
84
+
85
+ <details>
86
+ <summary>LLaMA-7B(click me)</summary>
87
+
88
+ | [LLaMA-7B(seqlen=2048)](https://arxiv.org/abs/2302.13971) | Bits Per Weight(BPW)| memory(MiB) | c4(ppl) |
89
+ | --------------------------------------------------------------- | ------------------- | ----------- | --------- |
90
+ | FP16 | 16 | 13948 | 5.22 |
91
+ | [GPTQ-128g](https://arxiv.org/abs/2210.17323) | 4.15 | 4781 | 5.30 |
92
+ | [nf4-double_quant](https://arxiv.org/abs/2305.14314) | 4.127 | 4804 | 5.30 |
93
+ | [nf4](https://arxiv.org/abs/2305.14314) | 4.5 | 5102 | 5.30 |
94
+ | [fp4](https://arxiv.org/abs/2212.09720) | 4.5 | 5102 | 5.33 |
95
+
96
+ </details>
97
+
98
+ <details>
99
+ <summary>LLaMA-13B</summary>
100
+
101
+ | [LLaMA-13B(seqlen=2048)](https://arxiv.org/abs/2302.13971) | Bits Per Weight(BPW)| memory(MiB) | c4(ppl) |
102
+ | ---------------------------------------------------------------- | ------------------- | ----------- | --------- |
103
+ | FP16 | 16 | OOM | - |
104
+ | [GPTQ-128g](https://arxiv.org/abs/2210.17323) | 4.15 | 8589 | 5.02 |
105
+ | [nf4-double_quant](https://arxiv.org/abs/2305.14314) | 4.127 | 8581 | 5.04 |
106
+ | [nf4](https://arxiv.org/abs/2305.14314) | 4.5 | 9170 | 5.04 |
107
+ | [fp4](https://arxiv.org/abs/2212.09720) | 4.5 | 9170 | 5.11 |
108
+ </details>
109
+
110
+ <details>
111
+ <summary>LLaMA-33B</summary>
112
+
113
+ | [LLaMA-33B(seqlen=1024)](https://arxiv.org/abs/2302.13971) | Bits Per Weight(BPW)| memory(MiB) | c4(ppl) |
114
+ | ---------------------------------------------------------------- | ------------------- | ----------- | --------- |
115
+ | FP16 | 16 | OOM | - |
116
+ | [GPTQ-128g](https://arxiv.org/abs/2210.17323) | 4.15 | 18441 | 3.71 |
117
+ | [nf4-double_quant](https://arxiv.org/abs/2305.14314) | 4.127 | 18313 | 3.76 |
118
+ | [nf4](https://arxiv.org/abs/2305.14314) | 4.5 | 19729 | 3.75 |
119
+ | [fp4](https://arxiv.org/abs/2212.09720) | 4.5 | 19729 | 3.75 |
120
+
121
+ </details>
122
+
123
+ ## Installation
124
+ If you don't have [conda](https://docs.conda.io/en/latest/miniconda.html), install it first.
125
+ ```
126
+ conda create --name gptq python=3.9 -y
127
+ conda activate gptq
128
+ conda install pytorch torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvidia
129
+ # Or, if you're having trouble with conda, use pip with python3.9:
130
+ # pip3 install torch torchvision torchaudio
131
+
132
+ git clone https://github.com/qwopqwop200/GPTQ-for-LLaMa
133
+ cd GPTQ-for-LLaMa
134
+ pip install -r requirements.txt
135
+ ```
136
+ ## Dependencies
137
+
138
+ * `torch`: tested on v2.0.0+cu117
139
+ * `transformers`: tested on v4.28.0.dev0
140
+ * `datasets`: tested on v2.10.1
141
+ * `safetensors`: tested on v0.3.0
142
+
143
+ All experiments were run on a single NVIDIA RTX3090.
144
+
145
+ # Language Generation
146
+ ## LLaMA
147
+
148
+ ```
149
+ #convert LLaMA to hf
150
+ python convert_llama_weights_to_hf.py --input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir ./llama-hf
151
+
152
+ # Benchmark language generation with 4-bit LLaMA-7B:
153
+
154
+ # Save compressed model
155
+ CUDA_VISIBLE_DEVICES=0 python llama.py ${MODEL_DIR} c4 --wbits 4 --true-sequential --act-order --groupsize 128 --save llama7b-4bit-128g.pt
156
+
157
+ # Or save compressed `.safetensors` model
158
+ CUDA_VISIBLE_DEVICES=0 python llama.py ${MODEL_DIR} c4 --wbits 4 --true-sequential --act-order --groupsize 128 --save_safetensors llama7b-4bit-128g.safetensors
159
+
160
+ # Benchmark generating a 2048 token sequence with the saved model
161
+ CUDA_VISIBLE_DEVICES=0 python llama.py ${MODEL_DIR} c4 --wbits 4 --groupsize 128 --load llama7b-4bit-128g.pt --benchmark 2048 --check
162
+
163
+ # Benchmark FP16 baseline, note that the model will be split across all listed GPUs
164
+ CUDA_VISIBLE_DEVICES=0,1,2,3,4 python llama.py ${MODEL_DIR} c4 --benchmark 2048 --check
165
+
166
+ # model inference with the saved model
167
+ CUDA_VISIBLE_DEVICES=0 python llama_inference.py ${MODEL_DIR} --wbits 4 --groupsize 128 --load llama7b-4bit-128g.pt --text "this is llama"
168
+
169
+ # model inference with the saved model using safetensors loaded direct to gpu
170
+ CUDA_VISIBLE_DEVICES=0 python llama_inference.py ${MODEL_DIR} --wbits 4 --groupsize 128 --load llama7b-4bit-128g.safetensors --text "this is llama" --device=0
171
+
172
+ # model inference with the saved model with offload(This is very slow).
173
+ CUDA_VISIBLE_DEVICES=0 python llama_inference_offload.py ${MODEL_DIR} --wbits 4 --groupsize 128 --load llama7b-4bit-128g.pt --text "this is llama" --pre_layer 16
174
+ # It takes about 180 seconds to generate 45 tokens (5 -> 50 tokens) with LLaMA-65B on a single RTX 3090 when pre_layer is set to 50.
175
+ ```
176
+ Basically, 4-bit quantization and 128 groupsize are recommended.
177
+
178
+ You can also export quantization parameters with toml+numpy format.
179
+ ```
180
+ CUDA_VISIBLE_DEVICES=0 python llama.py ${MODEL_DIR} c4 --wbits 4 --true-sequential --act-order --groupsize 128 --quant-directory ${TOML_DIR}
181
+ ```
182
+
183
+ # Acknowledgements
184
+ This code is based on [GPTQ](https://github.com/IST-DASLab/gptq)
185
+
186
+ Thanks to Meta AI for releasing [LLaMA](https://arxiv.org/abs/2302.13971), a powerful LLM.
187
+
188
+ Triton GPTQ kernel code is based on [GPTQ-triton](https://github.com/fpgaminer/GPTQ-triton)
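
The README above describes GPTQ as a one-shot weight quantization method. Concretely, the GPTQ paper it cites frames quantization as a per-layer reconstruction problem; a rough sketch of the objective (notation is illustrative, not taken from this upload):

```latex
% Layer-wise objective solved by GPTQ (sketch): find quantized weights W_q that
% best reproduce the layer's outputs on calibration activations X.
\hat{W} \;=\; \operatorname*{arg\,min}_{W_q}\; \lVert W X - W_q X \rVert_2^2,
\qquad W_q \ \text{constrained to a $b$-bit grid with per-group scale and zero-point}
```

The `--groupsize 128` flag in the commands above corresponds to sharing a scale and zero-point across groups of 128 input channels, which is why the effective bits per weight in the tables sits slightly above 4 (e.g. 4.15 BPW for GPTQ-128g).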
GPTQ-for-Qwen_hf/__pycache__/categories.cpython-310.pyc ADDED
Binary file (2.32 kB).
GPTQ-for-Qwen_hf/__pycache__/datautils.cpython-310.pyc ADDED
Binary file (15.7 kB).
GPTQ-for-Qwen_hf/__pycache__/evaluate_.cpython-310.pyc ADDED
Binary file (4.83 kB).
GPTQ-for-Qwen_hf/__pycache__/gptq.cpython-310.pyc ADDED
Binary file (6.31 kB).
GPTQ-for-Qwen_hf/__pycache__/gptq.cpython-38.pyc ADDED
Binary file (6.3 kB).
GPTQ-for-Qwen_hf/pip.log ADDED
@@ -0,0 +1 @@
1
+ pip install lm_eval==0.4.7
GPTQ-for-Qwen_hf/quant/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ from .quantizer import Quantizer
2
+ from .fused_attn import QuantLlamaAttention, make_quant_attn
3
+ from .fused_mlp import QuantLlamaMLP, make_fused_mlp, autotune_warmup_fused
4
+ from .quant_linear import QuantLinear, make_quant_linear, autotune_warmup_linear
5
+ from .triton_norm import TritonLlamaRMSNorm, make_quant_norm
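
These five lines are the package's entire public surface. A rough sketch of how they are typically wired together when loading a GPTQ checkpoint follows; it is an illustration only (the actual load path lives in qwen.py in this upload), and the helper name `load_quantized`, the layer-collection line, and the exact `make_quant_linear` signature are assumptions:

```python
import torch
from quant import (make_quant_linear, make_quant_attn, make_quant_norm,
                   make_fused_mlp, autotune_warmup_linear, autotune_warmup_fused)

def load_quantized(model, checkpoint_path, bits=4, groupsize=128, warmup_autotune=True):
    # Hypothetical sketch: swap nn.Linear layers for QuantLinear shells, then load
    # the packed GPTQ weights into them.
    layers = {name: m for name, m in model.named_modules() if isinstance(m, torch.nn.Linear)}
    make_quant_linear(model, layers, bits, groupsize)   # signature assumed
    model.load_state_dict(torch.load(checkpoint_path), strict=False)

    # Optional fused/Triton paths exported above:
    make_quant_attn(model)          # fuse q/k/v into one QuantLinear (fused_attn.py)
    make_quant_norm(model)          # swap RMSNorm for the Triton version (triton_norm.py)
    model = make_fused_mlp(model)   # fuse gate/up projections + SiLU (fused_mlp.py)

    if warmup_autotune:
        autotune_warmup_linear(model)   # pre-populate the Triton autotune caches
        autotune_warmup_fused(model)
    return model
```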
GPTQ-for-Qwen_hf/quant/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (575 Bytes).
GPTQ-for-Qwen_hf/quant/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (578 Bytes).
GPTQ-for-Qwen_hf/quant/__pycache__/custom_autotune.cpython-310.pyc ADDED
Binary file (7.94 kB).
GPTQ-for-Qwen_hf/quant/__pycache__/custom_autotune.cpython-38.pyc ADDED
Binary file (7.95 kB).
GPTQ-for-Qwen_hf/quant/__pycache__/fused_attn.cpython-310.pyc ADDED
Binary file (5.15 kB).
GPTQ-for-Qwen_hf/quant/__pycache__/fused_attn.cpython-38.pyc ADDED
Binary file (5.11 kB).
GPTQ-for-Qwen_hf/quant/__pycache__/fused_mlp.cpython-310.pyc ADDED
Binary file (7.2 kB).
GPTQ-for-Qwen_hf/quant/__pycache__/fused_mlp.cpython-38.pyc ADDED
Binary file (7.29 kB).
GPTQ-for-Qwen_hf/quant/__pycache__/quant_linear.cpython-310.pyc ADDED
Binary file (11 kB).
GPTQ-for-Qwen_hf/quant/__pycache__/quant_linear.cpython-38.pyc ADDED
Binary file (10.9 kB).
GPTQ-for-Qwen_hf/quant/__pycache__/quantizer.cpython-310.pyc ADDED
Binary file (3.48 kB).
GPTQ-for-Qwen_hf/quant/__pycache__/quantizer.cpython-38.pyc ADDED
Binary file (3.47 kB).
GPTQ-for-Qwen_hf/quant/__pycache__/triton_norm.cpython-310.pyc ADDED
Binary file (2.67 kB).
GPTQ-for-Qwen_hf/quant/__pycache__/triton_norm.cpython-38.pyc ADDED
Binary file (2.64 kB).
GPTQ-for-Qwen_hf/quant/custom_autotune.py ADDED
@@ -0,0 +1,194 @@
1
+ #https://github.com/fpgaminer/GPTQ-triton
2
+ """
3
+ Mostly the same as the autotuner in Triton, but with a few changes like using 40 runs instead of 100.
4
+ """
5
+
6
+ import builtins
7
+ import math
8
+ import time
9
+ from typing import Dict
10
+
11
+ import triton
12
+
13
+
14
+ class Autotuner(triton.KernelInterface):
15
+
16
+ def __init__(self, fn, arg_names, configs, key, reset_to_zero, prune_configs_by: Dict = None, nearest_power_of_two: bool = False):
17
+ '''
18
+ :param prune_configs_by: a dict of functions that are used to prune configs, fields:
19
+ 'perf_model': performance model used to predict running time with different configs, returns running time
20
+ 'top_k': number of configs to bench
21
+ 'prune_num_stages_by'(optional): a function used to prune num_stages. It takes configs: List[Config] as input and returns pruned configs.
22
+ 'nearest_power_of_two'(optional): whether to round key arguments to the nearest power of two when caching tuning results
23
+ '''
24
+ if not configs:
25
+ self.configs = [triton.Config({}, num_warps=4, num_stages=2)]
26
+ else:
27
+ self.configs = configs
28
+ self.key_idx = [arg_names.index(k) for k in key]
29
+ self.nearest_power_of_two = nearest_power_of_two
30
+ self.cache = {}
31
+ # hook to reset all required tensor to zeros before relaunching a kernel
32
+ self.hook = lambda args: 0
33
+ if reset_to_zero is not None:
34
+ self.reset_idx = [arg_names.index(k) for k in reset_to_zero]
35
+
36
+ def _hook(args):
37
+ for i in self.reset_idx:
38
+ args[i].zero_()
39
+
40
+ self.hook = _hook
41
+ self.arg_names = arg_names
42
+ # prune configs
43
+ if prune_configs_by:
44
+ perf_model, top_k = prune_configs_by['perf_model'], prune_configs_by['top_k']
45
+ if 'early_config_prune' in prune_configs_by:
46
+ early_config_prune = prune_configs_by['early_config_prune']
47
+ else:
48
+ perf_model, top_k, early_config_prune = None, None, None
49
+ self.perf_model, self.configs_top_k = perf_model, top_k
50
+ self.early_config_prune = early_config_prune
51
+ self.fn = fn
52
+
53
+ def _bench(self, *args, config, **meta):
54
+ # check for conflicts, i.e. meta-parameters both provided
55
+ # as kwargs and by the autotuner
56
+ conflicts = meta.keys() & config.kwargs.keys()
57
+ if conflicts:
58
+ raise ValueError(f"Conflicting meta-parameters: {', '.join(conflicts)}."
59
+ " Make sure that you don't re-define auto-tuned symbols.")
60
+ # augment meta-parameters with tunable ones
61
+ current = dict(meta, **config.kwargs)
62
+
63
+ def kernel_call():
64
+ if config.pre_hook:
65
+ config.pre_hook(self.nargs)
66
+ self.hook(args)
67
+ self.fn.run(*args, num_warps=config.num_warps, num_stages=config.num_stages, **current)
68
+
69
+ try:
70
+ # In testing, using only 40 reps seems to be close enough, and it appears to be what PyTorch uses
71
+ # PyTorch also sets fast_flush to True, but I didn't see any speedup so I'll leave the default
72
+ # return triton.testing.do_bench(kernel_call, percentiles=(0.5, 0.2, 0.8), rep=40)
73
+ return tuple(triton.testing.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8),rep=40))
74
+ except triton.OutOfResources:
75
+ return (float('inf'), float('inf'), float('inf'))
76
+
77
+ def run(self, *args, **kwargs):
78
+ self.nargs = dict(zip(self.arg_names, args))
79
+ if len(self.configs) > 1:
80
+ key = tuple(args[i] for i in self.key_idx)
81
+
82
+ # This reduces the amount of autotuning by rounding the keys to the nearest power of two
83
+ # In my testing this gives decent results, and greatly reduces the amount of tuning required
84
+ if self.nearest_power_of_two:
85
+ key = tuple([2**int(math.log2(x) + 0.5) for x in key])
86
+
87
+ if key not in self.cache:
88
+ # prune configs
89
+ pruned_configs = self.prune_configs(kwargs)
90
+ bench_start = time.time()
91
+ timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
92
+ bench_end = time.time()
93
+ self.bench_time = bench_end - bench_start
94
+ self.cache[key] = builtins.min(timings, key=timings.get)
95
+ self.hook(args)
96
+ self.configs_timings = timings
97
+ config = self.cache[key]
98
+ else:
99
+ config = self.configs[0]
100
+ self.best_config = config
101
+ if config.pre_hook is not None:
102
+ config.pre_hook(self.nargs)
103
+ return self.fn.run(*args, num_warps=config.num_warps, num_stages=config.num_stages, **kwargs, **config.kwargs)
104
+
105
+ def prune_configs(self, kwargs):
106
+ pruned_configs = self.configs
107
+ if self.early_config_prune:
108
+ pruned_configs = self.early_config_prune(self.configs, self.nargs)
109
+ if self.perf_model:
110
+ top_k = self.configs_top_k
111
+ if isinstance(top_k, float) and top_k <= 1.0:
112
+ top_k = int(len(self.configs) * top_k)
113
+ if len(pruned_configs) > top_k:
114
+ est_timing = {config: self.perf_model(**self.nargs, **kwargs, **config.kwargs, num_stages=config.num_stages, num_warps=config.num_warps) for config in pruned_configs}
115
+ pruned_configs = sorted(est_timing.keys(), key=lambda x: est_timing[x])[:top_k]
116
+ return pruned_configs
117
+
118
+ def warmup(self, *args, **kwargs):
119
+ self.nargs = dict(zip(self.arg_names, args))
120
+ for config in self.prune_configs(kwargs):
121
+ self.fn.warmup(
122
+ *args,
123
+ num_warps=config.num_warps,
124
+ num_stages=config.num_stages,
125
+ **kwargs,
126
+ **config.kwargs,
127
+ )
128
+ self.nargs = None
129
+
130
+
131
+ def autotune(configs, key, prune_configs_by=None, reset_to_zero=None, nearest_power_of_two=False):
132
+ """
133
+ Decorator for auto-tuning a :code:`triton.jit`'d function.
134
+ .. highlight:: python
135
+ .. code-block:: python
136
+ @triton.autotune(configs=[
137
+ triton.Config(meta={'BLOCK_SIZE': 128}, num_warps=4),
138
+ triton.Config(meta={'BLOCK_SIZE': 1024}, num_warps=8),
139
+ ],
140
+ key=['x_size'] # the two above configs will be evaluated anytime
141
+ # the value of x_size changes
142
+ )
143
+ @triton.jit
144
+ def kernel(x_ptr, x_size, **META):
145
+ BLOCK_SIZE = META['BLOCK_SIZE']
146
+ :note: When all the configurations are evaluated, the kernel will run multiple times.
147
+ This means that whatever value the kernel updates will be updated multiple times.
148
+ To avoid this undesired behavior, you can use the `reset_to_zero` argument, which
149
+ resets the value of the provided tensor to `zero` before running any configuration.
150
+ :param configs: a list of :code:`triton.Config` objects
151
+ :type configs: list[triton.Config]
152
+ :param key: a list of argument names whose change in value will trigger the evaluation of all provided configs.
153
+ :type key: list[str]
154
+ :param prune_configs_by: a dict of functions that are used to prune configs, fields:
155
+ 'perf_model': performance model used to predict running time with different configs, returns running time
156
+ 'top_k': number of configs to bench
157
+ 'early_config_prune'(optional): a function used for early pruning (e.g., by num_stages). It takes configs: List[Config] as input and returns pruned configs.
158
+ :param reset_to_zero: a list of argument names whose value will be reset to zero before evaluating any configs.
159
+ :type reset_to_zero: list[str]
160
+ """
161
+
162
+ def decorator(fn):
163
+ return Autotuner(fn, fn.arg_names, configs, key, reset_to_zero, prune_configs_by, nearest_power_of_two)
164
+
165
+ return decorator
166
+
167
+
168
+ def matmul248_kernel_config_pruner(configs, nargs):
169
+ """
170
+ The main purpose of this function is to shrink BLOCK_SIZE_* when the corresponding dimension is smaller.
171
+ """
172
+ m = max(2**int(math.ceil(math.log2(nargs['M']))), 16)
173
+ n = max(2**int(math.ceil(math.log2(nargs['N']))), 16)
174
+ k = max(2**int(math.ceil(math.log2(nargs['K']))), 16)
175
+
176
+ used = set()
177
+ for config in configs:
178
+ block_size_m = min(m, config.kwargs['BLOCK_SIZE_M'])
179
+ block_size_n = min(n, config.kwargs['BLOCK_SIZE_N'])
180
+ block_size_k = min(k, config.kwargs['BLOCK_SIZE_K'])
181
+ group_size_m = config.kwargs['GROUP_SIZE_M']
182
+
183
+ if (block_size_m, block_size_n, block_size_k, group_size_m, config.num_stages, config.num_warps) in used:
184
+ continue
185
+
186
+ used.add((block_size_m, block_size_n, block_size_k, group_size_m, config.num_stages, config.num_warps))
187
+ yield triton.Config({
188
+ 'BLOCK_SIZE_M': block_size_m,
189
+ 'BLOCK_SIZE_N': block_size_n,
190
+ 'BLOCK_SIZE_K': block_size_k,
191
+ 'GROUP_SIZE_M': group_size_m
192
+ },
193
+ num_stages=config.num_stages,
194
+ num_warps=config.num_warps)
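
To make the pruner's effect concrete, here is a small, hypothetical example (it assumes the repo's requirements are installed and that it is run from the GPTQ-for-Qwen_hf directory so that `quant.custom_autotune` is importable). With M=1, as in a single-token decode step, the M dimension is clamped to the pruner's floor of 16, both configs become identical, and the duplicate is dropped:

```python
import triton
from quant.custom_autotune import matmul248_kernel_config_pruner

configs = [
    triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8},
                  num_stages=4, num_warps=4),
    triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8},
                  num_stages=4, num_warps=4),
]
pruned = list(matmul248_kernel_config_pruner(configs, {'M': 1, 'N': 4096, 'K': 4096}))
print(len(pruned))        # 1 -- both configs collapsed after BLOCK_SIZE_M was clamped to 16
print(pruned[0].kwargs)   # {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}
```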
GPTQ-for-Qwen_hf/quant/fused_attn.py ADDED
@@ -0,0 +1,204 @@
1
+ from torch.nn import functional as F
2
+ from transformers.models.llama.modeling_llama import LlamaAttention
3
+ from .quant_linear import *
4
+ import triton
5
+ import triton.language as tl
6
+
7
+
8
+ @triton.jit
9
+ def rotate_half_kernel(
10
+ qk_seq_ptr,
11
+ position_ids_ptr,
12
+ qk_seq_stride,
13
+ position_ids_batch_stride,
14
+ seq_len,
15
+ HEAD_DIM: tl.constexpr,
16
+ BLOCK_HEIGHT: tl.constexpr,
17
+ BLOCK_WIDTH: tl.constexpr,
18
+ INV_BASE: tl.constexpr
19
+ ):
20
+ # qk_seq_ptr: (bsz, seq_len, 2, num_heads, head_dim) -- OK to be discontinuous in 2nd dimension.
21
+ # position ids: (bsz, seq_len) -- must be contiguous in the last dimension.
22
+
23
+ HALF_HEAD: tl.constexpr = HEAD_DIM // 2
24
+ STEPS_PER_ROW: tl.constexpr = HALF_HEAD // BLOCK_WIDTH
25
+
26
+ batch_seq = tl.program_id(axis=0)
27
+ row_blk_x_col_blk = tl.program_id(axis=1)
28
+
29
+ row_blk = row_blk_x_col_blk // STEPS_PER_ROW
30
+ row = row_blk * BLOCK_HEIGHT
31
+ if BLOCK_WIDTH < HALF_HEAD:
32
+ col_blk = row_blk_x_col_blk % STEPS_PER_ROW
33
+ col = col_blk * BLOCK_WIDTH
34
+ else:
35
+ col: tl.constexpr = 0
36
+
37
+ # A block will never cross a sequence boundary, which simplifies things a lot.
38
+ batch = batch_seq // seq_len
39
+ seq = batch_seq % seq_len
40
+ position_id = tl.load(position_ids_ptr + batch * position_ids_batch_stride + seq)
41
+ # As sometimes happens, just calculating this on the fly is faster than loading it from memory.
42
+ # Use `tl.libdevice.exp` rather than `tl.exp` -- the latter is less accurate.
43
+ freq = tl.libdevice.exp((col + tl.arange(0, BLOCK_WIDTH)).to(tl.float32) * INV_BASE) * position_id
44
+ cos = tl.cos(freq).to(tl.float32)
45
+ sin = tl.sin(freq).to(tl.float32)
46
+
47
+ col_offsets: tl.constexpr = tl.arange(0, BLOCK_WIDTH)
48
+ embed_offsets = (row * HEAD_DIM + col) + col_offsets
49
+ x_ptrs = (qk_seq_ptr + batch_seq * qk_seq_stride) + embed_offsets
50
+
51
+ for k in range(0, BLOCK_HEIGHT):
52
+ x = tl.load(x_ptrs).to(tl.float32)
53
+ y = tl.load(x_ptrs + HALF_HEAD).to(tl.float32)
54
+ out_x = x * cos - y * sin
55
+ tl.store(x_ptrs, out_x)
56
+ out_y = x * sin + y * cos
57
+ tl.store(x_ptrs + HALF_HEAD, out_y)
58
+ x_ptrs += HEAD_DIM
59
+
60
+
61
+ def triton_rotate_half_(qk, position_ids, config=None):
62
+ with torch.cuda.device(qk.device):
63
+ batch_size, seq_len, qandk, num_heads, head_dim = qk.shape
64
+
65
+ # This default is the fastest for most job sizes, at least on my RTX 4090, and when it's not it's within spitting distance of the best option. There are some odd cases where having a block height of 2 or 4 helps but the difference is within 5%. It makes sense that this configuration is fast from a memory bandwidth and caching perspective.
66
+ config = config or {'BLOCK_HEIGHT': 1, 'BLOCK_WIDTH': min(128, head_dim // 2), 'num_warps': 1}
67
+ config['BLOCK_HEIGHT'] = min(config['BLOCK_HEIGHT'], 2 * num_heads)
68
+
69
+ assert qk.stride(3) == head_dim
70
+ assert qk.stride(4) == 1
71
+ assert position_ids.shape == (batch_size, seq_len)
72
+ assert position_ids.stride(1) == 1, 'position_ids must be contiguous in the last dimension'
73
+ assert (2 * num_heads) % config['BLOCK_HEIGHT'] == 0, f'number of rows not evenly divisible by {config["BLOCK_HEIGHT"]}'
74
+ assert (head_dim // 2) % config['BLOCK_WIDTH'] == 0, f'number of columns ({head_dim // 2}) not evenly divisible by {config["BLOCK_WIDTH"]}'
75
+
76
+ qk_by_seq = qk.view(batch_size * seq_len, 2 * num_heads * head_dim)
77
+ grid = (qk_by_seq.shape[0], (2 * num_heads // config['BLOCK_HEIGHT']) * (head_dim // 2 // config['BLOCK_WIDTH']))
78
+
79
+ # Must be the same as the theta of the frequencies used to train the model.
80
+ BASE = 10000.0
81
+
82
+ rotate_half_kernel[grid](
83
+ qk_by_seq,
84
+ position_ids,
85
+ qk_by_seq.stride(0),
86
+ position_ids.stride(0),
87
+ seq_len,
88
+ HEAD_DIM=head_dim,
89
+ BLOCK_HEIGHT=config['BLOCK_HEIGHT'],
90
+ BLOCK_WIDTH=config['BLOCK_WIDTH'],
91
+ INV_BASE=-2.0 * math.log(BASE) / head_dim,
92
+ num_warps=config['num_warps']
93
+ )
94
+
95
+
96
+ class QuantLlamaAttention(nn.Module):
97
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
98
+
99
+ def __init__(
100
+ self,
101
+ hidden_size,
102
+ num_heads,
103
+ qkv_proj,
104
+ o_proj
105
+ ):
106
+ super().__init__()
107
+ self.hidden_size = hidden_size
108
+ self.num_heads = num_heads
109
+ self.head_dim = hidden_size // num_heads
110
+
111
+ if (self.head_dim * num_heads) != self.hidden_size:
112
+ raise ValueError(f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
113
+ f" and `num_heads`: {num_heads}).")
114
+ self.qkv_proj = qkv_proj
115
+ self.o_proj = o_proj
116
+
117
+ def forward(self, hidden_states, past_key_value=None, attention_mask=None, position_ids=None, output_attentions=False, use_cache=False):
118
+ """Input shape: Batch x Time x Channel"""
119
+
120
+ bsz, q_len, _ = hidden_states.size()
121
+
122
+ qkv_states = self.qkv_proj(hidden_states)
123
+ qkv_states = qkv_states.view(bsz, q_len, 3, self.num_heads, self.head_dim)
124
+
125
+ # This updates the query and key states in-place, saving VRAM.
126
+ triton_rotate_half_(qkv_states[:, :, :2], position_ids)
127
+
128
+ query_states, key_states, value_states = torch.split(qkv_states, 1, dim=2)
129
+ del qkv_states
130
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
131
+ key_states = key_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
132
+ value_states = value_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
133
+
134
+ is_causal = past_key_value is None
135
+
136
+ kv_seq_len = q_len
137
+ if past_key_value is not None:
138
+ kv_seq_len += past_key_value[0].shape[-2]
139
+
140
+ if past_key_value is not None:
141
+ # reuse k, v, self_attention
142
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
143
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
144
+
145
+ if use_cache:
146
+ # Since qkv_proj is fused, query_states etc will hold a reference to the original qkv_states tensor
147
+ # which can cause excessive memory usage by the cache. `contiguous` is a convenient way to workaround this.
148
+ key_states = key_states.contiguous()
149
+ value_states = value_states.contiguous()
150
+ query_states = query_states.contiguous()
151
+
152
+ past_key_value = (key_states, value_states) if use_cache else None
153
+
154
+ with torch.backends.cuda.sdp_kernel(enable_math=False):
155
+ attn_output = F.scaled_dot_product_attention(query_states, key_states, value_states, is_causal=is_causal)
156
+ del query_states, key_states, value_states
157
+
158
+ attn_output = attn_output.transpose(1, 2).reshape(bsz, q_len, self.hidden_size)
159
+ attn_output = self.o_proj(attn_output)
160
+
161
+ return attn_output, None, past_key_value
162
+
163
+
164
+ def make_quant_attn(model):
165
+ """
166
+ Replace all LlamaAttention modules with QuantLlamaAttention modules, fusing the q, k, v projections.
167
+ """
168
+
169
+ for name, m in model.named_modules():
170
+ if not isinstance(m, LlamaAttention):
171
+ continue
172
+
173
+ q_proj = m.q_proj
174
+ k_proj = m.k_proj
175
+ v_proj = m.v_proj
176
+
177
+ qweights = torch.cat([q_proj.qweight, k_proj.qweight, v_proj.qweight], dim=1)
178
+ qzeros = torch.cat([q_proj.qzeros, k_proj.qzeros, v_proj.qzeros], dim=1)
179
+ scales = torch.cat([q_proj.scales, k_proj.scales, v_proj.scales], dim=1)
180
+ g_idx = torch.cat([q_proj.g_idx, k_proj.g_idx, v_proj.g_idx], dim=0)
181
+ bias = torch.cat([q_proj.bias, k_proj.bias, v_proj.bias], dim=0) if q_proj.bias is not None else None
182
+
183
+ qkv_layer = QuantLinear(q_proj.bits, q_proj.groupsize, q_proj.infeatures, q_proj.outfeatures + k_proj.outfeatures + v_proj.outfeatures, True if q_proj.bias is not None else False)
184
+ qkv_layer.qweight = qweights
185
+ qkv_layer.qzeros = qzeros
186
+ qkv_layer.scales = scales
187
+ qkv_layer.g_idx = g_idx
188
+ qkv_layer.bias = bias
189
+ # We're dropping the rotary embedding layer m.rotary_emb here. We don't need it in the triton branch.
190
+
191
+ attn = QuantLlamaAttention(m.hidden_size, m.num_heads, qkv_layer, m.o_proj)
192
+
193
+ if '.' in name:
194
+ parent_name = name.rsplit('.', 1)[0]
195
+ child_name = name[len(parent_name) + 1:]
196
+ parent = model.get_submodule(parent_name)
197
+ else:
198
+ parent_name = ''
199
+ parent = model
200
+ child_name = name
201
+
202
+ #print(f"Replacing {name} with quant_attn; parent: {parent_name}, child's name: {child_name}")
203
+
204
+ setattr(parent, child_name, attn)
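
For comparison with the stock Hugging Face attention, the following is a plain-PyTorch sketch (not part of the upload) of what `rotate_half_kernel` / `triton_rotate_half_` compute: an in-place rotary embedding applied to the fused query/key slice, with the base fixed at 10000.0 as in the kernel above.

```python
import torch

def rotate_half_reference(qk: torch.Tensor, position_ids: torch.Tensor, base: float = 10000.0) -> None:
    """In-place RoPE on the fused query/key slice, mirroring triton_rotate_half_.

    qk: (bsz, seq_len, 2, num_heads, head_dim) -- e.g. qkv_states[:, :, :2]
    position_ids: (bsz, seq_len)
    """
    head_dim = qk.shape[-1]
    half = head_dim // 2
    # Same frequencies as the kernel: exp(j * INV_BASE) with INV_BASE = -2*ln(base)/head_dim.
    inv_freq = base ** (-2.0 * torch.arange(half, dtype=torch.float32, device=qk.device) / head_dim)
    freq = position_ids.to(torch.float32)[:, :, None] * inv_freq      # (bsz, seq_len, half)
    cos = freq.cos()[:, :, None, None, :]                             # broadcast over (q/k, heads)
    sin = freq.sin()[:, :, None, None, :]
    x = qk[..., :half].float()
    y = qk[..., half:].float()
    new_x = x * cos - y * sin
    new_y = x * sin + y * cos
    qk[..., :half] = new_x.to(qk.dtype)
    qk[..., half:] = new_y.to(qk.dtype)
```

Called as `rotate_half_reference(qkv_states[:, :, :2], position_ids)`, this should match the Triton path up to floating-point precision.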
GPTQ-for-Qwen_hf/quant/fused_mlp.py ADDED
@@ -0,0 +1,288 @@
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn as nn
4
+ from torch.cuda.amp import custom_bwd, custom_fwd
5
+ from transformers.models.llama.modeling_llama import LlamaMLP
6
+
7
+ try:
8
+ import triton
9
+ import triton.language as tl
10
+ from . import custom_autotune
11
+
12
+ # code based on https://github.com/fpgaminer/GPTQ-triton
13
+ @custom_autotune.autotune(
14
+ configs=[
15
+ triton.Config({
16
+ 'BLOCK_SIZE_M': 256,
17
+ 'BLOCK_SIZE_N': 64,
18
+ 'BLOCK_SIZE_K': 32,
19
+ 'GROUP_SIZE_M': 8
20
+ }, num_stages=4, num_warps=4),
21
+ triton.Config({
22
+ 'BLOCK_SIZE_M': 64,
23
+ 'BLOCK_SIZE_N': 256,
24
+ 'BLOCK_SIZE_K': 32,
25
+ 'GROUP_SIZE_M': 8
26
+ }, num_stages=4, num_warps=4),
27
+ triton.Config({
28
+ 'BLOCK_SIZE_M': 128,
29
+ 'BLOCK_SIZE_N': 128,
30
+ 'BLOCK_SIZE_K': 32,
31
+ 'GROUP_SIZE_M': 8
32
+ }, num_stages=4, num_warps=4),
33
+ triton.Config({
34
+ 'BLOCK_SIZE_M': 128,
35
+ 'BLOCK_SIZE_N': 64,
36
+ 'BLOCK_SIZE_K': 32,
37
+ 'GROUP_SIZE_M': 8
38
+ }, num_stages=4, num_warps=4),
39
+ triton.Config({
40
+ 'BLOCK_SIZE_M': 64,
41
+ 'BLOCK_SIZE_N': 128,
42
+ 'BLOCK_SIZE_K': 32,
43
+ 'GROUP_SIZE_M': 8
44
+ }, num_stages=4, num_warps=4),
45
+ triton.Config({
46
+ 'BLOCK_SIZE_M': 128,
47
+ 'BLOCK_SIZE_N': 32,
48
+ 'BLOCK_SIZE_K': 32,
49
+ 'GROUP_SIZE_M': 8
50
+ }, num_stages=4, num_warps=4), # 3090
51
+ triton.Config({
52
+ 'BLOCK_SIZE_M': 128,
53
+ 'BLOCK_SIZE_N': 16,
54
+ 'BLOCK_SIZE_K': 32,
55
+ 'GROUP_SIZE_M': 8
56
+ }, num_stages=4, num_warps=4), # 3090
57
+ triton.Config({
58
+ 'BLOCK_SIZE_M': 32,
59
+ 'BLOCK_SIZE_N': 32,
60
+ 'BLOCK_SIZE_K': 128,
61
+ 'GROUP_SIZE_M': 8
62
+ }, num_stages=2, num_warps=4), # 3090
63
+ triton.Config({
64
+ 'BLOCK_SIZE_M': 64,
65
+ 'BLOCK_SIZE_N': 16,
66
+ 'BLOCK_SIZE_K': 64,
67
+ 'GROUP_SIZE_M': 8
68
+ }, num_stages=4, num_warps=4), # 3090
69
+ triton.Config({
70
+ 'BLOCK_SIZE_M': 64,
71
+ 'BLOCK_SIZE_N': 32,
72
+ 'BLOCK_SIZE_K': 64,
73
+ 'GROUP_SIZE_M': 8
74
+ }, num_stages=4, num_warps=4), # 3090
75
+ ],
76
+ key=['M', 'N', 'K'],
77
+ nearest_power_of_two=True,
78
+ prune_configs_by={
79
+ 'early_config_prune': custom_autotune.matmul248_kernel_config_pruner,
80
+ 'perf_model': None,
81
+ 'top_k': None,
82
+ },
83
+ )
84
+ @triton.jit
85
+ def fusedmatmul_248_kernel(a_ptr, c_ptr, b1_ptr, scales1_ptr, zeros1_ptr, g1_ptr, b2_ptr, scales2_ptr, zeros2_ptr, g2_ptr, M, N, K, bits, maxq, stride_am, stride_ak, stride_bk, stride_bn,
86
+ stride_cm, stride_cn, stride_scales, stride_zeros, BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr, GROUP_SIZE_M: tl.constexpr):
87
+ """
88
+ Computes: C = silu(A * B1) * (A * B2)
89
+ A is of shape (M, K) float16
90
+ B is of shape (K//8, N) int32
91
+ C is of shape (M, N) float16
92
+ scales is of shape (1, N) float16
93
+ zeros is of shape (1, N//8) int32
94
+ """
95
+ infearure_per_bits = 32 // bits
96
+
97
+ pid = tl.program_id(axis=0)
98
+ num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
99
+ num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
100
+ num_pid_k = tl.cdiv(K, BLOCK_SIZE_K)
101
+ num_pid_in_group = GROUP_SIZE_M * num_pid_n
102
+ group_id = pid // num_pid_in_group
103
+ first_pid_m = group_id * GROUP_SIZE_M
104
+ group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
105
+ pid_m = first_pid_m + (pid % group_size_m)
106
+ pid_n = (pid % num_pid_in_group) // group_size_m
107
+
108
+ offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
109
+ offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
110
+ offs_k = tl.arange(0, BLOCK_SIZE_K)
111
+ a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak) # (BLOCK_SIZE_M, BLOCK_SIZE_K)
112
+ a_mask = (offs_am[:, None] < M)
113
+ # b_ptrs is set up such that it repeats elements along the K axis 8 times
114
+ b1_ptrs = b1_ptr + ((offs_k[:, None] // infearure_per_bits) * stride_bk + offs_bn[None, :] * stride_bn)
115
+ b2_ptrs = b2_ptr + ((offs_k[:, None] // infearure_per_bits) * stride_bk + offs_bn[None, :] * stride_bn)
116
+ g1_ptrs = g1_ptr + offs_k
117
+ g2_ptrs = g2_ptr + offs_k
118
+ # shifter is used to extract the N bits of each element in the 32-bit word from B
119
+ scales1_ptrs = scales1_ptr + offs_bn[None, :]
120
+ scales2_ptrs = scales2_ptr + offs_bn[None, :]
121
+ zeros1_ptrs = zeros1_ptr + (offs_bn[None, :] // infearure_per_bits)
122
+ zeros2_ptrs = zeros2_ptr + (offs_bn[None, :] // infearure_per_bits)
123
+
124
+ shifter = (offs_k % infearure_per_bits) * bits
125
+ zeros_shifter = (offs_bn % infearure_per_bits) * bits
126
+ accumulator1 = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
127
+ accumulator2 = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
128
+ for k in range(0, num_pid_k):
129
+ g1_idx = tl.load(g1_ptrs)
130
+ g2_idx = tl.load(g2_ptrs)
131
+
132
+ # Fetch scales and zeros; these are per-outfeature and thus reused in the inner loop
133
+ scales1 = tl.load(scales1_ptrs + g1_idx[:, None] * stride_scales) # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
134
+ scales2 = tl.load(scales2_ptrs + g2_idx[:, None] * stride_scales)
135
+
136
+ zeros1 = tl.load(zeros1_ptrs + g1_idx[:, None] * stride_zeros) # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
137
+ zeros1 = (zeros1 >> zeros_shifter[None, :]) & maxq
138
+ zeros1 = (zeros1 + 1)
139
+
140
+ zeros2 = tl.load(zeros2_ptrs + g2_idx[:, None] * stride_zeros) # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
141
+ zeros2 = (zeros2 >> zeros_shifter[None, :]) & maxq
142
+ zeros2 = (zeros2 + 1)
143
+
144
+ a = tl.load(a_ptrs, mask=a_mask, other=0.) # (BLOCK_SIZE_M, BLOCK_SIZE_K)
145
+ b1 = tl.load(b1_ptrs) # (BLOCK_SIZE_K, BLOCK_SIZE_N), but repeated
146
+ b2 = tl.load(b2_ptrs)
147
+
148
+ # Now we need to unpack b (which is N-bit values) into 32-bit values
149
+ b1 = (b1 >> shifter[:, None]) & maxq # Extract the N-bit values
150
+ b1 = (b1 - zeros1) * scales1 # Scale and shift
151
+ accumulator1 += tl.dot(a, b1)
152
+
153
+ b2 = (b2 >> shifter[:, None]) & maxq
154
+ b2 = (b2 - zeros2) * scales2
155
+ accumulator2 += tl.dot(a, b2)
156
+
157
+ a_ptrs += BLOCK_SIZE_K
158
+ b1_ptrs += (BLOCK_SIZE_K // infearure_per_bits) * stride_bk
159
+ b2_ptrs += (BLOCK_SIZE_K // infearure_per_bits) * stride_bk
160
+ g1_ptrs += BLOCK_SIZE_K
161
+ g2_ptrs += BLOCK_SIZE_K
162
+
163
+ accumulator1 = silu(accumulator1)
164
+ c = accumulator1 * accumulator2
165
+ c = c.to(tl.float16)
166
+ c_ptrs = c_ptr + stride_cm * offs_am[:, None] + stride_cn * offs_bn[None, :]
167
+ c_mask = (offs_am[:, None] < M) & (offs_bn[None, :] < N)
168
+ tl.store(c_ptrs, c, mask=c_mask)
169
+
170
+ @triton.jit
171
+ def silu(x):
172
+ return x * tl.sigmoid(x)
173
+ except ImportError:
174
+ print('triton not installed.')
175
+
176
+
177
+ class QuantLlamaMLP(nn.Module):
178
+
179
+ def __init__(
180
+ self,
181
+ gate_proj,
182
+ down_proj,
183
+ up_proj,
184
+ ):
185
+ super().__init__()
186
+ self.register_buffer('gate_proj_qweight', gate_proj.qweight)
187
+ self.register_buffer('gate_proj_scales', gate_proj.scales)
188
+ self.register_buffer('gate_proj_qzeros', gate_proj.qzeros)
189
+ self.register_buffer('gate_proj_g_idx', gate_proj.g_idx)
190
+ self.register_buffer('up_proj_qweight', up_proj.qweight)
191
+ self.register_buffer('up_proj_scales', up_proj.scales)
192
+ self.register_buffer('up_proj_qzeros', up_proj.qzeros)
193
+ self.register_buffer('up_proj_g_idx', up_proj.g_idx)
194
+
195
+ self.infeatures = gate_proj.infeatures
196
+ self.intermediate_size = gate_proj.outfeatures
197
+ self.outfeatures = down_proj.outfeatures
198
+ self.bits = gate_proj.bits
199
+ self.maxq = gate_proj.maxq
200
+
201
+ self.down_proj = down_proj
202
+
203
+ def forward(self, x):
204
+ return self.down_proj(self.triton_llama_mlp(x))
205
+
206
+ def triton_llama_mlp(self, x):
207
+ with torch.cuda.device(x.device):
208
+ out_shape = x.shape[:-1] + (self.intermediate_size, )
209
+ x = x.reshape(-1, x.shape[-1])
210
+ M, K = x.shape
211
+ N = self.intermediate_size
212
+ c = torch.empty((M, N), device=x.device, dtype=torch.float16)
213
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(N, META['BLOCK_SIZE_N']), )
214
+ fusedmatmul_248_kernel[grid](x, c, self.gate_proj_qweight, self.gate_proj_scales, self.gate_proj_qzeros, self.gate_proj_g_idx, self.up_proj_qweight, self.up_proj_scales,
215
+ self.up_proj_qzeros, self.up_proj_g_idx, M, N, K, self.bits, self.maxq, x.stride(0), x.stride(1), self.gate_proj_qweight.stride(0),
216
+ self.gate_proj_qweight.stride(1), c.stride(0), c.stride(1), self.gate_proj_scales.stride(0), self.gate_proj_qzeros.stride(0))
217
+ c = c.reshape(out_shape)
218
+ return c
219
+
220
+ def fused2cuda(self):
221
+ self.gate_proj_qweight = self.gate_proj_qweight.cuda()
222
+ self.gate_proj_scales = self.gate_proj_scales.cuda()
223
+ self.gate_proj_qzeros = self.gate_proj_qzeros.cuda()
224
+ self.gate_proj_g_idx = self.gate_proj_g_idx.cuda()
225
+ self.up_proj_qweight = self.up_proj_qweight.cuda()
226
+ self.up_proj_scales = self.up_proj_scales.cuda()
227
+ self.up_proj_qzeros = self.up_proj_qzeros.cuda()
228
+ self.up_proj_g_idx = self.up_proj_g_idx.cuda()
229
+
230
+ def fused2cpu(self):
231
+ self.gate_proj_qweight = self.gate_proj_qweight.cpu()
232
+ self.gate_proj_scales = self.gate_proj_scales.cpu()
233
+ self.gate_proj_qzeros = self.gate_proj_qzeros.cpu()
234
+ self.gate_proj_g_idx = self.gate_proj_g_idx.cpu()
235
+ self.up_proj_qweight = self.up_proj_qweight.cpu()
236
+ self.up_proj_scales = self.up_proj_scales.cpu()
237
+ self.up_proj_qzeros = self.up_proj_qzeros.cpu()
238
+ self.up_proj_g_idx = self.up_proj_g_idx.cpu()
239
+
240
+
241
+ def make_fused_mlp(m, parent_name=''):
242
+ """
243
+ Replace all LlamaMLP modules with QuantLlamaMLP modules, which fuses many of the operations.
244
+ """
245
+ if isinstance(m, LlamaMLP):
246
+ return QuantLlamaMLP(m.gate_proj, m.down_proj, m.up_proj)
247
+
248
+ for name, child in m.named_children():
249
+ child = make_fused_mlp(child, parent_name=f"{parent_name}.{name}")
250
+
251
+ if isinstance(child, QuantLlamaMLP):
252
+ setattr(m, name, child)
253
+ return m
254
+
255
+
256
+ def autotune_warmup_fused(model):
257
+ """
258
+ Pre-tunes the quantized kernel
259
+ """
260
+ from tqdm import tqdm
261
+
262
+ kn_values = {}
263
+
264
+ for _, m in model.named_modules():
265
+ if not isinstance(m, QuantLlamaMLP):
266
+ continue
267
+
268
+ k = m.infeatures
269
+ n = m.intermediate_size
270
+
271
+ m.fused2cuda()
272
+ if (k, n) not in kn_values:
273
+ kn_values[(k, n)] = m
274
+
275
+ print(f'Found {len(kn_values)} unique fused mlp KN values.')
276
+
277
+ print('Warming up autotune cache ...')
278
+ with torch.no_grad():
279
+ for m in tqdm(range(0, 12)):
280
+ m = 2**m # [1, 2048]
281
+ for (k, n), (modules) in kn_values.items():
282
+ a = torch.randn(m, k, dtype=torch.float16, device='cuda')
283
+ modules.triton_llama_mlp(a)
284
+
285
+ for (k, n), (modules) in kn_values.items():
287
+ modules.fused2cpu()
288
+ del kn_values
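
A minimal sketch (not part of the upload) of how this fused MLP pass might be wired up once the packed weights are loaded. Note that make_fused_mlp checks isinstance(m, LlamaMLP), so on the Qwen3 modules used elsewhere in this upload it is effectively a no-op (consistent with the "Found 0 unique fused mlp KN values." line in the load-test log); the sketch assumes a Llama-architecture model whose MLP projections are already QuantLinear layers and that the repository root is on sys.path. The helper name fuse_and_warm is hypothetical.

import torch
from quant.fused_mlp import make_fused_mlp, autotune_warmup_fused

def fuse_and_warm(model: torch.nn.Module) -> torch.nn.Module:
    # Swap every LlamaMLP for a QuantLlamaMLP that calls fusedmatmul_248_kernel.
    model = make_fused_mlp(model)
    model = model.cuda()
    # Pre-tune the Triton autotune cache for every unique (infeatures, intermediate_size) pair.
    autotune_warmup_fused(model)
    return model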
GPTQ-for-Qwen_hf/quant/quant_linear.py ADDED
@@ -0,0 +1,423 @@
1
+ import math
2
+ import numpy as np
3
+ import torch
4
+ import torch.nn as nn
5
+ from torch.cuda.amp import custom_bwd, custom_fwd
6
+
7
+ try:
8
+ import triton
9
+ import triton.language as tl
10
+ from . import custom_autotune
11
+
12
+ # code based https://github.com/fpgaminer/GPTQ-triton
13
+ @custom_autotune.autotune(
14
+ configs=[
15
+ triton.Config({
16
+ 'BLOCK_SIZE_M': 64,
17
+ 'BLOCK_SIZE_N': 256,
18
+ 'BLOCK_SIZE_K': 32,
19
+ 'GROUP_SIZE_M': 8
20
+ }, num_stages=4, num_warps=4),
21
+ triton.Config({
22
+ 'BLOCK_SIZE_M': 128,
23
+ 'BLOCK_SIZE_N': 128,
24
+ 'BLOCK_SIZE_K': 32,
25
+ 'GROUP_SIZE_M': 8
26
+ }, num_stages=4, num_warps=4),
27
+ triton.Config({
28
+ 'BLOCK_SIZE_M': 64,
29
+ 'BLOCK_SIZE_N': 128,
30
+ 'BLOCK_SIZE_K': 32,
31
+ 'GROUP_SIZE_M': 8
32
+ }, num_stages=4, num_warps=4),
33
+ triton.Config({
34
+ 'BLOCK_SIZE_M': 128,
35
+ 'BLOCK_SIZE_N': 32,
36
+ 'BLOCK_SIZE_K': 32,
37
+ 'GROUP_SIZE_M': 8
38
+ }, num_stages=4, num_warps=4),
39
+ triton.Config({
40
+ 'BLOCK_SIZE_M': 64,
41
+ 'BLOCK_SIZE_N': 64,
42
+ 'BLOCK_SIZE_K': 32,
43
+ 'GROUP_SIZE_M': 8
44
+ }, num_stages=4, num_warps=4),
45
+ triton.Config({
46
+ 'BLOCK_SIZE_M': 64,
47
+ 'BLOCK_SIZE_N': 128,
48
+ 'BLOCK_SIZE_K': 32,
49
+ 'GROUP_SIZE_M': 8
50
+ }, num_stages=2, num_warps=8),
51
+ triton.Config({
52
+ 'BLOCK_SIZE_M': 64,
53
+ 'BLOCK_SIZE_N': 64,
54
+ 'BLOCK_SIZE_K': 64,
55
+ 'GROUP_SIZE_M': 8
56
+ }, num_stages=3, num_warps=8),
57
+ triton.Config({
58
+ 'BLOCK_SIZE_M': 32,
59
+ 'BLOCK_SIZE_N': 32,
60
+ 'BLOCK_SIZE_K': 128,
61
+ 'GROUP_SIZE_M': 8
62
+ }, num_stages=2, num_warps=4),
63
+ ],
64
+ key=['M', 'N', 'K'],
65
+ nearest_power_of_two=True,
66
+ prune_configs_by={
67
+ 'early_config_prune': custom_autotune.matmul248_kernel_config_pruner,
68
+ 'perf_model': None,
69
+ 'top_k': None,
70
+ },
71
+ )
72
+ @triton.jit
73
+ def matmul_248_kernel(a_ptr, b_ptr, c_ptr, scales_ptr, zeros_ptr, g_ptr, M, N, K, bits, maxq, stride_am, stride_ak, stride_bk, stride_bn, stride_cm, stride_cn, stride_scales, stride_zeros,
74
+ BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr, GROUP_SIZE_M: tl.constexpr):
75
+ """
76
+ Compute the matrix multiplication C = A x B.
77
+ A is of shape (M, K) float16
78
+ B is of shape (K // (32 // bits), N) int32, bit-packed
79
+ C is of shape (M, N) float16
80
+ scales is of shape (G, N) float16
81
+ zeros is of shape (G, N // (32 // bits)) int32, bit-packed
82
+ g_ptr is of shape (K) int32
83
+ """
84
+ infearure_per_bits = 32 // bits
85
+
86
+ pid = tl.program_id(axis=0)
87
+ num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
88
+ num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
89
+ num_pid_k = tl.cdiv(K, BLOCK_SIZE_K)
90
+ num_pid_in_group = GROUP_SIZE_M * num_pid_n
91
+ group_id = pid // num_pid_in_group
92
+ first_pid_m = group_id * GROUP_SIZE_M
93
+ group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
94
+ pid_m = first_pid_m + (pid % group_size_m)
95
+ pid_n = (pid % num_pid_in_group) // group_size_m
96
+
97
+ offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
98
+ offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
99
+ offs_k = tl.arange(0, BLOCK_SIZE_K)
100
+ a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak) # (BLOCK_SIZE_M, BLOCK_SIZE_K)
101
+ a_mask = (offs_am[:, None] < M)
102
+ # b_ptrs is set up such that it repeats elements along the K axis 8 times
103
+ b_ptrs = b_ptr + ((offs_k[:, None] // infearure_per_bits) * stride_bk + offs_bn[None, :] * stride_bn) # (BLOCK_SIZE_K, BLOCK_SIZE_N)
104
+ g_ptrs = g_ptr + offs_k
105
+ # shifter is used to extract the N bits of each element in the 32-bit word from B
106
+ scales_ptrs = scales_ptr + offs_bn[None, :]
107
+ zeros_ptrs = zeros_ptr + (offs_bn[None, :] // infearure_per_bits)
108
+
109
+ shifter = (offs_k % infearure_per_bits) * bits
110
+ zeros_shifter = (offs_bn % infearure_per_bits) * bits
111
+ accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
112
+
113
+ for k in range(0, num_pid_k):
114
+ g_idx = tl.load(g_ptrs)
115
+
116
+ # Fetch scales and zeros; these are per-outfeature and thus reused in the inner loop
117
+ scales = tl.load(scales_ptrs + g_idx[:, None] * stride_scales) # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
118
+ zeros = tl.load(zeros_ptrs + g_idx[:, None] * stride_zeros) # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
119
+
120
+ zeros = (zeros >> zeros_shifter[None, :]) & maxq
121
+ zeros = (zeros + 1)
122
+
123
+ a = tl.load(a_ptrs, mask=a_mask, other=0.) # (BLOCK_SIZE_M, BLOCK_SIZE_K)
124
+ b = tl.load(b_ptrs) # (BLOCK_SIZE_K, BLOCK_SIZE_N), but repeated
125
+
126
+ # Now we need to unpack b (which is N-bit values) into 32-bit values
127
+ b = (b >> shifter[:, None]) & maxq # Extract the N-bit values
128
+ b = (b - zeros) * scales # Scale and shift
129
+
130
+ accumulator += tl.dot(a, b)
131
+ a_ptrs += BLOCK_SIZE_K
132
+ b_ptrs += (BLOCK_SIZE_K // infearure_per_bits) * stride_bk
133
+ g_ptrs += BLOCK_SIZE_K
134
+
135
+ c_ptrs = c_ptr + stride_cm * offs_am[:, None] + stride_cn * offs_bn[None, :]
136
+ c_mask = (offs_am[:, None] < M) & (offs_bn[None, :] < N)
137
+ tl.store(c_ptrs, accumulator, mask=c_mask)
138
+
139
+ @custom_autotune.autotune(configs=[
140
+ triton.Config({
141
+ 'BLOCK_SIZE_M': 64,
142
+ 'BLOCK_SIZE_N': 32,
143
+ 'BLOCK_SIZE_K': 256,
144
+ 'GROUP_SIZE_M': 8
145
+ }, num_stages=4, num_warps=4),
146
+ triton.Config({
147
+ 'BLOCK_SIZE_M': 128,
148
+ 'BLOCK_SIZE_N': 32,
149
+ 'BLOCK_SIZE_K': 128,
150
+ 'GROUP_SIZE_M': 8
151
+ }, num_stages=4, num_warps=4),
152
+ triton.Config({
153
+ 'BLOCK_SIZE_M': 64,
154
+ 'BLOCK_SIZE_N': 32,
155
+ 'BLOCK_SIZE_K': 128,
156
+ 'GROUP_SIZE_M': 8
157
+ }, num_stages=4, num_warps=4),
158
+ triton.Config({
159
+ 'BLOCK_SIZE_M': 128,
160
+ 'BLOCK_SIZE_N': 32,
161
+ 'BLOCK_SIZE_K': 32,
162
+ 'GROUP_SIZE_M': 8
163
+ }, num_stages=4, num_warps=4),
164
+ triton.Config({
165
+ 'BLOCK_SIZE_M': 64,
166
+ 'BLOCK_SIZE_N': 32,
167
+ 'BLOCK_SIZE_K': 64,
168
+ 'GROUP_SIZE_M': 8
169
+ }, num_stages=4, num_warps=4),
170
+ triton.Config({
171
+ 'BLOCK_SIZE_M': 64,
172
+ 'BLOCK_SIZE_N': 32,
173
+ 'BLOCK_SIZE_K': 128,
174
+ 'GROUP_SIZE_M': 8
175
+ }, num_stages=2, num_warps=8),
176
+ triton.Config({
177
+ 'BLOCK_SIZE_M': 64,
178
+ 'BLOCK_SIZE_N': 64,
179
+ 'BLOCK_SIZE_K': 64,
180
+ 'GROUP_SIZE_M': 8
181
+ }, num_stages=3, num_warps=8),
182
+ triton.Config({
183
+ 'BLOCK_SIZE_M': 32,
184
+ 'BLOCK_SIZE_N': 128,
185
+ 'BLOCK_SIZE_K': 32,
186
+ 'GROUP_SIZE_M': 8
187
+ }, num_stages=2, num_warps=4),
188
+ ],
189
+ key=['M', 'N', 'K'],
190
+ nearest_power_of_two=True)
191
+ @triton.jit
192
+ def transpose_matmul_248_kernel(a_ptr, b_ptr, c_ptr, scales_ptr, zeros_ptr, g_ptr, M, N, K, bits, maxq, stride_am, stride_ak, stride_bk, stride_bn, stride_cm, stride_cn, stride_scales,
193
+ stride_zeros, BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr, GROUP_SIZE_M: tl.constexpr):
194
+ """
195
+ Compute the matrix multiplication C = A x B^T, where B is dequantized on the fly.
196
+ A is of shape (M, N) float16
197
+ B is of shape (K // (32 // bits), N) int32, bit-packed
198
+ C is of shape (M, K) float16
199
+ scales is of shape (G, N) float16
200
+ zeros is of shape (G, N // (32 // bits)) int32, bit-packed
201
+ g_ptr is of shape (K) int32
202
+ """
203
+ infearure_per_bits = 32 // bits
204
+
205
+ pid = tl.program_id(axis=0)
206
+ num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
207
+ num_pid_k = tl.cdiv(K, BLOCK_SIZE_K)
208
+ num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
209
+ num_pid_in_group = GROUP_SIZE_M * num_pid_k
210
+ group_id = pid // num_pid_in_group
211
+ first_pid_m = group_id * GROUP_SIZE_M
212
+ group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
213
+ pid_m = first_pid_m + (pid % group_size_m)
214
+ pid_k = (pid % num_pid_in_group) // group_size_m
215
+
216
+ offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
217
+ offs_bk = pid_k * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K)
218
+ offs_n = tl.arange(0, BLOCK_SIZE_N)
219
+ a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_n[None, :] * stride_ak) # (BLOCK_SIZE_M, BLOCK_SIZE_N)
220
+ a_mask = (offs_am[:, None] < M)
221
+ # b_ptrs is set up such that it repeats elements along the K axis 8 times
222
+ b_ptrs = b_ptr + ((offs_bk[:, None] // infearure_per_bits) * stride_bk + offs_n[None, :] * stride_bn) # (BLOCK_SIZE_K, BLOCK_SIZE_N)
223
+ g_ptrs = g_ptr + offs_bk
224
+ g_idx = tl.load(g_ptrs)
225
+
226
+ # shifter is used to extract the N bits of each element in the 32-bit word from B
227
+ scales_ptrs = scales_ptr + offs_n[None, :] + g_idx[:, None] * stride_scales
228
+ zeros_ptrs = zeros_ptr + (offs_n[None, :] // infearure_per_bits) + g_idx[:, None] * stride_zeros
229
+
230
+ shifter = (offs_bk % infearure_per_bits) * bits
231
+ zeros_shifter = (offs_n % infearure_per_bits) * bits
232
+ accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_K), dtype=tl.float32)
233
+
234
+ for n in range(0, num_pid_n):
235
+ # Fetch scales and zeros; these are per-outfeature and thus reused in the inner loop
236
+ scales = tl.load(scales_ptrs) # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
237
+ zeros = tl.load(zeros_ptrs) # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
238
+
239
+ zeros = (zeros >> zeros_shifter[None, :]) & maxq
240
+ zeros = (zeros + 1)
241
+
242
+ a = tl.load(a_ptrs, mask=a_mask, other=0.) # (BLOCK_SIZE_M, BLOCK_SIZE_N)
243
+ b = tl.load(b_ptrs) # (BLOCK_SIZE_K, BLOCK_SIZE_N), but repeated
244
+
245
+ # Now we need to unpack b (which is N-bit values) into 32-bit values
246
+ b = (b >> shifter[:, None]) & maxq # Extract the N-bit values
247
+ b = (b - zeros) * scales # Scale and shift
248
+ b = tl.trans(b)
249
+
250
+ accumulator += tl.dot(a, b)
251
+ a_ptrs += BLOCK_SIZE_N
252
+ b_ptrs += BLOCK_SIZE_N
253
+ scales_ptrs += BLOCK_SIZE_N
254
+ zeros_ptrs += (BLOCK_SIZE_N // infearure_per_bits)
255
+
256
+ c_ptrs = c_ptr + stride_cm * offs_am[:, None] + stride_cn * offs_bk[None, :]
257
+ c_mask = (offs_am[:, None] < M) & (offs_bk[None, :] < K)
258
+ tl.store(c_ptrs, accumulator, mask=c_mask)
259
+ except Exception:
260
+ print('triton not installed.')
261
+
262
+
263
+ def matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq):
264
+ with torch.cuda.device(input.device):
265
+ output = torch.empty((input.shape[0], qweight.shape[1]), device=input.device, dtype=torch.float16)
266
+ grid = lambda META: (triton.cdiv(input.shape[0], META['BLOCK_SIZE_M']) * triton.cdiv(qweight.shape[1], META['BLOCK_SIZE_N']), )
267
+ matmul_248_kernel[grid](input, qweight, output, scales, qzeros, g_idx, input.shape[0], qweight.shape[1], input.shape[1], bits, maxq, input.stride(0), input.stride(1), qweight.stride(0),
268
+ qweight.stride(1), output.stride(0), output.stride(1), scales.stride(0), qzeros.stride(0))
269
+ return output
270
+
271
+
272
+ def transpose_matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq):
273
+ with torch.cuda.device(input.device):
274
+ output_dim = (qweight.shape[0] * 32) // bits
275
+ output = torch.empty((input.shape[0], output_dim), device=input.device, dtype=torch.float16)
276
+ grid = lambda META: (triton.cdiv(input.shape[0], META['BLOCK_SIZE_M']) * triton.cdiv(output_dim, META['BLOCK_SIZE_K']), )
277
+ transpose_matmul_248_kernel[grid](input, qweight, output, scales, qzeros, g_idx, input.shape[0], qweight.shape[1], output_dim, bits, maxq, input.stride(0), input.stride(1), qweight.stride(0),
278
+ qweight.stride(1), output.stride(0), output.stride(1), scales.stride(0), qzeros.stride(0))
279
+ return output
280
+
281
+
282
+ class QuantLinearFunction(torch.autograd.Function):
283
+
284
+ @staticmethod
285
+ @custom_fwd(cast_inputs=torch.float16)
286
+ def forward(ctx, input, qweight, scales, qzeros, g_idx, bits, maxq):
287
+ output = matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq)
288
+ ctx.save_for_backward(qweight, scales, qzeros, g_idx)
289
+ ctx.bits, ctx.maxq = bits, maxq
290
+ return output
291
+
292
+ @staticmethod
293
+ @custom_bwd
294
+ def backward(ctx, grad_output):
295
+ qweight, scales, qzeros, g_idx = ctx.saved_tensors
296
+ bits, maxq = ctx.bits, ctx.maxq
297
+ grad_input = None
298
+
299
+ if ctx.needs_input_grad[0]:
300
+ grad_input = transpose_matmul248(grad_output, qweight, scales, qzeros, g_idx, bits, maxq)
301
+ return grad_input, None, None, None, None, None, None
302
+
303
+
304
+ class QuantLinear(nn.Module):
305
+
306
+ def __init__(self, bits, groupsize, infeatures, outfeatures, bias):
307
+ super().__init__()
308
+ if bits not in [2, 4, 8]:
309
+ raise NotImplementedError("Only 2,4,8 bits are supported.")
310
+ self.infeatures = infeatures
311
+ self.outfeatures = outfeatures
312
+ self.bits = bits
313
+ self.maxq = 2**self.bits - 1
314
+ self.groupsize = groupsize if groupsize != -1 else infeatures
315
+
316
+ self.register_buffer('qweight', torch.zeros((infeatures // 32 * self.bits, outfeatures), dtype=torch.int32))
317
+ self.register_buffer('qzeros', torch.zeros((math.ceil(infeatures / self.groupsize), outfeatures // 32 * self.bits), dtype=torch.int32))
318
+ self.register_buffer('scales', torch.zeros((math.ceil(infeatures / self.groupsize), outfeatures), dtype=torch.float16))
319
+ self.register_buffer('g_idx', torch.tensor([i // self.groupsize for i in range(infeatures)], dtype=torch.int32))
320
+ if bias:
321
+ self.register_buffer('bias', torch.zeros((outfeatures), dtype=torch.float16))
322
+ else:
323
+ self.bias = None
324
+
325
+ def pack(self, linear, scales, zeros, g_idx=None):
326
+ self.g_idx = g_idx.clone() if g_idx is not None else self.g_idx
327
+
328
+ scales = scales.t().contiguous()
329
+ zeros = zeros.t().contiguous()
330
+ scale_zeros = zeros * scales
331
+ self.scales = scales.clone().half()
332
+ if linear.bias is not None:
333
+ self.bias = linear.bias.clone().half()
334
+
335
+ intweight = []
336
+ for idx in range(self.infeatures):
337
+ intweight.append(torch.round((linear.weight.data[:, idx] + scale_zeros[self.g_idx[idx]]) / self.scales[self.g_idx[idx]]).to(torch.int)[:, None])
338
+ intweight = torch.cat(intweight, dim=1)
339
+ intweight = intweight.t().contiguous()
340
+ intweight = intweight.numpy().astype(np.uint32)
341
+ qweight = np.zeros((intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32)
342
+ i = 0
343
+ row = 0
344
+ while row < qweight.shape[0]:
345
+ if self.bits in [2, 4, 8]:
346
+ for j in range(i, i + (32 // self.bits)):
347
+ qweight[row] |= intweight[j] << (self.bits * (j - i))
348
+ i += 32 // self.bits
349
+ row += 1
350
+ else:
351
+ raise NotImplementedError("Only 2,4,8 bits are supported.")
352
+
353
+ qweight = qweight.astype(np.int32)
354
+ self.qweight = torch.from_numpy(qweight)
355
+
356
+ zeros -= 1
357
+ zeros = zeros.numpy().astype(np.uint32)
358
+ qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32)
359
+ i = 0
360
+ col = 0
361
+ while col < qzeros.shape[1]:
362
+ if self.bits in [2, 4, 8]:
363
+ for j in range(i, i + (32 // self.bits)):
364
+ qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i))
365
+ i += 32 // self.bits
366
+ col += 1
367
+ else:
368
+ raise NotImplementedError("Only 2,4,8 bits are supported.")
369
+
370
+ qzeros = qzeros.astype(np.int32)
371
+ self.qzeros = torch.from_numpy(qzeros)
372
+
373
+ def forward(self, x):
374
+ out_shape = x.shape[:-1] + (self.outfeatures, )
375
+ out = QuantLinearFunction.apply(x.reshape(-1, x.shape[-1]), self.qweight, self.scales, self.qzeros, self.g_idx, self.bits, self.maxq)
376
+ out = out + self.bias if self.bias is not None else out
377
+ return out.reshape(out_shape)
378
+
379
+
380
+ def make_quant_linear(module, names, bits, groupsize, name=''):
381
+ if isinstance(module, QuantLinear):
382
+ return
383
+ for attr in dir(module):
384
+ tmp = getattr(module, attr)
385
+ name1 = name + '.' + attr if name != '' else attr
386
+ if name1 in names:
387
+ delattr(module, attr)
388
+ setattr(module, attr, QuantLinear(bits, groupsize, tmp.in_features, tmp.out_features, tmp.bias is not None))
389
+ for name1, child in module.named_children():
390
+ make_quant_linear(child, names, bits, groupsize, name + '.' + name1 if name != '' else name1)
391
+
392
+
393
+ def autotune_warmup_linear(model, transpose=False):
394
+ """
395
+ Pre-tunes the quantized kernel
396
+ """
397
+ from tqdm import tqdm
398
+
399
+ kn_values = {}
400
+
401
+ for _, m in model.named_modules():
402
+ if not isinstance(m, QuantLinear):
403
+ continue
404
+
405
+ k = m.infeatures
406
+ n = m.outfeatures
407
+
408
+ if (k, n) not in kn_values:
409
+ kn_values[(k, n)] = (m.qweight.cuda(), m.scales.cuda(), m.qzeros.cuda(), m.g_idx.cuda(), m.bits, m.maxq)
410
+
411
+ print(f'Found {len(kn_values)} unique KN Linear values.')
412
+
413
+ print('Warming up autotune cache ...')
414
+ with torch.no_grad():
415
+ for m in tqdm(range(0, 12)):
416
+ m = 2**m # [1, 2048]
417
+ for (k, n), (qweight, scales, qzeros, g_idx, bits, maxq) in kn_values.items():
418
+ a = torch.randn(m, k, dtype=torch.float16, device='cuda')
419
+ matmul248(a, qweight, scales, qzeros, g_idx, bits, maxq)
420
+ if transpose:
421
+ a = torch.randn(m, n, dtype=torch.float16, device='cuda')
422
+ transpose_matmul248(a, qweight, scales, qzeros, g_idx, bits, maxq)
423
+ del kn_values
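
A small usage sketch (not part of the upload) showing how a plain nn.Linear could be packed into a QuantLinear and run through the Triton kernel. It assumes the repository root is on sys.path, a CUDA device with Triton installed, and symmetric per-channel parameters produced by the Quantizer from quant/quantizer.py; as in the GPTQ flow, the weights are fake-quantized before pack() so that the integer codes round back exactly.

import torch
import torch.nn as nn
from quant.quant_linear import QuantLinear
from quant.quantizer import Quantizer

linear = nn.Linear(128, 256, bias=False)

quantizer = Quantizer()
quantizer.configure(bits=4, perchannel=True, sym=True, mse=False)
quantizer.find_params(linear.weight.data, weight=True)        # per-output-channel scale/zero
linear.weight.data = quantizer.quantize(linear.weight.data)   # fake-quantize first, as the GPTQ loop does before pack()

qlinear = QuantLinear(4, -1, 128, 256, bias=False)            # groupsize=-1 -> one group per row
qlinear.pack(linear, quantizer.scale, quantizer.zero)         # packs the int4 codes into int32 words

x = torch.randn(1, 128, dtype=torch.float16, device='cuda')
with torch.no_grad():
    y = qlinear.cuda()(x)                                     # dispatches matmul_248_kernel
print(y.shape)                                                # torch.Size([1, 256])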
GPTQ-for-Qwen_hf/quant/quantizer.py ADDED
@@ -0,0 +1,127 @@
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn as nn
4
+ import math
5
+
6
+
7
+ class Quantizer(nn.Module):
8
+
9
+ def __init__(self, shape=1):
10
+ super(Quantizer, self).__init__()
11
+ self.register_buffer('maxq', torch.tensor(0))
12
+ self.register_buffer('scale', torch.zeros(shape))
13
+ self.register_buffer('zero', torch.zeros(shape))
14
+
15
+ def configure(self, bits, perchannel=False, sym=True, mse=False, norm=2.4, grid=100, maxshrink=.8, trits=False):
16
+
17
+ self.maxq = torch.tensor(2**bits - 1)
18
+ self.perchannel = perchannel
19
+ self.sym = sym
20
+ self.mse = mse
21
+ self.norm = norm
22
+ self.grid = grid
23
+ self.maxshrink = maxshrink
24
+ if trits:
25
+ self.maxq = torch.tensor(-1)
26
+ self.scale = torch.zeros_like(self.scale)
27
+
28
+ def _quantize(self, x, scale, zero, maxq):
29
+ if maxq < 0:
30
+ return (x > scale / 2).float() * scale + (x < zero / 2).float() * zero
31
+ q = torch.clamp(torch.round(x / scale) + zero, 0, maxq)
32
+ return scale * (q - zero)
33
+
34
+ def find_params(self, x, weight=False):
35
+ dev = x.device
36
+ self.maxq = self.maxq.to(dev)
37
+
38
+ shape = x.shape
39
+ if self.perchannel:
40
+ if weight:
41
+ x = x.flatten(1)
42
+ else:
43
+ if len(shape) == 4:
44
+ x = x.permute([1, 0, 2, 3])
45
+ x = x.flatten(1)
46
+ if len(shape) == 3:
47
+ x = x.reshape((-1, shape[-1])).t()
48
+ if len(shape) == 2:
49
+ x = x.t()
50
+ else:
51
+ x = x.flatten().unsqueeze(0)
52
+
53
+ tmp = torch.zeros(x.shape[0], device=dev)
54
+ xmin = torch.minimum(x.min(1)[0], tmp)
55
+ xmax = torch.maximum(x.max(1)[0], tmp)
56
+
57
+ if self.sym:
58
+ xmax = torch.maximum(torch.abs(xmin), xmax)
59
+ tmp = xmin < 0
60
+ if torch.any(tmp):
61
+ xmin[tmp] = -xmax[tmp]
62
+ tmp = (xmin == 0) & (xmax == 0)
63
+ xmin[tmp] = -1
64
+ xmax[tmp] = +1
65
+
66
+ if self.maxq < 0:
67
+ self.scale = xmax
68
+ self.zero = xmin
69
+ else:
70
+ self.scale = (xmax - xmin) / self.maxq
71
+ if self.sym:
72
+ self.zero = torch.full_like(self.scale, (self.maxq + 1) / 2)
73
+ else:
74
+ self.zero = torch.round(-xmin / self.scale)
75
+
76
+ if self.mse:
77
+ best = torch.full([x.shape[0]], float('inf'), device=dev)
78
+ for i in range(int(self.maxshrink * self.grid)):
79
+ p = 1 - i / self.grid
80
+ xmin1 = p * xmin
81
+ xmax1 = p * xmax
82
+ scale1 = (xmax1 - xmin1) / self.maxq
83
+ zero1 = torch.round(-xmin1 / scale1) if not self.sym else self.zero
84
+ q = self._quantize(x, scale1.unsqueeze(1), zero1.unsqueeze(1), self.maxq)
85
+ q -= x
86
+ q.abs_()
87
+ q.pow_(self.norm)
88
+ err = torch.sum(q, 1)
89
+ tmp = err < best
90
+ if torch.any(tmp):
91
+ best[tmp] = err[tmp]
92
+ self.scale[tmp] = scale1[tmp]
93
+ self.zero[tmp] = zero1[tmp]
94
+ if not self.perchannel:
95
+ if weight:
96
+ tmp = shape[0]
97
+ else:
98
+ tmp = shape[1] if len(shape) != 3 else shape[2]
99
+ self.scale = self.scale.repeat(tmp)
100
+ self.zero = self.zero.repeat(tmp)
101
+
102
+ if weight:
103
+ shape = [-1] + [1] * (len(shape) - 1)
104
+ self.scale = self.scale.reshape(shape)
105
+ self.zero = self.zero.reshape(shape)
106
+ return
107
+ if len(shape) == 4:
108
+ self.scale = self.scale.reshape((1, -1, 1, 1))
109
+ self.zero = self.zero.reshape((1, -1, 1, 1))
110
+ if len(shape) == 3:
111
+ self.scale = self.scale.reshape((1, 1, -1))
112
+ self.zero = self.zero.reshape((1, 1, -1))
113
+ if len(shape) == 2:
114
+ self.scale = self.scale.unsqueeze(0)
115
+ self.zero = self.zero.unsqueeze(0)
116
+
117
+ def quantize(self, x):
118
+ if self.ready():
119
+ return self._quantize(x, self.scale, self.zero, self.maxq)
120
+
121
+ return x
122
+
123
+ def enabled(self):
124
+ return self.maxq > 0
125
+
126
+ def ready(self):
127
+ return torch.all(self.scale != 0)
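
For reference, a tiny round-trip sketch of the scheme implemented above, i.e. q = clamp(round(x / scale) + zero, 0, maxq) and x_hat = scale * (q - zero); it assumes the repository root is on sys.path and uses toy shapes.

import torch
from quant.quantizer import Quantizer

w = torch.randn(256, 128)                 # toy weight matrix (out_features x in_features)

q = Quantizer()
q.configure(bits=4, perchannel=True, sym=True, mse=False)
q.find_params(w, weight=True)             # one scale/zero per output channel

w_hat = q.quantize(w)                     # fake-quantized copy of w
print((w - w_hat).abs().max().item())     # worst-case rounding error, about half a step of the largest per-channel scale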
GPTQ-for-Qwen_hf/quant/triton_norm.py ADDED
@@ -0,0 +1,92 @@
1
+ import torch
2
+ from torch import nn
3
+ import triton
4
+ import triton.language as tl
5
+ from transformers.models.llama.modeling_llama import LlamaRMSNorm
6
+
7
+ @triton.jit
8
+ def rms_norm_fwd_fused(
9
+ X, # pointer to the input
10
+ Y, # pointer to the output
11
+ W, # pointer to the weights
12
+ stride, # how much to increase the pointer when moving by 1 row
13
+ N, # number of columns in X
14
+ eps, # epsilon to avoid division by zero
15
+ BLOCK_SIZE: tl.constexpr,
16
+ ):
17
+ # Map the program id to the row of X and Y it should compute.
18
+ row = tl.program_id(0)
19
+ Y += row * stride
20
+ X += row * stride
21
+ # Compute variance
22
+ _var = tl.zeros([BLOCK_SIZE], dtype=tl.float32)
23
+ for off in range(0, N, BLOCK_SIZE):
24
+ cols = off + tl.arange(0, BLOCK_SIZE)
25
+ x = tl.load(X + cols, mask=cols < N, other=0.).to(tl.float32)
26
+ x = tl.where(cols < N, x, 0.)
27
+ _var += x * x
28
+ var = tl.sum(_var, axis=0) / N
29
+ rstd = 1 / tl.sqrt(var + eps)
30
+ # Normalize and apply linear transformation
31
+ for off in range(0, N, BLOCK_SIZE):
32
+ cols = off + tl.arange(0, BLOCK_SIZE)
33
+ mask = cols < N
34
+ w = tl.load(W + cols, mask=mask)
35
+ x = tl.load(X + cols, mask=mask, other=0.).to(tl.float32)
36
+ x_hat = x * rstd
37
+ y = x_hat * w
38
+ # Write output
39
+ tl.store(Y + cols, y, mask=mask)
40
+
41
+ class TritonLlamaRMSNorm(nn.Module):
42
+ def __init__(self, weight, eps=1e-6):
43
+ """
44
+ LlamaRMSNorm is equivalent to T5LayerNorm
45
+ """
46
+ super().__init__()
47
+ self.weight = weight
48
+ self.variance_epsilon = eps
49
+
50
+ def forward(self, x):
51
+ with torch.cuda.device(x.device):
52
+ y = torch.empty_like(x)
53
+ # reshape input data into 2D tensor
54
+ x_arg = x.reshape(-1, x.shape[-1])
55
+ M, N = x_arg.shape
56
+ # Less than 64KB per feature: enqueue fused kernel
57
+ MAX_FUSED_SIZE = 65536 // x.element_size()
58
+ BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
59
+ if N > BLOCK_SIZE:
60
+ raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
61
+ # heuristics for number of warps
62
+ num_warps = min(max(BLOCK_SIZE // 256, 1), 8)
63
+ # enqueue kernel
64
+ rms_norm_fwd_fused[(M,)](x_arg, y, self.weight,
65
+ x_arg.stride(0), N, self.variance_epsilon,
66
+ BLOCK_SIZE=BLOCK_SIZE, num_warps=num_warps)
67
+ return y
68
+
69
+
70
+ def make_quant_norm(model):
71
+ """
72
+ Replace all LlamaRMSNorm modules with TritonLlamaRMSNorm modules
73
+ """
74
+
75
+ for name, m in model.named_modules():
76
+ if not isinstance(m, LlamaRMSNorm):
77
+ continue
78
+
79
+ norm = TritonLlamaRMSNorm(m.weight, m.variance_epsilon)
80
+
81
+ if '.' in name:
82
+ parent_name = name.rsplit('.', 1)[0]
83
+ child_name = name[len(parent_name) + 1:]
84
+ parent = model.get_submodule(parent_name)
85
+ else:
86
+ parent_name = ''
87
+ parent = model
88
+ child_name = name
89
+
90
+ #print(f"Replacing {name} with quant_attn; parent: {parent_name}, child's name: {child_name}")
91
+
92
+ setattr(parent, child_name, norm)
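
A quick parity-check sketch for the fused RMSNorm against the Hugging Face reference module (assumes a CUDA device with Triton installed and the repository root on sys.path).

import torch
from transformers.models.llama.modeling_llama import LlamaRMSNorm
from quant.triton_norm import TritonLlamaRMSNorm

hidden = 1024
ref = LlamaRMSNorm(hidden, eps=1e-6).cuda().half()
fused = TritonLlamaRMSNorm(ref.weight, eps=1e-6)

x = torch.randn(4, 16, hidden, dtype=torch.float16, device='cuda')
with torch.no_grad():
    print((ref(x) - fused(x)).abs().max().item())   # should be on the order of fp16 rounding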
GPTQ-for-Qwen_hf/qwen.py ADDED
@@ -0,0 +1,292 @@
1
+ import argparse
2
+ import time
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn as nn
6
+ import quant
7
+ import sys
8
+
9
+ from utils import find_layers, DEV, set_seed, get_wikitext2, get_ptb, get_c4, get_ptb_new, get_c4_new, get_loaders, export_quant_table, gen_conditions
10
+ from texttable import Texttable
11
+
12
+ import tqdm
13
+
14
+ class Evaluator:
15
+ def __init__(self, dataset, tokenizer, device, n_samples=40):
16
+ self.dataset = dataset
17
+ self.tokenizer = tokenizer
18
+ self.device = device
19
+
20
+ self.dataset = tokenizer(
21
+ "\n\n".join(dataset["text"]), return_tensors="pt"
22
+ ).input_ids.to(device)
23
+
24
+ self.n_samples = n_samples
25
+
26
+ @torch.no_grad()
27
+ def evaluate(self, model):
28
+ model.eval()
29
+ nlls = []
30
+ for i in tqdm.tqdm(range(self.n_samples), desc="Evaluating..."):
31
+ batch = self.dataset[:, (i * 2048) : ((i + 1) * 2048)].to(model.device)
32
+ with torch.no_grad():
33
+ lm_logits = model(batch).logits
34
+ shift_logits = lm_logits[:, :-1, :].contiguous().float()
35
+ shift_labels = self.dataset[:, (i * 2048) : ((i + 1) * 2048)][:, 1:]
36
+ loss_fct = nn.CrossEntropyLoss()
37
+ loss = loss_fct(
38
+ shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
39
+ )
40
+ neg_log_likelihood = loss.float() * 2048
41
+ nlls.append(neg_log_likelihood)
42
+
43
+ return torch.exp(torch.stack(nlls).sum() / (self.n_samples * 2048))
44
+
45
+
46
+ def get_qwen(model):
47
+
48
+ def skip(*args, **kwargs):
49
+ pass
50
+
51
+ torch.nn.init.kaiming_uniform_ = skip
52
+ torch.nn.init.uniform_ = skip
53
+ torch.nn.init.normal_ = skip
54
+ from transformers import Qwen3ForCausalLM
55
+ model = Qwen3ForCausalLM.from_pretrained(model, torch_dtype=torch.bfloat16,device_map="auto")
56
+ model.seqlen = 2048
57
+ return model
58
+
59
+
60
+
61
+ def load_quant(model, checkpoint, wbits, groupsize=-1, fused_mlp=True, eval=True, warmup_autotune=True):
62
+ from transformers import Qwen3Config, Qwen3ForCausalLM, modeling_utils
63
+ config = Qwen3Config.from_pretrained(model)
64
+ # model = Qwen3Config.from_pretrained(model, torch_dtype=torch.bfloat16, device_map="auto")
65
+
66
+ def noop(*args, **kwargs):
67
+ pass
68
+
69
+ torch.nn.init.kaiming_uniform_ = noop
70
+ torch.nn.init.uniform_ = noop
71
+ torch.nn.init.normal_ = noop
72
+
73
+ torch.set_default_dtype(torch.half)
74
+ modeling_utils._init_weights = False
76
+ model = Qwen3ForCausalLM(config)
77
+
78
+ torch.set_default_dtype(torch.float)
79
+ if eval:
80
+ model = model.eval()
81
+ layers = find_layers(model)
82
+ for name in ['lm_head']:
83
+ if name in layers:
84
+ del layers[name]
85
+ quant.make_quant_linear(model, layers, wbits, groupsize)
86
+
87
+ del layers
88
+
89
+ print('Loading model ...')
90
+ if checkpoint.endswith('.safetensors'):
91
+ from safetensors.torch import load_file as safe_load
92
+ model.load_state_dict(safe_load(checkpoint))
93
+ else:
94
+ model.load_state_dict(torch.load(checkpoint))
95
+
96
+ if eval:
97
+ quant.make_quant_attn(model)
98
+ quant.make_quant_norm(model)
99
+ if fused_mlp:
100
+ quant.make_fused_mlp(model)
101
+
102
+ if warmup_autotune:
103
+ quant.autotune_warmup_linear(model, transpose=not (eval))
104
+ if eval and fused_mlp:
105
+ quant.autotune_warmup_fused(model)
106
+ model.seqlen = 2048
107
+ print('Done.')
108
+
109
+ return model
110
+
111
+
112
+ def Qwen_multigpu(model, gpus, gpu_dist):
113
+ model.model.embed_tokens = model.model.embed_tokens.to(gpus[0])
114
+ if hasattr(model.model, 'norm') and model.model.norm:
115
+ model.model.norm = model.model.norm.to(gpus[0])
116
+ import copy
117
+ model.lm_head = copy.deepcopy(model.lm_head).to(gpus[0])
118
+
119
+ cache = {'mask': None, 'position_ids': None}
120
+
121
+ class MoveModule(nn.Module):
122
+
123
+ def __init__(self, module, invalidate_cache):
124
+ super().__init__()
125
+ self.module = module
126
+ self.dev = next(iter(self.module.parameters())).device
127
+ self.invalidate_cache=invalidate_cache
128
+
129
+ def forward(self, *inp, **kwargs):
130
+ inp = list(inp)
131
+ if inp[0].device != self.dev:
132
+ inp[0] = inp[0].to(self.dev)
133
+
134
+ if cache['mask'] is None or cache['mask'].device != self.dev or self.invalidate_cache:
135
+ cache['mask'] = kwargs['attention_mask'].to(self.dev)
136
+ kwargs['attention_mask'] = cache['mask']
137
+
138
+ if cache['position_ids'] is None or cache['position_ids'].device != self.dev or self.invalidate_cache:
139
+ cache['position_ids'] = kwargs['position_ids'].to(self.dev)
140
+ kwargs['position_ids'] = cache['position_ids']
141
+
142
+ tmp = self.module(*inp, **kwargs)
143
+ return tmp
144
+
145
+ layers = model.model.layers
146
+ from math import ceil
147
+ if not gpu_dist:
148
+ pergpu = ceil(len(layers) / len(gpus))
149
+ for i in range(len(layers)):
150
+ layers[i] = MoveModule(layers[i].to(0 if i == 0 or i == len(layers) -1 else gpus[(i-1) // pergpu]), i==0)
151
+ else:
152
+ assert gpu_dist[0] >= 2, "At least two layers must be on GPU 0."
153
+ assigned_gpus = [0] * (gpu_dist[0]-1)
154
+ for i in range(1, len(gpu_dist)):
155
+ assigned_gpus = assigned_gpus + [i] * gpu_dist[i]
156
+
157
+ remaining_assignments = len(layers)-len(assigned_gpus) - 1
158
+ if remaining_assignments > 0:
159
+ assigned_gpus = assigned_gpus + [-1] * remaining_assignments
160
+
161
+ assigned_gpus = assigned_gpus + [0]
162
+
163
+ for i in range(len(layers)):
164
+ layers[i] = MoveModule(layers[i].to(gpus[assigned_gpus[i]]), i==0)
165
+
166
+ model.gpus = gpus
167
+
168
+
169
+ def benchmark(model, input_ids, check=False):
170
+ input_ids = input_ids.to(model.gpus[0] if hasattr(model, 'gpus') else DEV)
171
+ torch.cuda.synchronize()
172
+
173
+ cache = {'past': None}
174
+
175
+ def clear_past(i):
176
+
177
+ def tmp(layer, inp, out):
178
+ if cache['past']:
179
+ cache['past'][i] = None
180
+
181
+ return tmp
182
+
183
+ for i, layer in enumerate(model.model.layers):
184
+ layer.register_forward_hook(clear_past(i))
185
+
186
+ print('Benchmarking ...')
187
+
188
+ if check:
189
+ loss = nn.CrossEntropyLoss()
190
+ tot = 0.
191
+
192
+ def sync():
193
+ if hasattr(model, 'gpus'):
194
+ for gpu in model.gpus:
195
+ torch.cuda.synchronize(gpu)
196
+ else:
197
+ torch.cuda.synchronize()
198
+
199
+ max_memory = 0
200
+ with torch.no_grad():
201
+ attention_mask = torch.ones((1, input_ids.numel()), device=DEV)
202
+ times = []
203
+ for i in range(input_ids.numel()):
204
+ tick = time.time()
205
+ out = model(input_ids[:, i:i + 1], past_key_values=cache['past'], attention_mask=attention_mask[:, :(i + 1)].reshape((1, -1)))
206
+ sync()
207
+ times.append(time.time() - tick)
208
+ print(i, times[-1])
209
+ if hasattr(model, 'gpus'):
210
+ mem_allocated = sum(torch.cuda.memory_allocated(gpu) for gpu in model.gpus) / 1024 / 1024
211
+ else:
212
+ mem_allocated = torch.cuda.memory_allocated() / 1024 / 1024
213
+ max_memory = max(max_memory, mem_allocated)
214
+ if check and i != input_ids.numel() - 1:
215
+ tot += loss(out.logits[0].to(DEV), input_ids[:, (i + 1)].to(DEV)).float()
216
+ cache['past'] = list(out.past_key_values)
217
+ del out
218
+ sync()
219
+ print('Median:', np.median(times))
220
+ if check:
221
+ print('PPL:', torch.exp(tot / (input_ids.numel() - 1)).item())
222
+ print('max memory(MiB):', max_memory)
223
+
224
+
225
+ if __name__ == '__main__':
226
+
227
+ parser = argparse.ArgumentParser()
228
+
229
+ parser.add_argument('model', type=str, help='qwen model to load')
230
+ parser.add_argument('--eval', action='store_true', help='evaluate quantized model.')
231
+ parser.add_argument('--test-generation', action='store_true', help='test generation.')
232
+ parser.add_argument('--groupsize', type=int, default=-1, help='Groupsize to use for quantization; default uses full row.')
233
+ parser.add_argument('--wbits', type=int, default=16, choices=[2, 3, 4, 8, 16], help='#bits to use for quantization; use 16 for evaluating base model.')
234
+ parser.add_argument('--load', type=str, default='', help='Load quantized model.')
235
+ parser.add_argument('--check', action='store_true', help='Whether to compute perplexity during benchmarking for verification.')
236
+ parser.add_argument('--true-sequential', action='store_true', help='Whether to run in true sequential mode.')
237
+ parser.add_argument('--new-eval', action='store_true', help='Whether to use the new PTB and C4 eval')
238
+ parser.add_argument('--layers-dist', type=str, default='', help='Distribution of layers across GPUs. e.g. 2:1:1 for 2 layers on GPU 0, 1 layer on GPU 1, and 1 layer on GPU 2. Any remaining layers will be assigned to your last GPU.')
239
+ parser.add_argument('--observe',
240
+ action='store_true',
241
+ help='Automatically upgrade layer precision to a higher precision, for example int2 to int4 or groupsize 128 to 64. \
242
+ When this feature is enabled, `--save` or `--save_safetensors` will be disabled.')
243
+ parser.add_argument('--quant-directory', type=str, default=None, help='Directory to export quantization parameters to in TOML format; `None` (the default) disables export.')
244
+
245
+
246
+ args = parser.parse_args()
247
+
248
+ if args.layers_dist:
249
+ gpu_dist = [int(x) for x in args.layers_dist.split(':')]
250
+ else:
251
+ gpu_dist = []
252
+
253
+ if type(args.load) is not str:
254
+ args.load = args.load.as_posix()
255
+
256
+ if args.load:
257
+ model = load_quant(args.model, args.load, args.wbits, args.groupsize)
258
+ else:
259
+ model = get_qwen(args.model)
260
+ model.eval()
261
+
262
+ from transformers import AutoTokenizer
263
+ tokenizer = AutoTokenizer.from_pretrained(args.model)
264
+
265
+ layers = model.model.layers
266
+
267
+
268
+ if args.test_generation:
269
+ gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())]
270
+ if len(gpus) > 1:
271
+ Qwen_multigpu(model, gpus, gpu_dist)
272
+ else:
273
+ model = model.to(DEV)
274
+
275
+ from transformers import AutoTokenizer, TextStreamer
276
+ tokenizer = AutoTokenizer.from_pretrained(args.model, use_fast=False)
277
+ input_ids = tokenizer(["The capital of New Mexico is"], return_tensors="pt").input_ids.to(gpus[0])
278
+ streamer = TextStreamer(tokenizer)
279
+ with torch.no_grad():
280
+ generated_ids = model.generate(input_ids, streamer=streamer)
281
+
282
+ model = model.to(DEV)
283
+
284
+ if args.eval:
285
+ # sys.path.append("../eval_my")
286
+ # print(sys.path)
287
+ from eval_my.evaluate_ import eval_ours
288
+ eval_ours(model, tokenizer)
289
+
290
+
291
+
292
+
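
The same load path can be driven from Python rather than via test.sh; a sketch with placeholder paths for the local Qwen3 checkpoint directory and the packed GPTQ state dict (both names below are hypothetical, mirroring the --load invocation):

from transformers import AutoTokenizer
from qwen import load_quant

MODEL_DIR = "/path/to/Qwen3-0.6B"                 # placeholder: local HF model directory
CKPT = "/path/to/gptq_0.6B_w4_perchannel.pth"     # placeholder: packed 4-bit checkpoint

model = load_quant(MODEL_DIR, CKPT, wbits=4, groupsize=-1).cuda()
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

prompt = tokenizer("The capital of New Mexico is", return_tensors="pt").input_ids.cuda()
print(tokenizer.decode(model.generate(prompt, max_new_tokens=16)[0]))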
GPTQ-for-Qwen_hf/qwen_gptq_0.6B_loadtest.log ADDED
@@ -0,0 +1,37 @@
1
+ /mnt/data/lin/pretrained_models/GPTQ-for-Qwen_hf/quant/quant_linear.py:285: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
2
+ @custom_fwd(cast_inputs=torch.float16)
3
+ /mnt/data/lin/pretrained_models/GPTQ-for-Qwen_hf/quant/quant_linear.py:294: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
4
+ def backward(ctx, grad_output):
5
+ `loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
6
+ /home/beihang/lin/pretrained_models/GPTQ-for-Qwen_hf/qwen.py:94: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
7
+ model.load_state_dict(torch.load(checkpoint))
8
+ Loading model ...
9
+ Found 5 unique KN Linear values.
10
+ Warming up autotune cache ...
11
+
12
  0%| | 0/12 [00:00<?, ?it/s]
13
  8%|▊ | 1/12 [00:02<00:23, 2.18s/it]
14
  17%|█▋ | 2/12 [00:03<00:18, 1.86s/it]
15
  25%|██▌ | 3/12 [00:05<00:15, 1.76s/it]
16
  33%|███▎ | 4/12 [00:07<00:13, 1.71s/it]
17
  42%|████▏ | 5/12 [00:08<00:11, 1.68s/it]
18
  50%|█████ | 6/12 [00:10<00:10, 1.67s/it]
19
  58%|█████▊ | 7/12 [00:12<00:08, 1.67s/it]
20
  67%|██████▋ | 8/12 [00:14<00:07, 1.76s/it]
21
  75%|███████▌ | 9/12 [00:16<00:05, 1.84s/it]
22
  83%|████████▎ | 10/12 [00:18<00:03, 1.92s/it]
23
  92%|█████████▏| 11/12 [00:20<00:02, 2.01s/it]
24
+ Found 0 unique fused mlp KN values.
25
+ Warming up autotune cache ...
26
+
27
  0%| | 0/12 [00:00<?, ?it/s]
28
+ 2025-05-10:22:35:04,025 WARNING [huggingface.py:98] `pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way.
29
+ 2025-05-10:22:35:04,048 WARNING [huggingface.py:279] Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration
30
+ Token indices sequence length is longer than the specified maximum sequence length for this model (299078 > 131072). Running this sequence through the model will result in indexing errors
31
+ Done.
32
+ bos/eos tokens updated: tokenizer.bos_token_id=1, tokenizer.eos_token_id=2
33
+
34
  0%| | 0/146 [00:00<?, ?it/s]
35
  1%| | 1/146 [00:00<01:01, 2.36it/s]
36
  1%|▏ | 2/146 [00:00<00:37, 3.89it/s]
37
  2%|▏ | 3/146 [00:00<00:28, 4.98it/s]
38
  3%|▎ | 4/146 [00:00<00:24, 5.72it/s]
39
  3%|▎ | 5/146 [00:00<00:22, 6.24it/s]
40
  4%|▍ | 6/146 [00:01<00:21, 6.61it/s]
41
  5%|▍ | 7/146 [00:01<00:20, 6.86it/s]
42
  5%|▌ | 8/146 [00:01<00:19, 7.04it/s]
43
  6%|▌ | 9/146 [00:01<00:19, 7.16it/s]
44
  7%|▋ | 10/146 [00:01<00:18, 7.24it/s]
45
  8%|▊ | 11/146 [00:01<00:18, 7.31it/s]
46
  8%|▊ | 12/146 [00:01<00:18, 7.35it/s]
47
  9%|▉ | 13/146 [00:02<00:18, 7.37it/s]
48
  10%|▉ | 14/146 [00:02<00:17, 7.39it/s]
49
  10%|█ | 15/146 [00:02<00:17, 7.41it/s]
50
  11%|█ | 16/146 [00:02<00:17, 7.34it/s]
51
  12%|█▏ | 17/146 [00:02<00:17, 7.37it/s]
52
  12%|█▏ | 18/146 [00:02<00:17, 7.39it/s]
53
  13%|█▎ | 19/146 [00:02<00:17, 7.40it/s]
54
  14%|█▎ | 20/146 [00:02<00:17, 7.40it/s]
55
  14%|█▍ | 21/146 [00:03<00:16, 7.40it/s]
56
  15%|█▌ | 22/146 [00:03<00:16, 7.42it/s]
57
  16%|█▌ | 23/146 [00:03<00:16, 7.43it/s]
58
  16%|█▋ | 24/146 [00:03<00:16, 7.43it/s]
59
  17%|█▋ | 25/146 [00:03<00:16, 7.43it/s]
60
  18%|█▊ | 26/146 [00:03<00:16, 7.44it/s]
61
  18%|█▊ | 27/146 [00:03<00:15, 7.44it/s]
62
  19%|█▉ | 28/146 [00:04<00:15, 7.45it/s]
63
  20%|█▉ | 29/146 [00:04<00:15, 7.45it/s]
64
  21%|██ | 30/146 [00:04<00:15, 7.45it/s]
65
  21%|██ | 31/146 [00:04<00:15, 7.45it/s]
66
  22%|██▏ | 32/146 [00:04<00:15, 7.45it/s]
67
  23%|██▎ | 33/146 [00:04<00:15, 7.43it/s]
68
  23%|██▎ | 34/146 [00:04<00:15, 7.44it/s]
69
  24%|██▍ | 35/146 [00:05<00:14, 7.44it/s]
70
  25%|██▍ | 36/146 [00:05<00:14, 7.44it/s]
71
  25%|██▌ | 37/146 [00:05<00:14, 7.44it/s]
72
  26%|██▌ | 38/146 [00:05<00:14, 7.44it/s]
73
  27%|██▋ | 39/146 [00:05<00:14, 7.44it/s]
74
  27%|██▋ | 40/146 [00:05<00:14, 7.44it/s]
75
  28%|██▊ | 41/146 [00:05<00:14, 7.45it/s]
76
  29%|██▉ | 42/146 [00:05<00:13, 7.44it/s]
77
  29%|██▉ | 43/146 [00:06<00:13, 7.45it/s]
78
  30%|███ | 44/146 [00:06<00:13, 7.45it/s]
79
  31%|███ | 45/146 [00:06<00:13, 7.45it/s]
80
  32%|███▏ | 46/146 [00:06<00:13, 7.45it/s]
81
  32%|███▏ | 47/146 [00:06<00:13, 7.45it/s]
82
  33%|███▎ | 48/146 [00:06<00:13, 7.44it/s]
83
  34%|███▎ | 49/146 [00:06<00:13, 7.45it/s]
84
  34%|███▍ | 50/146 [00:07<00:12, 7.45it/s]
85
  35%|███▍ | 51/146 [00:07<00:12, 7.45it/s]
86
  36%|███▌ | 52/146 [00:07<00:12, 7.45it/s]
87
  36%|███▋ | 53/146 [00:07<00:12, 7.45it/s]
88
  37%|███▋ | 54/146 [00:07<00:12, 7.45it/s]
89
  38%|███▊ | 55/146 [00:07<00:12, 7.45it/s]
90
  38%|███▊ | 56/146 [00:07<00:12, 7.45it/s]
91
  39%|███▉ | 57/146 [00:07<00:11, 7.45it/s]
92
  40%|███▉ | 58/146 [00:08<00:11, 7.46it/s]
93
  40%|████ | 59/146 [00:08<00:11, 7.45it/s]
94
  41%|████ | 60/146 [00:08<00:11, 7.45it/s]
95
  42%|████▏ | 61/146 [00:08<00:11, 7.46it/s]
96
  42%|████▏ | 62/146 [00:08<00:11, 7.45it/s]
97
  43%|████▎ | 63/146 [00:08<00:11, 7.45it/s]
98
  44%|████▍ | 64/146 [00:08<00:10, 7.46it/s]
99
  45%|████▍ | 65/146 [00:09<00:10, 7.46it/s]
100
  45%|████▌ | 66/146 [00:09<00:10, 7.46it/s]
101
  46%|████▌ | 67/146 [00:09<00:10, 7.45it/s]
102
  47%|████▋ | 68/146 [00:09<00:10, 7.45it/s]
103
  47%|████▋ | 69/146 [00:09<00:10, 7.46it/s]
104
  48%|████▊ | 70/146 [00:09<00:10, 7.46it/s]
105
  49%|████▊ | 71/146 [00:09<00:10, 7.45it/s]
106
  49%|████▉ | 72/146 [00:09<00:09, 7.45it/s]
107
  50%|█████ | 73/146 [00:10<00:09, 7.45it/s]
108
  51%|█████ | 74/146 [00:10<00:09, 7.45it/s]
109
  51%|█████▏ | 75/146 [00:10<00:09, 7.45it/s]
110
  52%|█████▏ | 76/146 [00:10<00:09, 7.45it/s]
111
  53%|█████▎ | 77/146 [00:10<00:09, 7.45it/s]
112
  53%|█████▎ | 78/146 [00:10<00:09, 7.45it/s]
113
  54%|█████▍ | 79/146 [00:10<00:08, 7.46it/s]
114
  55%|█████▍ | 80/146 [00:11<00:08, 7.46it/s]
115
  55%|█████▌ | 81/146 [00:11<00:08, 7.46it/s]
116
  56%|█████▌ | 82/146 [00:11<00:08, 7.46it/s]
117
  57%|█████▋ | 83/146 [00:11<00:08, 7.46it/s]
118
  58%|█████▊ | 84/146 [00:11<00:08, 7.45it/s]
119
  58%|█████▊ | 85/146 [00:11<00:08, 7.45it/s]
120
  59%|█████▉ | 86/146 [00:11<00:08, 7.45it/s]
121
  60%|█████▉ | 87/146 [00:11<00:07, 7.45it/s]
122
  60%|██████ | 88/146 [00:12<00:07, 7.45it/s]
123
  61%|██████ | 89/146 [00:12<00:07, 7.46it/s]
124
  62%|██████▏ | 90/146 [00:12<00:07, 7.46it/s]
125
  62%|██████▏ | 91/146 [00:12<00:07, 7.45it/s]
126
  63%|██████▎ | 92/146 [00:12<00:07, 7.45it/s]
127
  64%|██████▎ | 93/146 [00:12<00:07, 7.45it/s]
128
  64%|██████▍ | 94/146 [00:12<00:06, 7.45it/s]
129
  65%|██████▌ | 95/146 [00:13<00:06, 7.45it/s]
130
  66%|██████▌ | 96/146 [00:13<00:06, 7.46it/s]
131
  66%|██████▋ | 97/146 [00:13<00:06, 7.46it/s]
132
  67%|██████▋ | 98/146 [00:13<00:06, 7.46it/s]
133
  68%|██████▊ | 99/146 [00:13<00:06, 7.45it/s]
134
  68%|██████▊ | 100/146 [00:13<00:06, 7.46it/s]
135
  69%|██████▉ | 101/146 [00:13<00:06, 7.45it/s]
136
  70%|██████▉ | 102/146 [00:13<00:05, 7.45it/s]
137
  71%|███████ | 103/146 [00:14<00:05, 7.45it/s]
138
  71%|███████ | 104/146 [00:14<00:05, 7.45it/s]
139
  72%|███████▏ | 105/146 [00:14<00:05, 7.46it/s]
140
  73%|███████▎ | 106/146 [00:14<00:05, 7.46it/s]
141
  73%|███████▎ | 107/146 [00:14<00:05, 7.46it/s]
142
  74%|███████▍ | 108/146 [00:14<00:05, 7.46it/s]
143
  75%|███████▍ | 109/146 [00:14<00:04, 7.46it/s]
144
  75%|███████▌ | 110/146 [00:15<00:04, 7.46it/s]
145
  76%|███████▌ | 111/146 [00:15<00:04, 7.46it/s]
146
  77%|███████▋ | 112/146 [00:15<00:04, 7.46it/s]
147
  77%|███████▋ | 113/146 [00:15<00:04, 7.45it/s]
148
  78%|███████▊ | 114/146 [00:15<00:04, 7.45it/s]
149
  79%|███████▉ | 115/146 [00:15<00:04, 7.45it/s]
150
  79%|███████▉ | 116/146 [00:15<00:04, 7.46it/s]
151
  80%|████████ | 117/146 [00:16<00:03, 7.46it/s]
152
  81%|████████ | 118/146 [00:16<00:03, 7.46it/s]
153
  82%|████████▏ | 119/146 [00:16<00:03, 7.46it/s]
154
  82%|████████▏ | 120/146 [00:16<00:03, 7.46it/s]
155
  83%|████████▎ | 121/146 [00:16<00:03, 7.46it/s]
156
  84%|████████▎ | 122/146 [00:16<00:03, 7.46it/s]
157
  84%|████████▍ | 123/146 [00:16<00:03, 7.46it/s]
158
  85%|████████▍ | 124/146 [00:16<00:02, 7.46it/s]
159
  86%|████████▌ | 125/146 [00:17<00:02, 7.46it/s]
160
  86%|████████▋ | 126/146 [00:17<00:02, 7.46it/s]
161
  87%|████████▋ | 127/146 [00:17<00:02, 7.46it/s]
162
  88%|████████▊ | 128/146 [00:17<00:02, 7.45it/s]
163
  88%|████████▊ | 129/146 [00:17<00:02, 7.45it/s]
164
  89%|████████▉ | 130/146 [00:17<00:02, 7.46it/s]
165
  90%|████████▉ | 131/146 [00:17<00:02, 7.46it/s]
166
  90%|█████████ | 132/146 [00:18<00:01, 7.46it/s]
167
  91%|█████████ | 133/146 [00:18<00:01, 7.46it/s]
168
  92%|█████████▏| 134/146 [00:18<00:01, 7.46it/s]
169
  92%|█████████▏| 135/146 [00:18<00:01, 7.46it/s]
170
  93%|█████████▎| 136/146 [00:18<00:01, 7.46it/s]
171
  94%|█████████▍| 137/146 [00:18<00:01, 7.46it/s]
172
  95%|█████████▍| 138/146 [00:18<00:01, 7.45it/s]
173
  95%|█████████▌| 139/146 [00:18<00:00, 7.46it/s]
174
  96%|█████████▌| 140/146 [00:19<00:00, 7.45it/s]
175
  97%|█████████▋| 141/146 [00:19<00:00, 7.45it/s]
176
  97%|█████████▋| 142/146 [00:19<00:00, 7.45it/s]
177
  98%|█████████▊| 143/146 [00:19<00:00, 7.45it/s]
178
  99%|█████████▊| 144/146 [00:19<00:00, 7.45it/s]
179
  99%|█████████▉| 145/146 [00:19<00:00, 7.46it/s]
180
+ wikitext2 18.208261489868164
181
+ Traceback (most recent call last):
182
+ File "/home/beihang/lin/pretrained_models/GPTQ-for-Qwen_hf/qwen.py", line 288, in <module>
183
+ eval_ours(model, tokenizer)
184
+ File "/mnt/data/lin/pretrained_models/GPTQ-for-Qwen_hf/eval_my/evaluate_.py", line 163, in eval_ours
185
+ results = evaluate_model(
186
+ File "/home/beihang/anaconda3/envs/qwen3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
187
+ return func(*args, **kwargs)
188
+ File "/mnt/data/lin/pretrained_models/GPTQ-for-Qwen_hf/eval_my/evaluate_.py", line 108, in evaluate_model
189
+ testloader = get_eval_loaders(dataset, tokenizer)
190
+ File "/mnt/data/lin/pretrained_models/GPTQ-for-Qwen_hf/eval_my/datautils.py", line 67, in get_eval_loaders
191
+ return get_c4(tokenizer)
192
+ File "/mnt/data/lin/pretrained_models/GPTQ-for-Qwen_hf/eval_my/datautils.py", line 690, in get_c4
193
+ valdata = load_from_disk('eval_my/ppl_datasets/allenai/c4/allenai--c4/validation')
194
+ File "/home/beihang/anaconda3/envs/qwen3/lib/python3.10/site-packages/datasets/load.py", line 2694, in load_from_disk
195
+ raise FileNotFoundError(f"Directory {dataset_path} not found")
196
+ FileNotFoundError: Directory eval_my/ppl_datasets/allenai/c4/allenai--c4/validation not found
GPTQ-for-Qwen_hf/requirements.txt ADDED
@@ -0,0 +1,11 @@
1
+ safetensors==0.3.1
2
+ datasets==2.10.1
3
+ sentencepiece
4
+ git+https://github.com/huggingface/transformers
5
+ accelerate==0.20.3
6
+ triton==2.0.0
7
+ texttable
8
+ toml
9
+ numpy
10
+ protobuf==3.20.2
11
+
GPTQ-for-Qwen_hf/test.sh ADDED
@@ -0,0 +1 @@
1
+ CUDA_VISIBLE_DEVICES=6 python /home/beihang/lin/pretrained_models/GPTQ-for-Qwen_hf/qwen.py /home/beihang/lin/pretrained_models/Qwen3/Qwen3-0.6B --wbits 4 --groupsize -1 --eval --load /home/beihang/lin/pretrained_models/eval_my/GPTQ-for-LLaMa-triton/gptq_0.6B_w4_perchannel.pth 2>&1 | tee /home/beihang/lin/pretrained_models/GPTQ-for-Qwen_hf/qwen_gptq_0.6B_loadtest.log
GPTQ-for-Qwen_hf/utils/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .modelutils import DEV, find_layers, gen_conditions, torch_snr_error
+ from .datautils import set_seed, get_wikitext2, get_ptb, get_c4, get_ptb_new, get_c4_new, get_loaders
+ from .export import export_quant_table
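For orientation, a tiny illustrative sketch of what these package-level re-exports make available to the top-level scripts (the calls below are examples, not part of the upload):

# Sketch: names re-exported by utils/__init__.py.
from utils import DEV, set_seed

set_seed(0)
print(DEV)  # cuda:0, as defined in utils/modelutils.py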
GPTQ-for-Qwen_hf/utils/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (485 Bytes).
GPTQ-for-Qwen_hf/utils/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (488 Bytes).
GPTQ-for-Qwen_hf/utils/__pycache__/datautils.cpython-310.pyc ADDED
Binary file (4.31 kB).
GPTQ-for-Qwen_hf/utils/__pycache__/datautils.cpython-38.pyc ADDED
Binary file (5.15 kB).
GPTQ-for-Qwen_hf/utils/__pycache__/export.cpython-310.pyc ADDED
Binary file (1.21 kB).
GPTQ-for-Qwen_hf/utils/__pycache__/export.cpython-38.pyc ADDED
Binary file (1.19 kB).
GPTQ-for-Qwen_hf/utils/__pycache__/modelutils.cpython-310.pyc ADDED
Binary file (2.31 kB).
GPTQ-for-Qwen_hf/utils/__pycache__/modelutils.cpython-38.pyc ADDED
Binary file (2.3 kB).
GPTQ-for-Qwen_hf/utils/datautils.py ADDED
@@ -0,0 +1,234 @@
+ import numpy as np
+ import torch
+
+
+ def set_seed(seed):
+     np.random.seed(seed)
+     torch.random.manual_seed(seed)
+
+
+ def get_wikitext2(nsamples, seed, seqlen, model):
+     from datasets import load_dataset
+     traindata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train')
+     testdata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')
+
+     from transformers import AutoTokenizer
+     try:
+         tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False)
+     except:
+         tokenizer = AutoTokenizer.from_pretrained(model, use_fast=True)
+     trainenc = tokenizer("\n\n".join(traindata['text']), return_tensors='pt')
+     testenc = tokenizer("\n\n".join(testdata['text']), return_tensors='pt')
+
+     import random
+     random.seed(seed)
+     trainloader = []
+     for _ in range(nsamples):
+         i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
+         j = i + seqlen
+         inp = trainenc.input_ids[:, i:j]
+         tar = inp.clone()
+         tar[:, :-1] = -100
+         trainloader.append((inp, tar))
+     return trainloader, testenc
+
+
+ def get_ptb(nsamples, seed, seqlen, model):
+     from datasets import load_dataset
+     traindata = load_dataset('ptb_text_only', 'penn_treebank', split='train')
+     valdata = load_dataset('ptb_text_only', 'penn_treebank', split='validation')
+
+     from transformers import AutoTokenizer
+     try:
+         tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False)
+     except:
+         tokenizer = AutoTokenizer.from_pretrained(model, use_fast=True)
+     trainenc = tokenizer("\n\n".join(traindata['sentence']), return_tensors='pt')
+     testenc = tokenizer("\n\n".join(valdata['sentence']), return_tensors='pt')
+
+     import random
+     random.seed(seed)
+     trainloader = []
+     for _ in range(nsamples):
+         i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
+         j = i + seqlen
+         inp = trainenc.input_ids[:, i:j]
+         tar = inp.clone()
+         tar[:, :-1] = -100
+         trainloader.append((inp, tar))
+     return trainloader, testenc
+
+
+ # def get_c4(nsamples, seed, seqlen, model):
+ #     from datasets import load_dataset
+ #     traindata = load_dataset('allenai/c4', 'allenai--c4', data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, split='train', use_auth_token=False)
+ #     valdata = load_dataset('allenai/c4', 'allenai--c4', data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'}, split='validation', use_auth_token=False)
+
+ #     from transformers import AutoTokenizer
+ #     try:
+ #         tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False)
+ #     except:
+ #         tokenizer = AutoTokenizer.from_pretrained(model, use_fast=True)
+
+ #     import random
+ #     random.seed(seed)
+ #     trainloader = []
+ #     for _ in range(nsamples):
+ #         while True:
+ #             i = random.randint(0, len(traindata) - 1)
+ #             trainenc = tokenizer(traindata[i]['text'], return_tensors='pt')
+ #             if trainenc.input_ids.shape[1] >= seqlen:
+ #                 break
+ #         i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
+ #         j = i + seqlen
+ #         inp = trainenc.input_ids[:, i:j]
+ #         tar = inp.clone()
+ #         tar[:, :-1] = -100
+ #         trainloader.append((inp, tar))
+
+ #     import random
+ #     random.seed(0)
+ #     valenc = []
+ #     for _ in range(256):
+ #         while True:
+ #             i = random.randint(0, len(valdata) - 1)
+ #             tmp = tokenizer(valdata[i]['text'], return_tensors='pt')
+ #             if tmp.input_ids.shape[1] >= seqlen:
+ #                 break
+ #         i = random.randint(0, tmp.input_ids.shape[1] - seqlen - 1)
+ #         j = i + seqlen
+ #         valenc.append(tmp.input_ids[:, i:j])
+ #     valenc = torch.hstack(valenc)
+
+ #     class TokenizerWrapper:
+
+ #         def __init__(self, input_ids):
+ #             self.input_ids = input_ids
+
+ #     valenc = TokenizerWrapper(valenc)
+
+ #     return trainloader, valenc
+
+ class TokenizerWrapper:
+     def __init__(self, input_ids):
+         self.input_ids = input_ids
+
+ import numpy as np
+ import torch
+ from datasets import load_dataset, load_from_disk
+ from transformers import AutoTokenizer, LlamaTokenizer
+ import os
+ import random
+
+ def get_c4(nsamples, seed, seqlen, model, tokenizer):
+     # traindata = load_dataset(
+     #     'allenai/c4', data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, split='train'
+     # )
+     # valdata = load_dataset(
+     #     'allenai/c4', data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'}, split='validation'
+     # )
+     alldata = load_from_disk('/home/beihang/lin/pretrained_models/c4/allenai--c4')
+     traindata = alldata['train']
+     valdata = alldata['validation']
+
+     random.seed(seed)
+     trainloader = []
+     for _ in range(nsamples):
+         while True:
+             i = random.randint(0, len(traindata) - 1)
+             trainenc = tokenizer(traindata[i]['text'], return_tensors='pt')
+             if trainenc.input_ids.shape[1] > seqlen:
+                 break
+         i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
+         j = i + seqlen
+         inp = trainenc.input_ids[:, i:j]
+         tar = inp.clone()
+         tar[:, :-1] = -100
+         trainloader.append((inp, tar))
+
+     valenc = tokenizer(' '.join(valdata[:1100]['text']), return_tensors='pt')
+     valenc = valenc.input_ids[:, :(256 * seqlen)]
+
+     valenc = TokenizerWrapper(valenc)
+
+     return trainloader, valenc
+
+
+ def get_ptb_new(nsamples, seed, seqlen, model):
+     from datasets import load_dataset
+     traindata = load_dataset('ptb_text_only', 'penn_treebank', split='train')
+     testdata = load_dataset('ptb_text_only', 'penn_treebank', split='test')
+
+     from transformers import AutoTokenizer
+     try:
+         tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False)
+     except:
+         tokenizer = AutoTokenizer.from_pretrained(model, use_fast=True)
+     trainenc = tokenizer(" ".join(traindata['sentence']), return_tensors='pt')
+     testenc = tokenizer(" ".join(testdata['sentence']), return_tensors='pt')
+
+     import random
+     random.seed(seed)
+     trainloader = []
+     for _ in range(nsamples):
+         i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
+         j = i + seqlen
+         inp = trainenc.input_ids[:, i:j]
+         tar = inp.clone()
+         tar[:, :-1] = -100
+         trainloader.append((inp, tar))
+     return trainloader, testenc
+
+
+ def get_c4_new(nsamples, seed, seqlen, model):
+     from datasets import load_dataset
+     traindata = load_dataset('allenai/c4', 'allenai--c4', data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, split='train')
+     valdata = load_dataset('allenai/c4', 'allenai--c4', data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'}, split='validation')
+
+     from transformers import AutoTokenizer
+     try:
+         tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False)
+     except:
+         tokenizer = AutoTokenizer.from_pretrained(model, use_fast=True)
+
+     import random
+     random.seed(seed)
+     trainloader = []
+     for _ in range(nsamples):
+         while True:
+             i = random.randint(0, len(traindata) - 1)
+             trainenc = tokenizer(traindata[i]['text'], return_tensors='pt')
+             if trainenc.input_ids.shape[1] >= seqlen:
+                 break
+         i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
+         j = i + seqlen
+         inp = trainenc.input_ids[:, i:j]
+         tar = inp.clone()
+         tar[:, :-1] = -100
+         trainloader.append((inp, tar))
+
+     valenc = tokenizer(' '.join(valdata[:1100]['text']), return_tensors='pt')
+     valenc = valenc.input_ids[:, :(256 * seqlen)]
+
+     class TokenizerWrapper:
+
+         def __init__(self, input_ids):
+             self.input_ids = input_ids
+
+     valenc = TokenizerWrapper(valenc)
+
+     return trainloader, valenc
+
+
+ def get_loaders(name, nsamples=128, seed=0, seqlen=2048, model='', tokenizer=None):
+     if 'wikitext2' in name:
+         return get_wikitext2(nsamples, seed, seqlen, model)
+     if 'ptb' in name:
+         if 'new' in name:
+             return get_ptb_new(nsamples, seed, seqlen, model)
+         return get_ptb(nsamples, seed, seqlen, model)
+     if 'c4' in name:
+         if 'new' in name:
+             return get_c4_new(nsamples, seed, seqlen, model)
+         return get_c4(nsamples, seed, seqlen, model, tokenizer=tokenizer)
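As a usage note, get_loaders above is the single entry point for calibration data; a minimal sketch for wikitext2 (the model path is the Qwen3-0.6B checkpoint referenced in test.sh and is an assumption about the local layout; the call also assumes the Hugging Face hub is reachable for the dataset download):

# Sketch: 128 calibration sequences of length 2048 from wikitext2.
from utils.datautils import get_loaders

trainloader, testenc = get_loaders(
    'wikitext2',
    nsamples=128,
    seed=0,
    seqlen=2048,
    model='/home/beihang/lin/pretrained_models/Qwen3/Qwen3-0.6B',
)
print(len(trainloader))         # 128 (inp, tar) pairs
print(trainloader[0][0].shape)  # torch.Size([1, 2048])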
GPTQ-for-Qwen_hf/utils/export.py ADDED
@@ -0,0 +1,37 @@
+ import numpy as np
+ import toml
+ import os
+
+
+ def export_quant_table(quantizers: dict, quant_dir: str, format: str = 'toml'):
+
+     if not os.path.exists(quant_dir):
+         os.mkdir(quant_dir)
+
+     table = {}
+
+     def save_tensor(name: str, tensor):
+         np.save(os.path.join(quant_dir, name), tensor.numpy())
+         return '{}.npy'.format(name)
+
+     for key, value in quantizers.items():
+         quantizer = value[0]
+
+         dump = dict()
+
+         sym = quantizer.sym
+         if not sym:
+             dump['zero'] = save_tensor(name=key + '.zero', tensor=value[2])
+         dump['scale'] = save_tensor(name=key + '.scale', tensor=value[1])
+         dump['wbits'] = value[4]
+         dump['groupsize'] = value[5]
+         if value[5] > 0:
+             dump['group_ids'] = save_tensor(name=key + '.group_ids', tensor=value[3])
+
+         dump['sym'] = sym
+         dump['perchannel'] = quantizer.perchannel
+
+         table[key] = dump
+
+     with open(os.path.join(quant_dir, 'quant.toml'), 'w') as f:
+         toml.dump(table, f)
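The quantizers dict consumed by export_quant_table is keyed by layer name, and each value is laid out as (quantizer, scale, zero, group_ids, wbits, groupsize), which is how the function indexes value above. A hedged, self-contained sketch follows; DummyQuantizer is a hypothetical stand-in for the Quantizer class in quant/quantizer.py, used only to make the example runnable:

# Sketch: minimal quantizers dict accepted by export_quant_table.
import torch
from utils.export import export_quant_table

class DummyQuantizer:
    sym = False        # asymmetric, so a zero-point tensor is exported
    perchannel = True

quantizers = {
    'model.layers.0.self_attn.q_proj': (
        DummyQuantizer(),
        torch.ones(1024),    # scale, one entry per output channel
        torch.zeros(1024),   # zero point
        torch.zeros(0),      # group_ids, unused when groupsize == -1
        4,                   # wbits
        -1,                  # groupsize (-1 means per-channel, no groups)
    ),
}
export_quant_table(quantizers, 'quant_out')  # writes quant_out/quant.toml plus .npy tensors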
GPTQ-for-Qwen_hf/utils/modelutils.py ADDED
@@ -0,0 +1,83 @@
+ import torch
+ import torch.nn as nn
+
+ DEV = torch.device('cuda:0')
+
+
+ def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=''):
+     if type(module) in layers:
+         return {name: module}
+     res = {}
+     for name1, child in module.named_children():
+         res.update(find_layers(child, layers=layers, name=name + '.' + name1 if name != '' else name1))
+     return res
+
+
+ def gen_conditions(_wbits, _groupsize):
+     wbits = _wbits
+     groupsize = _groupsize
+     conditions = []
+     while True:
+         if wbits >= 8:
+             if groupsize == -1 or groupsize == 32:
+                 break
+
+         if groupsize > 32:
+             groupsize /= 2
+         else:
+             wbits *= 2
+             groupsize = _groupsize
+
+         conditions.append((int(wbits), int(groupsize)))
+     return conditions
+
+
+ # copied from https://github.com/openppl-public/ppq/blob/master/ppq/quantization/measure/norm.py
+ def torch_snr_error(y_pred: torch.Tensor, y_real: torch.Tensor, reduction: str = 'mean') -> torch.Tensor:
+     """
+     Compute the SNR error between y_pred (tensor) and y_real (tensor).
+
+     The SNR error is calculated as:
+
+         SNR(pred, real) = (pred - real) ^ 2 / (real) ^ 2
+
+     If the inputs are matrices, the SNR error over the matrix is the mean of the
+     element-wise SNR error:
+
+         SNR(pred, real) = mean((pred - real) ^ 2 / (real) ^ 2)
+
+     Args:
+         y_pred (torch.Tensor): predicted (e.g. quantized) tensor.
+         y_real (torch.Tensor): reference tensor.
+         reduction (str, optional): 'mean', 'sum' or 'none'. Defaults to 'mean'.
+     Raises:
+         ValueError: if the two tensors have different shapes.
+         ValueError: if the reduction method is not supported.
+     Returns:
+         torch.Tensor: the (reduced) SNR error.
+     """
+     y_pred = y_pred.type(torch.float32)
+     y_real = y_real.type(torch.float32)
+
+     if y_pred.shape != y_real.shape:
+         raise ValueError(f'Can not compute snr loss for tensors with different shape. '
+                          f'({y_pred.shape} and {y_real.shape})')
+     reduction = str(reduction).lower()
+
+     if y_pred.ndim == 1:
+         y_pred = y_pred.unsqueeze(0)
+         y_real = y_real.unsqueeze(0)
+
+     y_pred = y_pred.flatten(start_dim=1)
+     y_real = y_real.flatten(start_dim=1)
+
+     noise_power = torch.pow(y_pred - y_real, 2).sum(dim=-1)
+     signal_power = torch.pow(y_real, 2).sum(dim=-1)
+     snr = (noise_power) / (signal_power + 1e-7)
+
+     if reduction == 'mean':
+         return torch.mean(snr)
+     elif reduction == 'sum':
+         return torch.sum(snr)
+     elif reduction == 'none':
+         return snr
+     else:
+         raise ValueError('Unsupported reduction method.')
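To make the SNR metric and the layer search concrete, a short self-contained sketch with random tensors and a toy module (illustrative only, not tied to any checkpoint):

# Sketch: SNR error between a noisy copy of a weight matrix and the original,
# plus find_layers on a small Sequential model.
import torch
import torch.nn as nn
from utils.modelutils import torch_snr_error, find_layers

w = torch.randn(128, 512)
w_noisy = w + 0.01 * torch.randn_like(w)   # stand-in for quantization error
print(torch_snr_error(w_noisy, w).item())  # about 1e-4; lower means closer

mlp = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 16))
print(find_layers(mlp))                    # {'0': Linear(...), '2': Linear(...)}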