A Coding Implementation to Compress and Benchmark Instruction-Tuned LLMs with FP8, GPTQ, and SmoothQuant Quantization using llmcompressor
import subprocess
import sys


def pip(*pkgs: str) -> None:
    """Install packages quietly with the running interpreter's pip.

    Runs ``<sys.executable> -m pip install -q <pkgs...>`` and blocks until
    pip finishes.

    Args:
        *pkgs: Requirement specifiers passed verbatim to ``pip install``.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", *pkgs])


# Install first: transformers and datasets, imported below, are among the
# packages installed by this call.
pip("llmcompressor", "compressed-tensors", "transformers>=4.45", "accelerate", "datasets")

import os
import gc
import time
import json
import math
from pathlib import Path

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset

# Stop immediately when no CUDA device is visible; the message tells the user
# how to enable one in the notebook runtime settings.
assert torch.cuda.is_available(), (
    "Enable a GPU: Runtime > Change runtime type > T4 GPU"
)

# NOTE(review): the source text is cut off after the "| CUDA:" label (it ends
# with an ellipsis), so only the arguments that are visible are reproduced
# here. Restore the remaining arguments from the complete original.
print("GPU:", torch.cuda.get_device_name(0), "| CUDA:")
