Take a small language model (e.g., distilbert-base-uncased or TinyLlama-1.1B), apply 8-bit and 4-bit quantization, and compare model size, inference latency, memory usage, and accuracy.
```bash
pip install transformers optimum[onnxruntime] torch onnx onnxruntime bitsandbytes accelerate
```
```python
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the FP32 baseline model and tokenizer
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
```
Record the baseline's on-disk size, inference latency, and memory footprint (e.g., `torch.cuda.memory_allocated()` on GPU, or process memory on CPU).
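One way to collect these numbers is sketched below. This is a minimal sketch, not a library API: `dir_size_mb` and `mean_latency_ms` are illustrative helpers, the `./fp32_model` directory is an assumption, and the size of the saved checkpoint stands in for model size.

```python
import os
import time

import torch

def dir_size_mb(path: str) -> float:
    """Total size of all files under `path`, in megabytes (illustrative helper)."""
    return sum(
        os.path.getsize(os.path.join(root, name))
        for root, _, files in os.walk(path)
        for name in files
    ) / 1e6

def mean_latency_ms(model, tokenizer, text: str, n_runs: int = 50) -> float:
    """Average latency of a single forward pass, in milliseconds (illustrative helper)."""
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        model(**inputs)  # warm-up run, excluded from timing
        start = time.perf_counter()
        for _ in range(n_runs):
            model(**inputs)
    return (time.perf_counter() - start) / n_runs * 1000

# Baseline measurements (assumed checkpoint directory: ./fp32_model)
model.save_pretrained("./fp32_model")
print("Size (MB):   ", round(dir_size_mb("./fp32_model"), 1))
print("Latency (ms):", round(mean_latency_ms(model, tokenizer, "A surprisingly good movie."), 1))
if torch.cuda.is_available():
    print("VRAM (MB):  ", torch.cuda.memory_allocated() / 1e6)
```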
```python
from optimum.onnxruntime import ORTModelForSequenceClassification, ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig

# First export the model to ONNX
model_ort = ORTModelForSequenceClassification.from_pretrained(
    model_name, export=True  # older optimum versions use from_transformers=True
)
model_ort.save_pretrained("./onnx_model")

# Configure dynamic INT8 quantization (this preset targets AVX512-VNNI CPUs;
# AutoQuantizationConfig also provides e.g. avx2 and arm64 presets)
dqconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)

quantizer = ORTQuantizer.from_pretrained("./onnx_model")
quantizer.quantize(save_dir="./quantized_model", quantization_config=dqconfig)
```
```python
from optimum.onnxruntime import ORTModelForSequenceClassification

# Load the quantized model and repeat the size, latency, and memory measurements
model_quant = ORTModelForSequenceClassification.from_pretrained("./quantized_model")
```
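Assuming the illustrative helpers from the baseline sketch above are still defined, they can be reused directly on the quantized ONNX model (ORT models accept PyTorch tensors as input); note that `dir_size_mb` measures everything saved in the directory.

```python
# Reuse the illustrative helpers from the baseline sketch above
print("INT8 size (MB):   ", round(dir_size_mb("./quantized_model"), 1))
print("INT8 latency (ms):", round(mean_latency_ms(model_quant, tokenizer, "A surprisingly good movie."), 1))
```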
```python
import torch
from transformers import BitsAndBytesConfig

# 4-bit NF4 quantization via bitsandbytes (typically requires a CUDA GPU)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model_4bit = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

# Measure size, time, memory, and accuracy again
```
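The accuracy column in the table below can be estimated on the SST-2 validation split. The sketch below assumes the `datasets` library is installed (`pip install datasets`); the `accuracy` helper is illustrative, and the same loop can be pointed at the INT8 model as well.

```python
import torch
from datasets import load_dataset

# SST-2 validation split (872 labeled sentences)
dataset = load_dataset("glue", "sst2", split="validation")

def accuracy(model, tokenizer, dataset, batch_size=32):
    """Percentage of correctly classified examples (illustrative helper)."""
    correct = 0
    for i in range(0, len(dataset), batch_size):
        batch = dataset[i : i + batch_size]
        inputs = tokenizer(batch["sentence"], padding=True, truncation=True, return_tensors="pt")
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        with torch.no_grad():
            preds = model(**inputs).logits.argmax(dim=-1)
        correct += (preds.cpu() == torch.tensor(batch["label"])).sum().item()
    return 100 * correct / len(dataset)

print("FP32 accuracy (%):", round(accuracy(model, tokenizer, dataset), 1))
print("NF4 accuracy (%): ", round(accuracy(model_4bit, tokenizer, dataset), 1))
```

Exact numbers will vary with hardware, library versions, and the chosen quantization configuration.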
| Model | Size (MB) | Latency (ms) | VRAM Usage (MB) | Accuracy (%) |
|---|---|---|---|---|
| Original FP32 | 267 | 45 | 1024 | 91.2 |
| Quantized INT8 | 67 | 28 | 256 | 90.8 |
| Quantized NF4 | 34 | 35* | 128 | 90.1 |
(* 4-bit quantization may be slower on some hardware because weights must be dequantized on the fly during inference.)