vllm.model_executor.layers.quantization.utils.quant_utils ¶
This file is used for /tests and /benchmarks
kDynamic128Scale module-attribute ¶
kDynamic128Scale = ScaleDesc(
float32, False, GroupShape(1, 128)
)
kFp8Dynamic128Sym module-attribute ¶
kFp8Dynamic128Sym = QuantKey(
FP8_DTYPE, kDynamic128Scale, symmetric=True
)
kFp8Dynamic64Sym module-attribute ¶
kFp8Dynamic64Sym = QuantKey(
FP8_DTYPE, kDynamic64Scale, symmetric=True
)
kFp8DynamicTensorSym module-attribute ¶
kFp8DynamicTensorSym = QuantKey(
FP8_DTYPE, kDynamicTensorScale, symmetric=True
)
kFp8DynamicTokenSym module-attribute ¶
kFp8DynamicTokenSym = QuantKey(
FP8_DTYPE, kDynamicTokenScale, symmetric=True
)
kFp8Static128BlockSym module-attribute ¶
kFp8Static128BlockSym = QuantKey(
FP8_DTYPE, kStatic128BlockScale, symmetric=True
)
kFp8StaticChannelSym module-attribute ¶
kFp8StaticChannelSym = QuantKey(
FP8_DTYPE, kStaticChannelScale, symmetric=True
)
kFp8StaticTensorSym module-attribute ¶
kFp8StaticTensorSym = QuantKey(
FP8_DTYPE, kStaticTensorScale, symmetric=True
)
kFp8StaticTokenSym module-attribute ¶
kFp8StaticTokenSym = QuantKey(
FP8_DTYPE, kStaticTokenScale, symmetric=True
)
kNvfp4Dynamic module-attribute ¶
kNvfp4Dynamic = QuantKey(
FP4_DTYPE,
scale=kNvfp4DynamicGroupScale,
scale2=kStaticTensorScale,
)
kNvfp4DynamicGroupScale module-attribute ¶
kNvfp4DynamicGroupScale = ScaleDesc(
FP8_DTYPE, False, GroupShape(1, 16)
)
kNvfp4Static module-attribute ¶
kNvfp4Static = QuantKey(
FP4_DTYPE,
scale=kNvfp4StaticGroupScale,
scale2=kStaticTensorScale,
)
kNvfp4StaticGroupScale module-attribute ¶
kNvfp4StaticGroupScale = ScaleDesc(
FP8_DTYPE, True, GroupShape(1, 16)
)
kStatic128BlockScale module-attribute ¶
kStatic128BlockScale = ScaleDesc(
float32, True, GroupShape(128, 128)
)
GroupShape ¶
Bases: _GroupShape
This class describes the quantization group shape. It includes static members for common shapes (per-tensor, per-token).
Source code in vllm/model_executor/layers/quantization/utils/quant_utils.py
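A minimal usage sketch, mirroring the group shapes used by the module constants above; the `PER_TENSOR`/`PER_TOKEN` member names are assumptions based on the description of static members:

```python
# Sketch: constructing quantization group shapes (mirrors the module
# constants above; PER_TENSOR/PER_TOKEN member names are assumptions).
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape

per_group = GroupShape(1, 128)    # one scale per 128 elements, as in kDynamic128Scale
per_block = GroupShape(128, 128)  # one scale per 128x128 block, as in kStatic128BlockScale

# Assumed static members for the common shapes mentioned above:
# GroupShape.PER_TENSOR, GroupShape.PER_TOKEN
```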
QuantKey dataclass ¶
Class for identifying the type of quantization.

- `dtype`: quantized data type
- `scale`: scale descriptor
- `scale2`: second-level scale descriptor
- `symmetric`: symmetric if True, asymmetric if False
Source code in vllm/model_executor/layers/quantization/utils/quant_utils.py
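A minimal sketch reproducing `kFp8Dynamic128Sym` from the constants above; treating `FP8_DTYPE` as importable from this module is an assumption:

```python
# Sketch: a QuantKey equivalent to kFp8Dynamic128Sym, built from fields
# documented on this page (FP8_DTYPE import location is an assumption).
from vllm.model_executor.layers.quantization.utils.quant_utils import (
    FP8_DTYPE,
    QuantKey,
    kDynamic128Scale,
)

key = QuantKey(FP8_DTYPE, kDynamic128Scale, symmetric=True)
```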
ScaleDesc dataclass ¶
Class for describing a single quantization scaling factor.

- `dtype`: data type of the scale
- `static`: static scale if True, dynamic if False
- `group_shape`: group shape of the scale
Source code in vllm/model_executor/layers/quantization/utils/quant_utils.py
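A minimal sketch mirroring the `ScaleDesc` constants defined above (dtype, static flag, group shape):

```python
# Sketch: describing scales, mirroring the module constants above.
import torch

from vllm.model_executor.layers.quantization.utils.quant_utils import (
    GroupShape,
    ScaleDesc,
)

# Dynamic fp32 scales over 1x128 groups, as in kDynamic128Scale:
dyn_128 = ScaleDesc(torch.float32, False, GroupShape(1, 128))
# Static fp32 scales over 128x128 blocks, as in kStatic128BlockScale:
static_block = ScaleDesc(torch.float32, True, GroupShape(128, 128))
```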
__str__ ¶
Source code in vllm/model_executor/layers/quantization/utils/quant_utils.py
_GroupShape ¶
_normalize_quant_group_shape ¶
_normalize_quant_group_shape(
x: Tensor, group_shape: GroupShape
)
Source code in vllm/model_executor/layers/quantization/utils/quant_utils.py
awq_pack ¶
Source code in vllm/model_executor/layers/quantization/utils/quant_utils.py
convert_bf16_scales_to_fp8 ¶
Convert a BF16 scale tensor into the pair of (fp8_scales, channel_scales) expected by W4A8 GEMM kernels.
Source code in vllm/model_executor/layers/quantization/utils/quant_utils.py
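A hypothetical call sketch; only the `(fp8_scales, channel_scales)` return pair is documented here, so the single-argument form and the example shape are assumptions:

```python
# Hypothetical usage; the argument list is an assumption (only the
# (fp8_scales, channel_scales) return pair is documented here).
import torch

from vllm.model_executor.layers.quantization.utils.quant_utils import (
    convert_bf16_scales_to_fp8,
)

bf16_scales = torch.rand(4096, 32, dtype=torch.bfloat16)
fp8_scales, channel_scales = convert_bf16_scales_to_fp8(bf16_scales)
```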
convert_packed_uint4b8_to_signed_int4_inplace ¶
Convert packed uint4b8 values (stored in int32) to signed int4, in place.
Source code in vllm/model_executor/layers/quantization/utils/quant_utils.py
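The uint4b8 encoding stores each 4-bit value with a bias of 8, so the signed value is the stored nibble minus 8. A standalone illustration of that per-nibble mapping (not the in-place kernel itself):

```python
# Standalone illustration of the uint4b8 -> signed int4 mapping
# (bias of 8); this is the arithmetic, not the in-place kernel.
def uint4b8_nibble_to_int4(nibble: int) -> int:
    assert 0 <= nibble <= 0xF
    return nibble - 8  # stored u in [0, 15] represents u - 8 in [-8, 7]

assert uint4b8_nibble_to_int4(0) == -8
assert uint4b8_nibble_to_int4(8) == 0
assert uint4b8_nibble_to_int4(15) == 7
```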
cutlass_fp4_supported ¶
cutlass_fp4_supported() -> bool
Source code in vllm/model_executor/layers/quantization/utils/quant_utils.py
get_and_maybe_dequant_weights ¶
get_and_maybe_dequant_weights(
layer: LinearBase, out_dtype: dtype = float32
)
Return the layer's unquantized weights in [out, in] layout.
Source code in vllm/model_executor/layers/quantization/utils/quant_utils.py
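A minimal sketch using the signature above; `layer` stands in for any `LinearBase` instance:

```python
# Sketch: fetch a layer's weights in full precision; `layer` is a
# placeholder for any LinearBase instance in the model.
import torch

from vllm.model_executor.layers.quantization.utils.quant_utils import (
    get_and_maybe_dequant_weights,
)

w = get_and_maybe_dequant_weights(layer, out_dtype=torch.float32)
# w is [out_features, in_features], dequantized if the layer was quantized.
```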
get_attribute_fallback ¶
Source code in vllm/model_executor/layers/quantization/utils/quant_utils.py
get_fp8_min_max ¶
Get the min and max values for FP8 quantization.
Source code in vllm/model_executor/layers/quantization/utils/quant_utils.py
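For the common `float8_e4m3fn` dtype these bounds match `torch.finfo`; a standalone illustration of the values involved (not this helper's own signature, which is not shown here):

```python
# Standalone illustration via torch.finfo; the actual helper may pick
# the FP8 dtype per platform, and its signature is not documented here.
import torch

info = torch.finfo(torch.float8_e4m3fn)
print(info.min, info.max)  # -448.0 448.0
```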
get_pack_factor ¶
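The name suggests the number of quantized elements that fit in a 32-bit word; a hypothetical call under that assumption:

```python
# Assumed semantics: elements per 32-bit word, i.e. 32 // num_bits.
# The actual signature is not documented on this page.
from vllm.model_executor.layers.quantization.utils.quant_utils import (
    get_pack_factor,
)

pack_factor = get_pack_factor(4)  # hypothetically 8 nibbles per int32
```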
gptq_pack ¶
gptq_quantize_weights ¶
gptq_quantize_weights(
w: Tensor,
quant_type: ScalarType,
group_size: int,
act_order: bool,
test_perm: Tensor | None = None,
)
Source code in vllm/model_executor/layers/quantization/utils/quant_utils.py
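A minimal sketch using the signature above; `scalar_types.uint4b8` as the quant type and its import path are assumptions, and the result is left unpacked because its layout is not documented here:

```python
# Sketch per the signature above; the return layout is not documented
# here, so the result is left unpacked.
import torch

from vllm.model_executor.layers.quantization.utils.quant_utils import (
    gptq_quantize_weights,
)
from vllm.scalar_type import scalar_types  # assumed import path

w = torch.randn(4096, 4096, dtype=torch.half)
result = gptq_quantize_weights(
    w, quant_type=scalar_types.uint4b8, group_size=128, act_order=False
)
```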
group_broadcast ¶
Source code in vllm/model_executor/layers/quantization/utils/quant_utils.py
is_layer_skipped ¶
is_layer_skipped(
prefix: str,
ignored_layers: list[str],
fused_mapping: Mapping[
str, list[str]
] = MappingProxyType({}),
*,
skip_with_substr: bool = False,
) -> bool
Source code in vllm/model_executor/layers/quantization/utils/quant_utils.py
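A minimal sketch using the signature above; reading `skip_with_substr` as substring matching is an assumption from the name:

```python
# Sketch: deciding whether a layer bypasses quantization.
from vllm.model_executor.layers.quantization.utils.quant_utils import (
    is_layer_skipped,
)

skip = is_layer_skipped("lm_head", ["lm_head"])  # exact prefix match
# With skip_with_substr=True, matching is assumed to be by substring:
skip_sub = is_layer_skipped(
    "model.layers.0.mlp.gate_proj", ["gate_proj"], skip_with_substr=True
)
```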
pack_cols ¶
Source code in vllm/model_executor/layers/quantization/utils/quant_utils.py
pack_quantized_values_into_int32 ¶
pack_quantized_values_into_int32(
w_q: Tensor, wtype: ScalarType, packed_dim: int = 0
)
Source code in vllm/model_executor/layers/quantization/utils/quant_utils.py
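A round-trip sketch pairing this with `unpack_quantized_values_into_int32` (documented below); `scalar_types.uint4b8` and its import path are assumptions:

```python
# Sketch: pack 4-bit values along dim 0 into int32 words, then unpack.
import torch

from vllm.model_executor.layers.quantization.utils.quant_utils import (
    pack_quantized_values_into_int32,
    unpack_quantized_values_into_int32,
)
from vllm.scalar_type import scalar_types  # assumed import path

w_q = torch.randint(0, 16, (16, 32), dtype=torch.int32)
packed = pack_quantized_values_into_int32(w_q, scalar_types.uint4b8, packed_dim=0)
restored = unpack_quantized_values_into_int32(packed, scalar_types.uint4b8, packed_dim=0)
assert torch.equal(restored, w_q)
```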
pack_rows ¶
Source code in vllm/model_executor/layers/quantization/utils/quant_utils.py
permute_rows ¶
Source code in vllm/model_executor/layers/quantization/utils/quant_utils.py
prep_scale_for_group_broadcast ¶
prep_scale_for_group_broadcast(
scale: Tensor, x: Tensor, group_shape: GroupShape | None
) -> Tensor
Prepare the input quantization scale for group broadcasting.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `scale` | `Tensor` | The scale tensor (scalar or 1D). | *required* |
| `x` | `Tensor` | Target tensor whose shape determines broadcast dimensions. | *required* |
| `group_shape` | `GroupShape \| None` | GroupShape to broadcast over. | *required* |

Returns:

| Type | Description |
|---|---|
| `Tensor` | `scale` reshaped for correct broadcasting. |
Source code in vllm/model_executor/layers/quantization/utils/quant_utils.py
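A minimal sketch using the signature and parameters above:

```python
# Sketch: reshape a 1-D per-row scale so it broadcasts against x.
import torch

from vllm.model_executor.layers.quantization.utils.quant_utils import (
    GroupShape,
    prep_scale_for_group_broadcast,
)

x = torch.randn(8, 128)
scale = torch.rand(8)  # one scale per row of 128 elements
scale_b = prep_scale_for_group_broadcast(scale, x, GroupShape(1, 128))
# scale_b is shaped so that expressions like x * scale_b broadcast correctly.
```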
quantize_weights ¶
quantize_weights(
w: Tensor,
quant_type: ScalarType,
group_size: int | None,
zero_points: bool = False,
ref_zero_points_after_scales: bool = False,
)
Source code in vllm/model_executor/layers/quantization/utils/quant_utils.py
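A minimal sketch using the signature above; `scalar_types.uint4b8` and its import path are assumptions, and the result is left unpacked because its layout is not documented here:

```python
# Sketch per the signature above (scalar_types import path assumed);
# the return layout is not documented here, so it is left unpacked.
import torch

from vllm.model_executor.layers.quantization.utils.quant_utils import (
    quantize_weights,
)
from vllm.scalar_type import scalar_types  # assumed import path

w = torch.randn(4096, 4096, dtype=torch.half)
result = quantize_weights(w, scalar_types.uint4b8, group_size=128, zero_points=False)
```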
scaled_dequantize ¶
scaled_dequantize(
x_q: Tensor,
x_s: Tensor,
group_shape: GroupShape | None = None,
out_dtype: dtype = float32,
) -> Tensor
Source code in vllm/model_executor/layers/quantization/utils/quant_utils.py
scaled_quantize ¶
scaled_quantize(
x: Tensor,
group_shape: GroupShape,
quant_dtype: dtype,
compute_dtype: dtype | None = None,
) -> tuple[Tensor, Tensor]
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `x` | `Tensor` | Input tensor to quantize | *required* |
| `group_shape` | `GroupShape` | Shape of quantization groups | *required* |
| `quant_dtype` | `dtype` | Target quantized dtype (e.g., `torch.float8_e4m3fn`) | *required* |
| `compute_dtype` | `dtype \| None` | Optional dtype for intermediate computations. If `None`, uses the input dtype. Use `torch.float32` for higher precision. | `None` |
Source code in vllm/model_executor/layers/quantization/utils/quant_utils.py
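A round-trip sketch exercising `scaled_quantize` together with `scaled_dequantize` above, using only the two documented signatures:

```python
# Round trip: quantize x to FP8 with 1x128 groups, then dequantize.
import torch

from vllm.model_executor.layers.quantization.utils.quant_utils import (
    GroupShape,
    scaled_dequantize,
    scaled_quantize,
)

x = torch.randn(256, 512, dtype=torch.float32)
x_q, x_s = scaled_quantize(
    x, GroupShape(1, 128), torch.float8_e4m3fn, compute_dtype=torch.float32
)
x_dq = scaled_dequantize(x_q, x_s, GroupShape(1, 128), out_dtype=torch.float32)
# x_dq approximates x up to FP8 rounding error.
```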
sort_weights ¶
Source code in vllm/model_executor/layers/quantization/utils/quant_utils.py
swizzle_blockscale ¶
Pad and block-interleave the FP4 block-scales so that they match the data layout expected by the CUTLASS / FlashInfer kernels.
Parameters¶
- `scale` (`torch.Tensor`): the block-scales to swizzle.

Returns¶
- `torch.Tensor`: the swizzled tensor with the same logical shape as `scale`.
Source code in vllm/model_executor/layers/quantization/utils/quant_utils.py
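A minimal call sketch; the example scale shape and FP8 dtype are assumptions (the page only states that the input is a `torch.Tensor` of FP4 block-scales):

```python
# Sketch: swizzle FP4 block-scales into the kernel-expected layout.
# The shape and FP8 dtype of the scales here are assumptions.
import torch

from vllm.model_executor.layers.quantization.utils.quant_utils import (
    swizzle_blockscale,
)

scales = torch.rand(128, 16).to(torch.float8_e4m3fn)
swizzled = swizzle_blockscale(scales)  # same logical shape, interleaved layout
```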
unpack_cols ¶
Source code in vllm/model_executor/layers/quantization/utils/quant_utils.py
unpack_quantized_values_into_int32 ¶
unpack_quantized_values_into_int32(
w_q: Tensor, wtype: ScalarType, packed_dim: int = 0
)