
Commit 5a6a1f5

Enable dequant of fp8 weights quantized per-channel with the compressed-tensors method
Signed-off-by: mandy-li <[email protected]>
1 parent e38c8e9 commit 5a6a1f5

4 files changed: 35 additions, 0 deletions
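For orientation, here is a minimal sketch (illustrative only, not code from this commit) of what per-channel FP8 weight dequantization means: each output channel of a linear weight carries its own scale, so dequantization is just an upcast followed by an elementwise multiply with that per-channel scale. The tensor names and shapes below are assumptions made for the example.

import torch  # requires a PyTorch build with float8 support

out_features, in_features = 4, 8

# FP8 weight in the conventional [out_features, in_features] layout and one
# scale per output channel (per row), shape [out_features, 1].
weight_fp8 = torch.randn(out_features, in_features).to(torch.float8_e4m3fn)
weight_scale = torch.rand(out_features, 1)

# Dequantize: upcast to the scale dtype, multiply each row by its scale,
# then cast to bfloat16 for the compute path.
dequant_weight = (weight_fp8.to(weight_scale.dtype) * weight_scale).to(torch.bfloat16)
print(dequant_weight.shape)  # torch.Size([4, 8])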

tests/full_tests/ci_gsm8k_tests.sh

Lines changed: 10 additions & 0 deletions
@@ -162,6 +162,15 @@ run_compressed_w4a16_moe_gidx_test() {
     echo "✅ Test with compressed w4a16 MoE with g_idx passed."
 }
 
+# Llama-3.3-70B-Instruct-FP8-dynamic + INC dynamic quant
+run_llama3_70b_inc_dynamic_quant_test() {
+    echo "➡️ Testing Llama-3.3-70B-Instruct-FP8-dynamic + inc dynamic quant in torch.compile mode ..."
+    QUANT_CONFIG="${VLLM_GAUDI_PREFIX}/tests/models/language/generation/inc_maxabs_dynamic_quant.json" \
+    HABANA_VISIBLE_DEVICES=all RUNTIME_SCALE_PATCHING=0 VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=0 \
+    python -u "${VLLM_GAUDI_PREFIX}/tests/full_tests/generate.py" --model RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic --max-model-len 2048
+    echo "✅ Test with Llama-3.3-70B-Instruct-FP8-dynamic + inc dynamic quant in torch.compile mode passed."
+}
+
 # GSM8K on granite-8b
 run_gsm8k_granite_test() {
     echo "➡️ Testing GSM8K on granite-8b..."
@@ -304,6 +313,7 @@ launch_all_tests() {
     run_spec_decode_ngram_test
     run_spec_decode_eagle3_test
     run_spec_decode_eagle3_num_spec_2_test
+    run_llama3_70b_inc_dynamic_quant_test
     #run_embedding_model_test
     echo "🎉 All test suites passed successfully!"
 }
tests/models/language/generation/inc_maxabs_dynamic_quant.json

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+{
+    "mode": "QUANTIZE",
+    "observer": "maxabs",
+    "scale_method": "ACT_MAXABS_PCS_POW2_WEIGHT_MAXABS_PTS_POW2_HW",
+    "dynamic_quantization": "True",
+    "scale_format": "CONST",
+    "dump_stats_path": ""
+}

vllm_gaudi/extension/ops.py

Lines changed: 6 additions & 0 deletions
@@ -767,6 +767,12 @@ def wrapper(*args, **kwargs):
     return wrapper
 
 
+def bind_dequant_func(layer):
+    # For INC path, we attach the dequant func to the layer
+    layer.get_dequant_weights_func = types.MethodType(get_dequant_weights_func, layer)
+    return layer
+
+
 def fp8_block_linear_postprocess_weights(layer, force_channel_fp8=False):
     weight, orig_M, orig_N = pad_block_fp8_weight_naive(layer.weight.data, layer.weight_scale_inv.data,
                                                         layer.quant_config.weight_block_size)
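The new bind_dequant_func helper uses types.MethodType to attach a function to an existing layer object as a bound method, so the INC path can later call layer.get_dequant_weights_func(). A minimal, self-contained sketch of that binding pattern (the real get_dequant_weights_func lives elsewhere in ops.py and is not shown in this diff; the class and return value below are placeholders):

import types

class DummyLayer:
    pass

def get_dequant_weights_func(self):
    # In the real module this returns the scheme-specific dequant routine;
    # here it only demonstrates that the function is bound to the instance.
    return f"bound to {type(self).__name__}"

layer = DummyLayer()
# Attach the free function to this particular instance as a bound method.
layer.get_dequant_weights_func = types.MethodType(get_dequant_weights_func, layer)
print(layer.get_dequant_weights_func())  # bound to DummyLayer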

vllm_gaudi/ops/hpu_compressed_tensors.py

Lines changed: 11 additions & 0 deletions
@@ -88,6 +88,13 @@ def get_hpu_scheme(self, layer: torch.nn.Module):
             raise ValueError(f"{scheme_classname} compressed format is not supported on HPU")
         return hpu_scheme
 
+    def dequant_fp8_weight(self, layer: torch.nn.Module) -> torch.Tensor:
+        if layer.scheme.strategy == QuantizationStrategy.CHANNEL:  # weights were quantized per-channel
+            dequant_weight = layer.weight.to(layer.weight_scale.dtype) * layer.weight_scale.squeeze()
+            return dequant_weight.to(torch.bfloat16).t()
+        else:
+            raise NotImplementedError("Implemented per-channel dequantization only")
+
 
 @CustomOp.register_oot(name='CompressedTensorsW8A16Fp8')
 class HPUCompressedTensorsW8A8Fp8(CompressedTensorsScheme):
@@ -115,6 +122,10 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         # required by torch.compile to be torch.nn.Parameter
         layer.input_scale = torch.nn.Parameter(layer.input_scale.data, requires_grad=False)
 
+        # bind dequant function to layer for per-channel quantization
+        if layer.scheme.strategy == QuantizationStrategy.CHANNEL:
+            hpu_ops.bind_dequant_func(layer)
+
     def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, output_partition_sizes: list[int],
                        input_size: int, output_size: int, params_dtype: torch.dtype, **extra_weight_attrs):
         """
