
Commit 438e327

Enable dequant fp8 weights quantized per-channel with compressed-tensor method
Signed-off-by: mandy-li <[email protected]>
1 parent e18a075 · commit 438e327

File tree

1 file changed: +13, -0 lines


vllm_gaudi/ops/hpu_compressed_tensors.py

Lines changed: 13 additions & 0 deletions
@@ -88,6 +88,19 @@ def get_hpu_scheme(self, layer: torch.nn.Module):
             raise ValueError(f"{scheme_classname} compressed format is not supported on HPU")
         return hpu_scheme
 
+    def dequant_fp8_weight(self, layer: torch.nn.Module) -> torch.Tensor:
+        if layer.scheme.strategy == QuantizationStrategy.CHANNEL.value:  # weights were quantized per-channel
+            dequant_weight = layer.weight.to(
+                layer.weight_scale.dtype) * layer.weight_scale.squeeze()
+            return dequant_weight.to(torch.bfloat16).t()
+        else:
+            raise NotImplementedError(
+                "Implemented per-channel dequantization only")
+
+    def get_dequant_weights_func(
+            self, ) -> Optional[Callable[[torch.nn.Module], torch.Tensor]]:
+        return self.dequant_fp8_weight
+
 
 @CustomOp.register_oot(name='CompressedTensorsW8A16Fp8')
 class HPUCompressedTensorsW8A8Fp8(CompressedTensorsScheme):
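
To make the per-channel dequantization concrete, here is a minimal, self-contained sketch (not part of the commit) of the math dequant_fp8_weight() performs. The tensor names and shapes are illustrative assumptions: the fp8 weight is taken as (in_features, out_features) with one scale per output channel, which is what the broadcast against weight_scale.squeeze() and the trailing .t() in the diff suggest.

import torch  # requires a torch build with float8_e4m3fn support

# Hypothetical shapes standing in for a real layer; not taken from the commit.
in_features, out_features = 8, 4

# Stand-ins for layer.weight (fp8) and layer.weight_scale (per-output-channel scales).
weight_fp8 = torch.randn(in_features, out_features).to(torch.float8_e4m3fn)
weight_scale = torch.rand(out_features, 1, dtype=torch.float32)

# Same steps as dequant_fp8_weight(): upcast the fp8 weight to the scale dtype,
# broadcast-multiply by the squeezed per-channel scale, then cast to bf16 and transpose.
dequant_weight = weight_fp8.to(weight_scale.dtype) * weight_scale.squeeze()
dequant_bf16 = dequant_weight.to(torch.bfloat16).t()

print(dequant_bf16.shape, dequant_bf16.dtype)  # torch.Size([4, 8]) torch.bfloat16

Presumably, the bfloat16 cast and transpose put the dequantized weight into the (out_features, in_features) layout and dtype the downstream HPU path expects; the NotImplementedError branch makes explicit that only the per-channel (CHANNEL) strategy is handled by this method for now.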
