mansaripo committed on
Commit
1a13422
·
verified ·
1 Parent(s): d0aacb0

Update vllm_plugin/quartet2_quant.py

Browse files
Files changed (1) hide show
  1. vllm_plugin/quartet2_quant.py +28 -32
vllm_plugin/quartet2_quant.py CHANGED
@@ -72,15 +72,28 @@ class QuartetIILinearMethod(LinearMethodBase):
72
  layer.register_parameter("weight", weight)
73
 
74
  def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
75
- from scipy.linalg import hadamard as scipy_hadamard
76
- device = layer.weight.device
77
- had_np = scipy_hadamard(128) * 128 ** -0.5
78
- layer.had = torch.tensor(
79
- had_np, dtype=torch.bfloat16, device=device, requires_grad=False,
80
- )
81
- layer.scratch_amax = torch.empty(
82
- (), dtype=torch.uint32, device=device,
83
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
  def apply(
86
  self,
@@ -91,9 +104,8 @@ class QuartetIILinearMethod(LinearMethodBase):
91
  from quartet2.quant import quant_fp4, NVFP4QuantMode
92
  from quartet2.linear import abs_max, _fp4_mm
93
 
94
- weight = layer.weight
95
  orig_shape = x.shape
96
- out_features = weight.shape[0]
97
  flat_x = x.reshape(-1, x.shape[-1])
98
 
99
  num_rows = flat_x.shape[0]
@@ -104,38 +116,22 @@ class QuartetIILinearMethod(LinearMethodBase):
104
  else:
105
  pad_rows = 0
106
 
107
- w_remainder = out_features % 128
108
- if w_remainder != 0:
109
- w_pad = 128 - w_remainder
110
- weight = F.pad(weight, (0, 0, 0, w_pad))
111
- else:
112
- w_pad = 0
113
-
114
  input_amax = abs_max(flat_x)
115
- weight_amax = abs_max(weight)
116
-
117
- mode = NVFP4QuantMode.FOUR_SIX
118
- scale_override = 1.0
119
-
120
  input_fp4 = quant_fp4(
121
  flat_x, amax=input_amax,
122
- scale_override=scale_override, mode=mode,
123
- )
124
- weight_fp4 = quant_fp4(
125
- weight, amax=weight_amax,
126
- scale_override=scale_override, mode=mode,
127
  )
128
 
129
- alpha = input_fp4.tensor_scale * weight_fp4.tensor_scale
130
  output = _fp4_mm(
131
- input_fp4.fp4, weight_fp4.fp4,
132
- input_fp4.micro_scales, weight_fp4.micro_scales,
133
  alpha,
134
  )
135
 
136
  if pad_rows > 0:
137
  output = output[:num_rows]
138
- if w_pad > 0:
139
  output = output[:, :out_features]
140
 
141
  output = output.reshape(*orig_shape[:-1], output.shape[-1])
 
72
  layer.register_parameter("weight", weight)
73
 
74
  def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
75
+ from quartet2.quant import quant_fp4, NVFP4QuantMode
76
+ from quartet2.linear import abs_max
77
+
78
+ weight = layer.weight.data
79
+ device = weight.device
80
+ out_features = weight.shape[0]
81
+
82
+ w_remainder = out_features % 128
83
+ if w_remainder != 0:
84
+ w_pad = 128 - w_remainder
85
+ weight = F.pad(weight, (0, 0, 0, w_pad))
86
+ else:
87
+ w_pad = 0
88
+
89
+ mode = NVFP4QuantMode.FOUR_SIX
90
+ weight_amax = abs_max(weight)
91
+ wq = quant_fp4(weight, amax=weight_amax, scale_override=1.0, mode=mode)
92
+
93
+ layer.weight_fp4 = wq.fp4
94
+ layer.weight_micro_scales = wq.micro_scales
95
+ layer.weight_tensor_scale = wq.tensor_scale
96
+ layer.w_pad = w_pad
97
 
98
  def apply(
99
  self,
 
104
  from quartet2.quant import quant_fp4, NVFP4QuantMode
105
  from quartet2.linear import abs_max, _fp4_mm
106
 
 
107
  orig_shape = x.shape
108
+ out_features = layer.weight.shape[0]
109
  flat_x = x.reshape(-1, x.shape[-1])
110
 
111
  num_rows = flat_x.shape[0]
 
116
  else:
117
  pad_rows = 0
118
 
 
 
 
 
 
 
 
119
  input_amax = abs_max(flat_x)
 
 
 
 
 
120
  input_fp4 = quant_fp4(
121
  flat_x, amax=input_amax,
122
+ scale_override=1.0, mode=NVFP4QuantMode.FOUR_SIX,
 
 
 
 
123
  )
124
 
125
+ alpha = input_fp4.tensor_scale * layer.weight_tensor_scale
126
  output = _fp4_mm(
127
+ input_fp4.fp4, layer.weight_fp4,
128
+ input_fp4.micro_scales, layer.weight_micro_scales,
129
  alpha,
130
  )
131
 
132
  if pad_rows > 0:
133
  output = output[:num_rows]
134
+ if layer.w_pad > 0:
135
  output = output[:, :out_features]
136
 
137
  output = output.reshape(*orig_shape[:-1], output.shape[-1])