[Callbacks] Consolidate Saving Methods #1168

Open · wants to merge 5 commits into base: main
43 changes: 11 additions & 32 deletions src/llmcompressor/pytorch/model_load/helpers.py
@@ -6,6 +6,7 @@
from loguru import logger
from safetensors import safe_open
from torch.nn import Module
from transformers import PreTrainedModel

from llmcompressor.core import active_session, create_session, pre_initialize_structure
from llmcompressor.typing import Processor
@@ -14,20 +15,19 @@

__all__ = [
"initialize_recipe",
"save_model_and_recipe",
"copy_python_files_from_model_cache",
"fallback_to_cpu",
"parse_dtype",
"get_session_model",
"get_completed_stages",
"save_completed_stages",
"save_checkpoint",
]


def initialize_recipe(model: Module, recipe_path: str):
"""
Initializes a recipe that has been previously applied to the model

:param model: PyTorch model to apply structure to
:param recipe_path: path to recipe to apply to the model
"""
@@ -49,43 +49,22 @@ def initialize_recipe(model: Module, recipe_path: str):
logger.info(f"Applied {msg} to the model")


def save_model_and_recipe(
model: Module,
def save_checkpoint(
save_path: str,
processor: Optional[Processor] = None,
save_safetensors: bool = False,
save_compressed: bool = False,
model: PreTrainedModel,
processor: Processor,
save_safetensors: bool = True,
save_compressed: bool = True,
):
"""
Save a model, processor and the currently loaded recipe to file

:param model: pytorch model to save
:param save_path: path to save output to
:param processor: model processor or tokenizer to save
:param save_safetensors: whether to save as safetensors or pickle (bin)
:param save_compressed: whether to compress sparse weights on disk
"""
# avoid circular import
from llmcompressor.transformers.utils.helpers import RECIPE_FILE_NAME

# saving the model also saves the recipe
model.save_pretrained(
save_path, save_compressed=save_compressed, safe_serialization=save_safetensors
save_path,
save_safetensors=save_safetensors,
save_compressed=save_compressed,
)

if processor is not None:
processor.save_pretrained(save_path)

logger.info("Saving output to {}".format(os.path.abspath(save_path)))

recipe_path = os.path.join(save_path, RECIPE_FILE_NAME)
session = active_session()
recipe_yaml_str = session.get_serialized_recipe()
with open(recipe_path, "w") as fp:
fp.write(recipe_yaml_str)

# copy python files from cache dir to save_path if any
copy_python_files_from_model_cache(model, save_path)


def fallback_to_cpu(device: str) -> str:
"""
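For orientation, here is a minimal usage sketch of the consolidated helper, based only on the signature shown in this file's diff; the `model`, `processor`, and output path are placeholders rather than values from this PR.

```python
# Hypothetical usage of the consolidated helper (placeholders: `model` is an
# llm-compressor-wrapped PreTrainedModel, `processor` its tokenizer/processor).
from llmcompressor.pytorch.model_load.helpers import save_checkpoint

save_checkpoint(
    save_path="./stage_checkpoint",  # directory for model, processor, and recipe
    model=model,
    processor=processor,
    save_safetensors=True,   # safetensors rather than pickled .bin shards
    save_compressed=True,    # compress sparse/quantized weights on disk
)
# Per the lines removed above, model.save_pretrained() on the wrapped model now
# also serializes the recipe, so the explicit recipe-writing block is gone.
```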
10 changes: 7 additions & 3 deletions src/llmcompressor/transformers/finetune/runner.py
@@ -17,6 +17,7 @@
from llmcompressor.pytorch.model_load.helpers import (
get_completed_stages,
get_session_model,
save_checkpoint,
save_completed_stages,
)
from llmcompressor.pytorch.utils import tensors_to_device
@@ -27,7 +28,7 @@
make_dataset_splits,
)
from llmcompressor.typing import Processor
from llmcompressor.utils.fsdp.helpers import is_fsdp_model, save_model_and_recipe
from llmcompressor.utils.fsdp.helpers import is_fsdp_model


class StageRunner:
@@ -261,17 +262,20 @@ def run_sequential_stages(self, checkpoint: Optional[str] = None):
self.train(checkpoint=checkpoint, stage=stage_name)
checkpoint = None

# save model between stages
if (
self._training_args.output_dir
!= TrainingArguments.__dataclass_fields__["output_dir"].default
and self.trainer.accelerator.is_main_process
):
save_model_and_recipe(
model=self.trainer.model,
save_checkpoint(
save_path=self._output_dir,
model=self.trainer.model,
processor=self.processor,
save_safetensors=self._training_args.save_safetensors,
save_compressed=self._model_args.save_compressed,
)
self.trainer.accelerator.wait_for_everyone()

# save stage to checkpoint dir
if self.trainer.accelerator.is_main_process:
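Restated as straight-line code, the between-stage save above fires only when the user set an explicit output directory and only on the main process, while the barrier is reached by every rank. A simplified sketch of that guard, using the same names as the diff (nothing new is introduced):

```python
# Simplified restatement of the between-stage save guard in run_sequential_stages.
default_output_dir = TrainingArguments.__dataclass_fields__["output_dir"].default

if (
    self._training_args.output_dir != default_output_dir  # user chose an output dir
    and self.trainer.accelerator.is_main_process          # only rank 0 writes files
):
    save_checkpoint(
        save_path=self._output_dir,
        model=self.trainer.model,
        processor=self.processor,
        save_safetensors=self._training_args.save_safetensors,
        save_compressed=self._model_args.save_compressed,
    )
self.trainer.accelerator.wait_for_everyone()  # every rank hits the barrier
```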
51 changes: 11 additions & 40 deletions src/llmcompressor/transformers/finetune/session_mixin.py
@@ -24,15 +24,13 @@
from llmcompressor.modifiers.distillation.utils.pytorch.model_wrapper import (
KDModelWrapper,
)
from llmcompressor.pytorch.model_load.helpers import get_session_model
from llmcompressor.pytorch.model_load.helpers import get_session_model, save_checkpoint
from llmcompressor.pytorch.utils import ModuleSparsificationInfo
from llmcompressor.transformers import RECIPE_FILE_NAME
from llmcompressor.transformers.finetune.callbacks import (
DisableHalfPrecisionCallback,
TrainingLoopCallbacks,
)
from llmcompressor.utils.fsdp.context import summon_full_params_context
from llmcompressor.utils.fsdp.helpers import is_fsdp_model, save_pretrained_fsdp
from llmcompressor.utils.pytorch import qat_active

if TYPE_CHECKING:
@@ -66,8 +64,8 @@ class SessionManagerMixIn:
def __init__(
self,
recipe: str,
data_args: "DatasetArguments",
model_args: "ModelArguments",
data_args: Optional["DatasetArguments"] = None,
teacher: Optional[Union[Module, str]] = None,
recipe_args: Optional[Union[Dict[str, Any], str]] = None,
**kwargs,
@@ -185,7 +183,6 @@ def initialize_structure(self, stage: Optional[str] = None):
"""
Initialize any recipe structural changes such as quantization on the model,
return immediately if session has already been initialized

:param stage: Optional stage of recipe to run, or None to run all stages
"""
session = active_session()
@@ -415,7 +412,6 @@ def evaluate(self, *args, **kwargs):
Run a sparsification evaluation cycle.
Runs initialize_structure for the sparse session before calling
super().evaluate() and finalization of the session after.

:param args: positional args to pass to super().evaluate()
:param kwargs: keyword args to pass to super().evaluate()
:return: the output from super.evaluate()
@@ -432,12 +428,12 @@ def predict(self, *args, **kwargs):
Run a sparsification prediction cycle.
Runs initialize_structure for the sparse session before calling
super().predict() and finalization of the session after.

:param args: positional args to pass to super().predict()
:param kwargs: keyword args to pass to super().predict()
:return: the output from super.predict()
"""
self.initialize_structure()

output = super().predict(*args, **kwargs)
self.finalize_session()

@@ -483,44 +479,19 @@ def save_model(self, output_dir: str, _internal_call=False, _is_oneshot=False):

# knowledge distillation requires making wrappers transparent during saving
if isinstance(self.model, KDModelWrapper):
self.model.prepare_for_save()
self.model.prepare_for_save() # TODO: move to finalize

if not is_fsdp_model(self.model):
self.model.save_pretrained(
# save checkpoint
self.save_state()
if self.accelerator.is_main_process:
processor = getattr(self, "processing_class", self.tokenizer)
save_checkpoint(
output_dir,
save_compressed=self.model_args.save_compressed,
safe_serialization=self.args.save_safetensors,
)
else: # FSDP model
save_pretrained_fsdp(
model=self.model,
accelerator=self.accelerator,
output_dir=output_dir,
processor=processor,
save_safetensors=self.args.save_safetensors,
save_compressed=self.model_args.save_compressed,
save_safetensors=self.metadata.get("save_safetensors", False),
)

self.save_state()
processor = getattr(self, "processing_class", self.tokenizer)
if processor is not None:
processor.save_pretrained(output_dir)

if not self.recipe:
return

if self.accelerator.is_main_process:
# save recipe, will contain modifiers from the model's original recipe as
# well as those added from self.recipe
recipe_path = os.path.join(output_dir, RECIPE_FILE_NAME)
session = active_session()
recipe_yaml_str = session.get_serialized_recipe()
with open(recipe_path, "w") as fp:
fp.write(recipe_yaml_str)

logger.info(
f"Saved LLM Compressor recipe with model state to {recipe_path}"
)

self.accelerator.wait_for_everyone()

if isinstance(self.model, KDModelWrapper):
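Taken together, the rewritten `save_model` boils down to: save trainer state on every rank, let only the main process write the consolidated checkpoint, then synchronize. A hedged sketch of that flow (KD-wrapper toggling and the early-return paths above the hunk are omitted; the `processing_class` lookup falls back to `tokenizer`, presumably to cover transformers versions that only expose the latter):

```python
# Sketch of the consolidated SessionManagerMixIn.save_model flow after this PR
# (simplified from the diff above; not the verbatim implementation).
def save_model(self, output_dir: str, _internal_call=False, _is_oneshot=False):
    self.save_state()  # trainer state (optimizer/scheduler/RNG) on every rank

    if self.accelerator.is_main_process:
        # newer transformers exposes processing_class; fall back to tokenizer
        processor = getattr(self, "processing_class", self.tokenizer)
        save_checkpoint(
            output_dir,
            model=self.model,
            processor=processor,
            save_safetensors=self.args.save_safetensors,
            save_compressed=self.model_args.save_compressed,
        )

    self.accelerator.wait_for_everyone()  # keep non-main ranks in lockstep
```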
20 changes: 13 additions & 7 deletions src/llmcompressor/transformers/finetune/text_generation.py
@@ -45,12 +45,12 @@
get_session_model,
initialize_recipe,
parse_dtype,
save_checkpoint,
)
from llmcompressor.recipe import Recipe, StageRunType
from llmcompressor.transformers.finetune.runner import StageRunner
from llmcompressor.transformers.finetune.trainer import Trainer
from llmcompressor.transformers.sparsification.compressed_tensors_utils import (
modify_fsdp_model_save_pretrained,
modify_save_pretrained,
patch_tied_tensors_bug,
)
@@ -418,7 +418,10 @@ def main(

# wrap model.save_pretrained
if is_fsdp_model(model):
modify_fsdp_model_save_pretrained(trainer, processor)
raise NotImplementedError(
[Collaborator review comment] We should put this in after the oneshot / stage runner refactor. Currently this is an ok pipeline.

"FSDP models are not supported in the current release but will be "
"suported in future releases of LLM Compressor"
)
else:
modify_save_pretrained(model)

@@ -455,16 +458,19 @@
stage_runner.predict()

# save if model was provided as a string or custom output_dir was set

if isinstance(model_args.model, str) or (
training_args.output_dir
!= TrainingArguments.__dataclass_fields__["output_dir"].default
and trainer.accelerator.is_main_process
):
model.save_pretrained(
training_args.output_dir, save_compressed=model_args.save_compressed
save_checkpoint(
save_path=training_args.output_dir,
model=model,
processor=processor,
save_safetensors=True,
save_compressed=model_args.save_compressed,
)
if processor is not None:
processor.save_pretrained(training_args.output_dir)
trainer.accelerator.wait_for_everyone()

# Clean up the CompressionSession before exit if requested
if recipe_args.clear_sparse_session:
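Because the parenthesization of the final-save condition is easy to misread, here it is unpacked with explicit names. This mirrors the logic in main() above and introduces no new behavior; the helper variables are illustrative only.

```python
# The final save runs if the model was passed as a string (path or hub stub), OR
# if a non-default output_dir was set and this is the main process.
default_output_dir = TrainingArguments.__dataclass_fields__["output_dir"].default

model_given_as_string = isinstance(model_args.model, str)
custom_output_dir_on_main = (
    training_args.output_dir != default_output_dir
    and trainer.accelerator.is_main_process
)

if model_given_as_string or custom_output_dir_on_main:
    save_checkpoint(
        save_path=training_args.output_dir,
        model=model,
        processor=processor,
        save_safetensors=True,                      # final save always uses safetensors
        save_compressed=model_args.save_compressed,
    )
trainer.accelerator.wait_for_everyone()
```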