Migrate internal brrr to nanotron #186

Draft · wants to merge 5 commits into main
16 changes: 13 additions & 3 deletions src/nanotron/config/config.py
@@ -337,6 +337,9 @@ class Config:
    data_stages: Optional[List[DatasetStageArgs]] = None
    profiler: Optional[ProfilerArgs] = None
    lighteval: Optional[LightEvalConfig] = None
    # To signal the training script to stop, just touch the following file.
    # We force users to set one so that it can be removed programmatically.
    kill_switch_path: Optional[Path] = None

    @classmethod
    def create_empty(cls):
@@ -345,6 +348,9 @@ def create_empty(cls):

    def __post_init__(self):
        # Some final sanity checks across separate arguments sections:
        if self.general is not None and os.environ.get("SLURM_JOB_ID", None) is not None:
            self.general.run = self.general.run.replace("%j", os.environ["SLURM_JOB_ID"])

        if self.profiler is not None and self.profiler.profiler_export_path is not None:
            assert self.tokens.train_steps < 10

@@ -376,9 +382,13 @@ def __post_init__(self):
                for i in range(len(self.data_stages) - 1)
            ), "The stages are not sorted by start_training_step in increasing order"

        # # if lighteval, we need tokenizer to be defined
        # if self.checkpoints.lighteval is not None:
        #     assert self.tokenizer.tokenizer_name_or_path is not None
        # if self.lighteval is not None:
        #     # assert self.tokenizer.tokenizer_name_or_path is not None
        #     if self.lighteval.checkpoints_path is None:
        #         self.lighteval.checkpoints_path = self.checkpoints.checkpoints_path

        if isinstance(self.kill_switch_path, str):
            self.kill_switch_path = Path(self.kill_switch_path)

    @property
    def global_batch_size(self):
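Reviewer note: a minimal sketch of how the new kill switch is meant to be driven from outside the trainer. The file path below is illustrative (not taken from this PR) and must match whatever `kill_switch_path` is set to in the config.

```python
# Hypothetical example: requesting a graceful stop of a running nanotron job by
# touching the file configured as `kill_switch_path`. The path is an assumption.
from pathlib import Path

kill_switch = Path("/tmp/my-run-kill-switch")  # must match config.kill_switch_path

# The trainer checks for this file after every training step (see trainer.py below),
# saves a checkpoint, and exits with code 0.
kill_switch.touch()

# Before relaunching, remove the file so the new run does not stop immediately.
# Requiring an explicit path is what makes this cleanup scriptable.
kill_switch.unlink(missing_ok=True)
```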
23 changes: 22 additions & 1 deletion src/nanotron/trainer.py
@@ -2,6 +2,7 @@
import json
import os
import shutil
import sys
import time
from dataclasses import asdict
from pathlib import Path
@@ -281,7 +282,12 @@ def pre_training(self, *args, **kwargs):
)

    def post_train_step(self):
        pass
        # Kill switch
        self.check_kill_switch(save_ckpt=True)

        # # Update our background upload/removal of checkpoints
        # if self.s3_mover is not None:
        #     self.s3_mover.update()

    def post_training(self):
        pass
@@ -895,6 +901,21 @@ def _mark_tied_parameters(
    ):
        mark_tied_parameters(model=model, parallel_context=parallel_context, parallel_config=parallel_config)

    def check_kill_switch(self, save_ckpt: bool):
        if self.config.kill_switch_path and self.config.kill_switch_path.exists():
            log_rank(
                f"Detected kill switch at {self.config.kill_switch_path}. Exiting",
                logger=logger,
                level=logging.INFO,
                rank=0,
            )

            # Save checkpoint
            if save_ckpt:
                self.save_checkpoint()
            dist.barrier()
            sys.exit(0)


def mark_tied_parameters(
    model: NanotronModel, parallel_context: ParallelContext, parallel_config: Optional[ParallelismArgs] = None
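For readers skimming the diff, here is a self-contained sketch of how `post_train_step` and `check_kill_switch` cooperate inside a training loop. `ToyTrainer`, its methods, and the path are invented for illustration and are not part of nanotron or this PR.

```python
# Illustrative only: a stripped-down stand-in for DistributedTrainer showing the
# kill-switch flow added in this PR. All "toy" names are invented.
import sys
from pathlib import Path
from typing import Optional


class ToyTrainer:
    def __init__(self, kill_switch_path: Optional[Path] = None):
        self.kill_switch_path = kill_switch_path

    def save_checkpoint(self) -> None:
        print("checkpoint saved")  # stand-in for the real checkpointing logic

    def check_kill_switch(self, save_ckpt: bool) -> None:
        # Mirrors DistributedTrainer.check_kill_switch: if the file exists,
        # optionally save a checkpoint, then exit cleanly.
        if self.kill_switch_path and self.kill_switch_path.exists():
            print(f"Detected kill switch at {self.kill_switch_path}. Exiting")
            if save_ckpt:
                self.save_checkpoint()
            sys.exit(0)

    def train(self, steps: int) -> None:
        for _ in range(steps):
            ...  # forward / backward / optimizer step
            # Mirrors post_train_step: the check runs once per training step.
            self.check_kill_switch(save_ckpt=True)


if __name__ == "__main__":
    ToyTrainer(kill_switch_path=Path("/tmp/my-run-kill-switch")).train(steps=1000)
```

The real implementation additionally calls `dist.barrier()` before `sys.exit(0)` so that all ranks reach the same point before the process group shuts down.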