tensorflow · CrickWu · Jul 7, 2023 · Jul 7, 2023
@@ -272,7 +272,7 @@ class TrainerConfig(base_config.Config):
   recovery_max_trials: int = 0
   validation_summary_subdir: str = "validation"
   # Preemption on-demand checkpoint.
-  preemption_on_demand_checkpoint: bool = True
+  preemption_on_demand_checkpoint: bool = False
 
 
 @dataclasses.dataclass

@@ -53,7 +53,7 @@ def _run_experiment_with_preemption_recovery(params, model_dir):
           **params.runtime.model_parallelism())
       with distribution_strategy.scope():
         task = task_factory.get_task(params.task, logging_dir=model_dir)
-      preemption_watcher = tf.distribute.experimental.PreemptionWatcher()
+      preemption_watcher = None
 
       train_lib.run_experiment(
           distribution_strategy=distribution_strategy,

@@ -46,7 +46,7 @@ def _run_experiment_with_preemption_recovery(params, model_dir):
           tpu_address=params.runtime.tpu)
       with distribution_strategy.scope():
         task = task_factory.get_task(params.task, logging_dir=model_dir)
-      preemption_watcher = tf.distribute.experimental.PreemptionWatcher()
+      preemption_watcher = None
 
       train_lib.run_experiment(
           distribution_strategy=distribution_strategy,