From a4ce96cac95eb0f59fd898936fe30018728da641 Mon Sep 17 00:00:00 2001 From: lruizcalico Date: Wed, 19 Jun 2024 16:17:07 -0700 Subject: [PATCH 1/4] add gcs upload for composer train --- src/baskerville/trainer.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/src/baskerville/trainer.py b/src/baskerville/trainer.py index 75e538e..91f00d9 100644 --- a/src/baskerville/trainer.py +++ b/src/baskerville/trainer.py @@ -17,7 +17,8 @@ import numpy as np import tensorflow as tf - +import tempfile +from baskerville.helpers.gcs_utils import is_gcs_path, upload_folder_gcs from baskerville import metrics @@ -119,6 +120,15 @@ def __init__( self.batch_size = self.train_data[0].batch_size self.compiled = False + # if log_dir is in gcs then create a local temp dir + if is_gcs_path(self.log_dir): + folder_name = self.log_dir.split("/")[-1] + self.log_dir = tempfile.mkdtemp() + folder_name + self.gcs_log_dir = log_dir + self.gcs = True + else: + self.gcs = False + # early stopping self.patience = self.params.get("patience", 20) @@ -498,6 +508,10 @@ def eval_step1_distr(xd, yd): print(" - valid_r2: %.4f" % valid_r2[di].result().numpy(), end="") early_stop_stat = valid_r[di].result().numpy() + # upload to gcs + if self.gcs: + upload_folder_gcs(self.log_dir, self.gcs_log_dir) + # checkpoint managers[di].save() model.save( @@ -697,6 +711,10 @@ def eval_step_distr(xd, yd): end="", ) + # upload to gcs + if self.gcs: + upload_folder_gcs(self.log_dir, self.gcs_log_dir) + # checkpoint manager.save() seqnn_model.save("%s/model_check.h5" % self.out_dir) From 7116151cac7acc2010c62d062a67a40725c73b56 Mon Sep 17 00:00:00 2001 From: lruizcalico Date: Wed, 19 Jun 2024 21:21:09 -0700 Subject: [PATCH 2/4] fix folder name --- src/baskerville/trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/baskerville/trainer.py b/src/baskerville/trainer.py index 91f00d9..33ec730 100644 --- a/src/baskerville/trainer.py +++ b/src/baskerville/trainer.py @@ -122,8 +122,8 @@ def __init__( # if log_dir is in gcs then create a local temp dir if is_gcs_path(self.log_dir): - folder_name = self.log_dir.split("/")[-1] - self.log_dir = tempfile.mkdtemp() + folder_name + folder_name = "/".join(self.log_dir.split("/")[3:]) + self.log_dir = tempfile.mkdtemp() + "/" + folder_name self.gcs_log_dir = log_dir self.gcs = True else: From e4ce9af45c047cee3f44c216e36390b38b2c4919 Mon Sep 17 00:00:00 2001 From: lruizcalico Date: Thu, 20 Jun 2024 19:52:31 -0700 Subject: [PATCH 3/4] upload train valid folders --- src/baskerville/trainer.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/baskerville/trainer.py b/src/baskerville/trainer.py index 33ec730..16a89d7 100644 --- a/src/baskerville/trainer.py +++ b/src/baskerville/trainer.py @@ -510,8 +510,8 @@ def eval_step1_distr(xd, yd): # upload to gcs if self.gcs: - upload_folder_gcs(self.log_dir, self.gcs_log_dir) - + upload_folder_gcs(train_log_dir, self.gcs_log_dir + "/train") + upload_folder_gcs(valid_log_dir, self.gcs_log_dir + "/valid") # checkpoint managers[di].save() model.save( @@ -713,7 +713,8 @@ def eval_step_distr(xd, yd): # upload to gcs if self.gcs: - upload_folder_gcs(self.log_dir, self.gcs_log_dir) + upload_folder_gcs(train_log_dir, self.gcs_log_dir + "/train") + upload_folder_gcs(valid_log_dir, self.gcs_log_dir + "/valid") # checkpoint manager.save() From 4acf8a29b05d2461474fb8f091d7349b3f12a274 Mon Sep 17 00:00:00 2001 From: lruizcalico Date: Thu, 20 Jun 2024 21:07:24 -0700 Subject: [PATCH 4/4] fix folder --- src/baskerville/trainer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/baskerville/trainer.py b/src/baskerville/trainer.py index 16a89d7..edeb0df 100644 --- a/src/baskerville/trainer.py +++ b/src/baskerville/trainer.py @@ -510,8 +510,8 @@ def eval_step1_distr(xd, yd): # upload to gcs if self.gcs: - upload_folder_gcs(train_log_dir, self.gcs_log_dir + "/train") - upload_folder_gcs(valid_log_dir, self.gcs_log_dir + "/valid") + upload_folder_gcs(train_log_dir, self.gcs_log_dir) + upload_folder_gcs(valid_log_dir, self.gcs_log_dir) # checkpoint managers[di].save() model.save( @@ -713,8 +713,8 @@ def eval_step_distr(xd, yd): # upload to gcs if self.gcs: - upload_folder_gcs(train_log_dir, self.gcs_log_dir + "/train") - upload_folder_gcs(valid_log_dir, self.gcs_log_dir + "/valid") + upload_folder_gcs(train_log_dir, self.gcs_log_dir) + upload_folder_gcs(valid_log_dir, self.gcs_log_dir) # checkpoint manager.save()