NREL · nmerket · Mar 21, 2024 · Mar 21, 2024 · Mar 21, 2024 · Mar 21, 2024
diff --git a/buildstockbatch/hpc.py b/buildstockbatch/hpc.py
@@ -586,13 +586,13 @@ def queue_post_processing(self, after_jobids=[], upload_only=False, hipri=False)
         print(f"Submitting job to {n_workers} {memory}MB memory nodes using {n_procs} cores in each.")
         # Throw an error if the files already exist.
 
-        if not upload_only:
-            for subdir in ("parquet", "results_csvs"):
-                subdirpath = pathlib.Path(self.output_dir, "results", subdir)
-                if subdirpath.exists():
-                    raise FileExistsError(
-                        f"{subdirpath} already exists. This means you may have run postprocessing already. If you are sure you want to rerun, delete that directory and try again."
-                    )  # noqa E501
+        # if not upload_only:
+        #     for subdir in ("parquet", "results_csvs"):
+        #         subdirpath = pathlib.Path(self.output_dir, "results", subdir)
+        #         if subdirpath.exists():
+        #             raise FileExistsError(
+        #                 f"{subdirpath} already exists. This means you may have run postprocessing already. If you are sure you want to rerun, delete that directory and try again."
+        #             )  # noqa E501
 
         # Move old output logs and config to make way for new ones
         for filename in (

diff --git a/buildstockbatch/postprocessing.py b/buildstockbatch/postprocessing.py
@@ -363,9 +363,14 @@ def get_upgrade_list(cfg):
 
 
 def write_metadata_files(fs, parquet_root_dir, partition_columns):
+    common_metadata_filename = f"{parquet_root_dir}/_common_metadata"
+    metadata_filename = f"{parquet_root_dir}/_metadata"
+    for filename in [common_metadata_filename, metadata_filename]:
+        if fs.exists(filename):
+            fs.rm(filename)
     df = dd.read_parquet(parquet_root_dir)
     sch = pa.Schema.from_pandas(df._meta_nonempty)
-    parquet.write_metadata(sch, f"{parquet_root_dir}/_common_metadata")
+    parquet.write_metadata(sch, common_metadata_filename)
     logger.info(f"Written _common_metadata to {parquet_root_dir}")
 
     if partition_columns:
@@ -405,7 +410,7 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True):
 
     # create the postprocessing results directories
     for dr in dirs:
-        fs.makedirs(dr)
+        fs.makedirs(dr, exist_ok=True)
 
     # Results "CSV"
     results_json_files = fs.glob(f"{sim_output_dir}/results_job*.json.gz")
@@ -428,35 +433,51 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True):
     ]
     results_df = dd.from_delayed(delayed_results_dfs, verify_meta=False)
 
+    checkpoint_filename = f"{results_dir}/checkpoints.json"
+    if fs.exists(checkpoint_filename):
+        with fs.open(checkpoint_filename, "r") as f:
+            checkpoint = json.load(f)
+    else:
+        checkpoint = {"upgrades_processed": []}
+
     if do_timeseries:
         # Look at all the parquet files to see what columns are in all of them.
-        logger.info("Collecting all the columns in timeseries parquet files.")
-        do_timeseries = False
-        all_ts_cols = set()
-        for upgrade_folder in fs.glob(f"{ts_in_dir}/up*"):
-            ts_filenames = fs.ls(upgrade_folder)
-            if ts_filenames:
-                do_timeseries = True
-                logger.info(f"Found {len(ts_filenames)} files for upgrade {Path(upgrade_folder).name}.")
-                files_bag = db.from_sequence(ts_filenames, partition_size=100)
-                all_ts_cols |= files_bag.map(partial(get_cols, fs)).fold(lambda x, y: x.union(y)).compute()
-                logger.info("Collected all the columns")
-            else:
-                logger.info(f"There are no timeseries files for upgrade {Path(upgrade_folder).name}.")
-
-        # Sort the columns
-        all_ts_cols_sorted = ["building_id"] + sorted(x for x in all_ts_cols if x.startswith("time"))
-        all_ts_cols.difference_update(all_ts_cols_sorted)
-        all_ts_cols_sorted.extend(sorted(x for x in all_ts_cols if not x.endswith("]")))
-        all_ts_cols.difference_update(all_ts_cols_sorted)
-        all_ts_cols_sorted.extend(sorted(all_ts_cols))
+        if checkpoint.get("all_ts_cols") is not None:
+            all_ts_cols_sorted = checkpoint["all_ts_cols"]
+        else:
+            logger.info("Collecting all the columns in timeseries parquet files.")
+            do_timeseries = False
+            all_ts_cols = set()
+            for upgrade_folder in fs.glob(f"{ts_in_dir}/up*"):
+                ts_filenames = fs.ls(upgrade_folder)
+                if ts_filenames:
+                    do_timeseries = True
+                    logger.info(f"Found {len(ts_filenames)} files for upgrade {Path(upgrade_folder).name}.")
+                    files_bag = db.from_sequence(ts_filenames, partition_size=100)
+                    all_ts_cols |= files_bag.map(partial(get_cols, fs)).fold(lambda x, y: x.union(y)).compute()
+                    logger.info("Collected all the columns")
+                else:
+                    logger.info(f"There are no timeseries files for upgrade {Path(upgrade_folder).name}.")
+
+            # Sort the columns
+            all_ts_cols_sorted = ["building_id"] + sorted(x for x in all_ts_cols if x.startswith("time"))
+            all_ts_cols.difference_update(all_ts_cols_sorted)
+            all_ts_cols_sorted.extend(sorted(x for x in all_ts_cols if not x.endswith("]")))
+            all_ts_cols.difference_update(all_ts_cols_sorted)
+            all_ts_cols_sorted.extend(sorted(all_ts_cols))
+            checkpoint["all_ts_cols"] = all_ts_cols_sorted
+            with fs.open(checkpoint_filename, "w") as f:
+                json.dump(checkpoint, f)
         logger.info(f"Got {len(all_ts_cols_sorted)} columns in total")
         logger.info(f"The columns are: {all_ts_cols_sorted}")
     else:
         logger.warning("There are no timeseries files for any upgrades.")
 
     results_df_groups = results_df.groupby("upgrade")
     upgrade_list = get_upgrade_list(cfg)
+    for upgrade_id in checkpoint["upgrades_processed"]:
+        logger.info(f"Upgrade {upgrade_id} has already been processed, skipping.")
+    upgrade_list = sorted(set(upgrade_list).difference(checkpoint["upgrades_processed"]))
     partition_columns = cfg.get("postprocessing", {}).get("partition_columns", [])
     partition_columns = [c.lower() for c in partition_columns]
     df_partition_columns = [f"build_existing_model.{c}" for c in partition_columns]
@@ -509,6 +530,8 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True):
         else:
             results_parquet_dir = f"{parquet_dir}/upgrades/upgrade={upgrade_id}"
 
+        if fs.exists(results_parquet_dir):
+            fs.rm(results_parquet_dir, recursive=True)
         fs.makedirs(results_parquet_dir)
         parquet_filename = f"{results_parquet_dir}/results_up{upgrade_id:02d}.parquet"
         logger.info(f"Writing {parquet_filename}")
@@ -560,6 +583,8 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True):
                 assert isinstance(fs, S3FileSystem)
                 ts_out_loc = f"s3://{ts_dir}/upgrade={upgrade_id}/"
 
+            if fs.exists(ts_out_loc):
+                fs.rm(ts_out_loc, recursive=True)
             fs.makedirs(ts_out_loc)
             logger.info(f"Created directory {ts_out_loc} for writing. Now concatenating ...")
 
@@ -596,7 +621,10 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True):
                     )
 
             logger.info(f"Finished combining and saving timeseries for upgrade{upgrade_id}.")
-    logger.info("All aggregation completed. ")
+        checkpoint["upgrades_processed"].append(upgrade_id)
+        with fs.open(checkpoint_filename, "w") as f:
+            json.dump(checkpoint, f)
+    logger.info("All aggregation completed.")
     if do_timeseries:
         logger.info("Writing timeseries metadata files")
         write_metadata_files(fs, ts_dir, partition_columns)

diff --git a/buildstockbatch/schemas/v0.3.yaml b/buildstockbatch/schemas/v0.3.yaml
@@ -50,7 +50,7 @@ hpc-postprocessing-spec:
   time: int(required=True)
   n_workers: int(min=1, max=32, required=False)
   node_memory_mb: int(min=85248, max=751616, required=False)
-  n_procs: int(min=1, max=36, required=False)
+  n_procs: int(min=1, max=52, required=False)
   parquet_memory_mb: int(min=100, max=4096, required=False)
 
 

diff --git a/docs/installation.rst b/docs/installation.rst
@@ -139,7 +139,9 @@ You can get a list of installed environments by looking in the envs directory
 
    ls /kfs2/shared-projects/buildstock/envs
 
-Developer Installaion
+.. _kestrel_dev_install:
+
+Developer Installation
 ......................
 
 For those doing development work on buildstockbatch (not most users), a new
@@ -194,6 +196,8 @@ You can get a list of installed environments by looking in the envs directory
 
    ls /shared-projects/buildstock/envs
 
+.. _eagle_dev_install:
+
 Developer installation
 ......................