Skip to content

Commit

Permalink
Perf: remove redundant checks on data integrity (deepmodeling#4433)
Browse files Browse the repository at this point in the history
Systems are aggregated here

https://github.com/deepmodeling/deepmd-kit/blob/f343a3b212edab5525502e0261f3068c0b6fb1f6/deepmd/utils/data_system.py#L802
and later initialized here

https://github.com/deepmodeling/deepmd-kit/blob/f343a3b212edab5525502e0261f3068c0b6fb1f6/deepmd/utils/data_system.py#L809-L810

This process will instantiate `DeepmdData` class, and it will perform
data integrity checks

https://github.com/deepmodeling/deepmd-kit/blob/e695a91ca6f7a1c9c830ab1c58b7b7a05db3da23/deepmd/utils/data.py#L80-L82

Besides, the checking process enumerates all items for all ranks, which
is unnecessary and quite slow. So this PR removes this check.

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **New Features**
- Enhanced flexibility in defining test sizes by allowing percentage
input for the `test_size` parameter.
- Introduced a new method to automatically compute test sizes based on
the specified percentage of total data.
- Improved path handling to accept both string and Path inputs,
enhancing usability.

- **Bug Fixes**
- Improved error handling for invalid paths, ensuring users receive
clear feedback when files are not found.

- **Deprecation Notice**
- The `get_test` method is now deprecated, with new logic implemented
for loading test data when necessary.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Signed-off-by: Chun Cai <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Jinzhe Zeng <[email protected]>
Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
(cherry picked from commit 3917cf0)
  • Loading branch information
caic99 authored and njzjz committed Dec 22, 2024
1 parent 5420de7 commit 24232a9
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 26 deletions.
2 changes: 2 additions & 0 deletions deepmd/utils/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ def __init__(
) -> None:
"""Constructor."""
root = DPPath(sys_path)
if not root.is_dir():
raise FileNotFoundError(f"System {sys_path} is not found!")
self.dirs = root.glob(set_prefix + ".*")
if not len(self.dirs):
raise FileNotFoundError(f"No {set_prefix}.* is found in {sys_path}")
Expand Down
22 changes: 2 additions & 20 deletions deepmd/utils/data_system.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,6 @@
from deepmd.utils.out_stat import (
compute_stats_from_redu,
)
from deepmd.utils.path import (
DPPath,
)

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -103,6 +100,8 @@ def __init__(
del rcut
self.system_dirs = systems
self.nsystems = len(self.system_dirs)
if self.nsystems <= 0:
raise ValueError("No systems provided")
self.data_systems = []
for ii in self.system_dirs:
self.data_systems.append(
Expand Down Expand Up @@ -755,23 +754,6 @@ def process_systems(systems: Union[str, list[str]]) -> list[str]:
systems = expand_sys_str(systems)
elif isinstance(systems, list):
systems = systems.copy()
help_msg = "Please check your setting for data systems"
# check length of systems
if len(systems) == 0:
msg = "cannot find valid a data system"
log.fatal(msg)
raise OSError(msg, help_msg)
# roughly check all items in systems are valid
for ii in systems:
ii = DPPath(ii)
if not ii.is_dir():
msg = f"dir {ii} is not a valid dir"
log.fatal(msg)
raise OSError(msg, help_msg)
if not (ii / "type.raw").is_file():
msg = f"dir {ii} is not a valid data system dir"
log.fatal(msg)
raise OSError(msg, help_msg)
return systems


Expand Down
12 changes: 6 additions & 6 deletions deepmd/utils/path.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from typing import (
ClassVar,
Optional,
Union,
)

import h5py
Expand Down Expand Up @@ -157,19 +158,16 @@ class DPOSPath(DPPath):
Parameters
----------
path : str
path : Union[str, Path]
path
mode : str, optional
mode, by default "r"
"""

def __init__(self, path: str, mode: str = "r") -> None:
def __init__(self, path: Union[str, Path], mode: str = "r") -> None:
super().__init__()
self.mode = mode
if isinstance(path, Path):
self.path = path
else:
self.path = Path(path)
self.path = Path(path)

def load_numpy(self) -> np.ndarray:
"""Load NumPy array.
Expand Down Expand Up @@ -300,6 +298,8 @@ def __init__(self, path: str, mode: str = "r") -> None:
# so we do not support file names containing #...
s = path.split("#")
self.root_path = s[0]
if not os.path.isfile(self.root_path):
raise FileNotFoundError(f"{self.root_path} not found")
self.root = self._load_h5py(s[0], mode)
# h5 path: default is the root path
self._name = s[1] if len(s) > 1 else "/"
Expand Down

0 comments on commit 24232a9

Please sign in to comment.