Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cleaned and extended function that extracts datetimes from paths #2181

Merged
merged 6 commits into from
Oct 5, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion esmvalcore/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -862,7 +862,7 @@ def _update_timerange(self):
dataset.facets.pop('timerange')
dataset.supplementaries = []
check.data_availability(dataset)
intervals = [_get_start_end_date(f.name) for f in dataset.files]
intervals = [_get_start_end_date(f) for f in dataset.files]

min_date = min(interval[0] for interval in intervals)
max_date = max(interval[1] for interval in intervals)
Expand Down
2 changes: 1 addition & 1 deletion esmvalcore/esgf/_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ def select_by_time(files, timerange):
for file in files:
start_date, end_date = _parse_period(timerange)
try:
start, end = _get_start_end_date(file.name)
start, end = _get_start_end_date(file)
except ValueError:
# If start and end year cannot be read from the filename
# just select everything.
Expand Down
162 changes: 74 additions & 88 deletions esmvalcore/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import re
from glob import glob
from pathlib import Path
from typing import Any, Union
from typing import TYPE_CHECKING, Any, Union

import iris
import isodate
Expand All @@ -17,17 +17,19 @@
from .exceptions import RecipeError
from .typing import Facets, FacetValue

if TYPE_CHECKING:
from .esgf import ESGFFile

logger = logging.getLogger(__name__)


def _get_from_pattern(pattern, date_range_pattern, stem, group):
"""Get time, date or datetime from date range patterns in file names."""
#
# The next string lets us check that an allowed delimiter (or the string
# start or end) is directly adjacent to the date range (or to a single date)
start_point = end_point = None
context = r"(?:^|[-_]|$)"
#

# First check for a block of two potential dates
date_range_pattern_with_context = context + date_range_pattern + context
daterange = re.search(date_range_pattern_with_context, stem)
Expand All @@ -37,6 +39,7 @@ def _get_from_pattern(pattern, date_range_pattern, stem, group):
date_range_pattern_with_context = (context + date_range_pattern +
context)
daterange = re.search(date_range_pattern_with_context, stem)

if daterange:
start_point = daterange.group(group)
end_group = '_'.join([group, 'end'])
Expand All @@ -59,41 +62,72 @@ def _get_from_pattern(pattern, date_range_pattern, stem, group):
return start_point, end_point


def _get_start_end_date(filename):
def _get_start_end_date(
file: str | Path | LocalFile | ESGFFile
) -> tuple[str, str]:
"""Get the start and end dates as a string from a file name.

Examples of allowed dates : 1980, 198001, 19801231,
1980123123, 19801231T23, 19801231T2359, 19801231T235959,
19801231T235959Z (ISO 8601).
Examples of allowed dates: 1980, 198001, 1980-01, 19801231, 1980-12-31,
1980123123, 19801231T23, 19801231T2359, 19801231T235959, 19801231T235959Z
(ISO 8601).

Dates must be surrounded by '-', '_' or '.' (the latter is used by CMIP3
data), or string start or string end (after removing filename suffix).

Look first for two dates separated by '-', '_' or '_cat_' (the latter is
used by CMIP3 data), then for one single date, and if there are multiple,
for one date at start or end.

Parameters
----------
file:
The file to read the start and end dates from.

Returns
-------
tuple[str, str]
The start and end date.

Dates must be surrounded by - or _ or string start or string end
(after removing filename suffix).
Raises
------
ValueError
Start or end date cannot be determined.

Look first for two dates separated by - or _, then for one single
date, and if they are multiple, for one date at start or end.
"""
stem = Path(filename).stem
if hasattr(file, 'name'): # Path, LocalFile, ESGFFile
stem = Path(file.name).stem
else: # str
stem = Path(file).stem

start_date = end_date = None
#

# Build regex
time_pattern = (r"(?P<hour>[0-2][0-9]"
r"(?P<minute>[0-5][0-9]"
r"(?P<second>[0-5][0-9])?)?Z?)")
date_pattern = (r"(?P<year>[0-9]{4})"
r"(?P<month>[01][0-9]"
r"(?P<day>[0-3][0-9]"
r"(?P<month>-?[01][0-9]"
r"(?P<day>-?[0-3][0-9]"
rf"(T?{time_pattern})?)?)?")
datetime_pattern = (rf"(?P<datetime>{date_pattern})")
#
end_datetime_pattern = datetime_pattern.replace(">", "_end>")
date_range_pattern = datetime_pattern + r"[-_]" + end_datetime_pattern

# Dates can either be delimited by '-', '_', or '_cat_' (the latter for
# CMIP3)
date_range_pattern = (
datetime_pattern + r"[-_](?:cat_)?" + end_datetime_pattern
)

# Find dates using the regex
start_date, end_date = _get_from_pattern(datetime_pattern,
date_range_pattern, stem,
'datetime')

# As a final resort, try to get the dates from the file contents
if (start_date is None or end_date is None) and Path(filename).exists():
logger.debug("Must load file %s for daterange ", filename)
cubes = iris.load(filename)
if ((start_date is None or end_date is None) and
isinstance(file, (str, Path)) and Path(file).exists()):
logger.debug("Must load file %s for daterange ", file)
cubes = iris.load(file)

for cube in cubes:
logger.debug(cube)
Expand All @@ -109,12 +143,30 @@ def _get_start_end_date(filename):
break

if start_date is None or end_date is None:
raise ValueError(f'File {filename} dates do not match a recognized '
'pattern and time can not be read from the file')
raise ValueError(
f"File {file} datetimes do not match a recognized pattern and "
f"time coordinate can not be read from the file"
)

# Remove potential '-' characters from datetimes
start_date = start_date.replace('-', '')
end_date = end_date.replace('-', '')
bouweandela marked this conversation as resolved.
Show resolved Hide resolved

return start_date, end_date


def _get_start_end_year(
file: str | Path | LocalFile | ESGFFile
) -> tuple[int, int]:
"""Get the start and end year as int from a file name.

See :func:`_get_start_end_date`.

"""
(start_date, end_date) = _get_start_end_date(file)
return (int(start_date[:4]), int(end_date[:4]))


def _dates_to_timerange(start_date, end_date):
"""Convert ``start_date`` and ``end_date`` to ``timerange``.

Expand Down Expand Up @@ -162,72 +214,6 @@ def _replace_years_with_timerange(variable):
variable.pop('end_year', None)


def _get_start_end_year(file):
"""Get the start and end year from a file name.

Examples of allowed dates : 1980, 198001, 19801231,
1980123123, 19801231T23, 19801231T2359, 19801231T235959,
19801231T235959Z (ISO 8601).

Dates must be surrounded by - or _ or string start or string end
(after removing filename suffix).

Look first for two dates separated by - or _, then for one single
date, and if they are multiple, for one date at start or end.

Parameters
----------
file: LocalFile or esmvalcore.esgf.ESGFFile
The file to read the start and end year from.

Returns
-------
tuple[int, int]
The start and end year.

Raises
------
ValueError
When start or end year cannot be determined.

"""
start_year = end_year = None

time_pattern = (r"(?P<hour>[0-2][0-9]"
r"(?P<minute>[0-5][0-9]"
r"(?P<second>[0-5][0-9])?)?Z?)")
date_pattern = (r"(?P<year>[0-9]{4})"
r"(?P<month>[01][0-9]"
r"(?P<day>[0-3][0-9]"
rf"(T?{time_pattern})?)?)?")

end_date_pattern = date_pattern.replace(">", "_end>")
date_range_pattern = date_pattern + r"[-_]" + end_date_pattern
start_year, end_year = _get_from_pattern(date_pattern, date_range_pattern,
Path(file.name).stem, 'year')
# As final resort, try to get the dates from the file contents
if ((start_year is None or end_year is None) and isinstance(file, Path)
and file.exists()):
logger.debug("Must load file %s for daterange ", file)
cubes = iris.load(file)

for cube in cubes:
logger.debug(cube)
try:
time = cube.coord('time')
except iris.exceptions.CoordinateNotFoundError:
continue
start_year = time.cell(0).point.year
end_year = time.cell(-1).point.year
break

if start_year is None or end_year is None:
raise ValueError(f'File {file} dates do not match a recognized '
'pattern and time can not be read from the file')

return int(start_year), int(end_year)


def _parse_period(timerange):
"""Parse `timerange` values given as duration periods.

Expand Down
Loading