Skip to content

Commit

Permalink
Restructure library into record and batch
Browse files Browse the repository at this point in the history
  • Loading branch information
gutzbenj committed Dec 22, 2023
1 parent b18ab70 commit 72b7181
Show file tree
Hide file tree
Showing 19 changed files with 416 additions and 537 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
src/isd/_version.py
isd/_version.py

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
64 changes: 39 additions & 25 deletions examples/check_timesteps.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,44 @@
"""Given a directory of ISD files, checks that all timesteps monotonically increase."""

import os
import logging
from pathlib import Path
import sys
from typing import Union

import tqdm

import isd.io

# Scan every file in the directory given as argv[1] and report any whose
# `timestamp` column is not monotonically increasing; exit 1 if any are bad.
directory = sys.argv[1]
paths = [os.path.join(directory, file_name) for file_name in os.listdir(directory)]
all_monotonic = True
bad_paths = []
for path in tqdm.tqdm(paths):
    data_frame = isd.io.read_to_data_frame(path)
    # Named ts_min/ts_max so we don't shadow the builtins `min` and `max`.
    ts_min = data_frame.timestamp.min()
    ts_max = data_frame.timestamp.max()
    # `Series.is_monotonic` was deprecated in pandas 1.5 and removed in 2.0;
    # `is_monotonic_increasing` is the supported equivalent.
    is_monotonic = data_frame.timestamp.is_monotonic_increasing
    if not is_monotonic:
        all_monotonic = False
        bad_paths.append(path)
    # tqdm.write keeps the message from clobbering the progress bar.
    tqdm.tqdm.write(f"{path}: min={ts_min}, max={ts_max}, is_monotonic={is_monotonic}")

if all_monotonic:
    print("All files have monotonically increasing timestamps!")
else:
    print("Not all files have monotonically increasing timestamps, here they are:")
    for path in bad_paths:
        print(f" - {path}")
    sys.exit(1)
from isd import Batch

logging.basicConfig(level=logging.INFO)

log = logging.getLogger(__name__)


def main(path: Union[str, Path]) -> None:
    """Check every ISD file under *path* for monotonically increasing timestamps.

    Logs min/max/monotonicity per file, prints a summary, and exits with
    status 1 if any file has out-of-order timestamps.
    """
    # Bind to a new name instead of reassigning the parameter.
    directory = Path(path)
    # glob("*") also matches subdirectories; only regular files can be ISD files.
    file_names = [entry for entry in directory.glob("*") if entry.is_file()]
    all_monotonic = True
    bad_files = []
    for file_name in tqdm.tqdm(file_names):
        df = Batch.from_path(file_name).to_df()
        ts_min = df.datetime.min()
        ts_max = df.datetime.max()
        is_monotonic = df.datetime.is_monotonic_increasing
        if not is_monotonic:
            all_monotonic = False
            bad_files.append(file_name)
        # Lazy %-args: the message is only formatted if the record is emitted.
        log.info(
            "%s: min=%s, max=%s, is_monotonic=%s",
            file_name,
            ts_min,
            ts_max,
            is_monotonic,
        )

    if all_monotonic:
        print("All files have monotonically increasing timestamps!")
    else:
        print("Not all files have monotonically increasing timestamps, here they are:")
        for file_name in bad_files:
            print(f" - {file_name}")
        sys.exit(1)


if __name__ == "__main__":
    directory = sys.argv[1]
    main(directory)
5 changes: 5 additions & 0 deletions isd/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from isd.errors import IsdError
from isd.record import Record
from isd.batch import Batch

__all__ = ["IsdError", "Record", "Batch"]
66 changes: 66 additions & 0 deletions isd/batch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import gzip
from io import BytesIO
from pathlib import Path
from dataclasses import dataclass
from typing import List, Union, Optional
import datetime as dt

from isd.record import Record

import pandas as pd


@dataclass
class Batch:
    """An in-memory collection of ISD records."""

    # The parsed records, in file order.
    records: List[Record]

    @classmethod
    def from_path(cls, path: Union[str, Path]) -> "Batch":
        """Read a local ISD file into a Batch.

        If the path has a .gz extension, the file is assumed to be
        gzip-compressed and is opened with `gzip.open`.
        """
        path = Path(path)
        if path.suffix == ".gz":
            with gzip.open(path) as gzip_file:
                return cls(
                    [
                        Record.from_string(gzip_line.decode("utf-8"))
                        for gzip_line in gzip_file
                    ]
                )
        else:
            # Explicit encoding so parsing doesn't depend on the locale;
            # matches the utf-8 decode used on the gzip branch.
            with open(path, encoding="utf-8") as uncompressed_file:
                return cls(
                    [
                        Record.from_string(uncompressed_line)
                        for uncompressed_line in uncompressed_file
                    ]
                )

    @classmethod
    def from_string(cls, string: Union[str, BytesIO]) -> "Batch":
        """Parse a Batch from a string (or a BytesIO of utf-8 bytes), one record per line."""
        if isinstance(string, BytesIO):
            string = string.read().decode("utf-8")
        return cls([Record.from_string(line) for line in string.splitlines()])

    def filter_by_datetime(
        self,
        start_date: Optional[dt.datetime] = None,
        end_date: Optional[dt.datetime] = None,
    ) -> List[Record]:
        """Return the records with start_date <= record.datetime() < end_date.

        Either bound may be omitted, in which case that side is unbounded.
        """
        return [
            record
            for record in self.records
            if (start_date is None or record.datetime() >= start_date)
            and (end_date is None or record.datetime() < end_date)
        ]

    def to_df(self) -> pd.DataFrame:
        """Convert the records into a pandas DataFrame, one row per record."""
        # pandas is already imported at module level; no local import needed.
        return pd.DataFrame([record.to_dict() for record in self.records])
18 changes: 7 additions & 11 deletions src/isd/cli.py → isd/cli.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,9 @@
# type: ignore

import dataclasses
import itertools
import json

import click
from click import ClickException

import isd.io
from isd.batch import Batch


@click.group()
Expand All @@ -20,9 +16,9 @@ def main() -> None:
@click.option("-i", "--index", default=0)
def record(infile: str, index: int) -> None:
"""Prints a single record to standard output in JSON format."""
with isd.io.open(infile) as records:
record = next(itertools.islice(records, index, None), None)
if record:
print(json.dumps(dataclasses.asdict(record), indent=4))
else:
raise ClickException(f"No record with index {index}")
batch = Batch.from_path(infile)
try:
record_ = batch.records[index]
print(record_.to_json())
except IndexError:
raise ClickException(f"No record with index {index}")
File renamed without changes.
Loading

0 comments on commit 72b7181

Please sign in to comment.