Skip to content

Commit

Permalink
Restructure library into record and batch
Browse files Browse the repository at this point in the history
  • Loading branch information
gutzbenj committed Dec 22, 2023
1 parent b18ab70 commit 72b7181
Show file tree
Hide file tree
Showing 19 changed files with 416 additions and 537 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
src/isd/_version.py
isd/_version.py

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
64 changes: 39 additions & 25 deletions examples/check_timesteps.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,44 @@
"""Given a directory of ISD files, checks that all timesteps monotonically increase."""

import os
import logging
from pathlib import Path
import sys
from typing import Union

import tqdm

import isd.io

# Scan every file in the directory given as argv[1] and report any whose
# `timestamp` column is not monotonically increasing; exit 1 if any are bad.
directory = sys.argv[1]
paths = [os.path.join(directory, file_name) for file_name in os.listdir(directory)]
all_monotonic = True
bad_paths = []
for path in tqdm.tqdm(paths):
    data_frame = isd.io.read_to_data_frame(path)
    # Named ts_min/ts_max so we don't shadow the builtins `min` and `max`.
    ts_min = data_frame.timestamp.min()
    ts_max = data_frame.timestamp.max()
    # `Series.is_monotonic` was deprecated in pandas 1.5 and removed in 2.0;
    # `is_monotonic_increasing` is the supported equivalent.
    is_monotonic = data_frame.timestamp.is_monotonic_increasing
    if not is_monotonic:
        all_monotonic = False
        bad_paths.append(path)
    # tqdm.write keeps the message from clobbering the progress bar.
    tqdm.tqdm.write(f"{path}: min={ts_min}, max={ts_max}, is_monotonic={is_monotonic}")

if all_monotonic:
    print("All files have monotonically increasing timestamps!")
else:
    print("Not all files have monotonically increasing timestamps, here they are:")
    for path in bad_paths:
        print(f" - {path}")
    sys.exit(1)
from isd import Batch

logging.basicConfig(level=logging.INFO)

log = logging.getLogger(__name__)


def main(path: Union[str, Path]) -> None:
    """Check every ISD file under *path* for monotonically increasing timestamps.

    Logs min/max/monotonicity per file, prints a summary, and exits with
    status 1 if any file has out-of-order timestamps.
    """
    # Bind to a new name instead of reassigning the parameter.
    directory = Path(path)
    # glob("*") also matches subdirectories; only regular files can be ISD files.
    file_names = [entry for entry in directory.glob("*") if entry.is_file()]
    all_monotonic = True
    bad_files = []
    for file_name in tqdm.tqdm(file_names):
        df = Batch.from_path(file_name).to_df()
        ts_min = df.datetime.min()
        ts_max = df.datetime.max()
        is_monotonic = df.datetime.is_monotonic_increasing
        if not is_monotonic:
            all_monotonic = False
            bad_files.append(file_name)
        # Lazy %-args: the message is only formatted if the record is emitted.
        log.info(
            "%s: min=%s, max=%s, is_monotonic=%s",
            file_name,
            ts_min,
            ts_max,
            is_monotonic,
        )

    if all_monotonic:
        print("All files have monotonically increasing timestamps!")
    else:
        print("Not all files have monotonically increasing timestamps, here they are:")
        for file_name in bad_files:
            print(f" - {file_name}")
        sys.exit(1)


if __name__ == "__main__":
    directory = sys.argv[1]
    main(directory)
5 changes: 5 additions & 0 deletions isd/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from isd.errors import IsdError
from isd.record import Record
from isd.batch import Batch

__all__ = ["IsdError", "Record", "Batch"]
66 changes: 66 additions & 0 deletions isd/batch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import gzip
from io import BytesIO
from pathlib import Path
from dataclasses import dataclass
from typing import List, Union, Optional
import datetime as dt

from isd.record import Record

import pandas as pd


@dataclass
class Batch:
    """An in-memory collection of ISD records."""

    # The parsed records, in file order.
    records: List[Record]

    @classmethod
    def from_path(cls, path: Union[str, Path]) -> "Batch":
        """Read a local ISD file into a Batch.

        If the path has a .gz extension, the file is assumed to be
        gzip-compressed and is opened with `gzip.open`.
        """
        path = Path(path)
        if path.suffix == ".gz":
            with gzip.open(path) as gzip_file:
                return cls(
                    [
                        Record.from_string(gzip_line.decode("utf-8"))
                        for gzip_line in gzip_file
                    ]
                )
        else:
            # Explicit encoding so parsing doesn't depend on the locale;
            # matches the utf-8 decode used on the gzip branch.
            with open(path, encoding="utf-8") as uncompressed_file:
                return cls(
                    [
                        Record.from_string(uncompressed_line)
                        for uncompressed_line in uncompressed_file
                    ]
                )

    @classmethod
    def from_string(cls, string: Union[str, BytesIO]) -> "Batch":
        """Parse a Batch from a string (or a BytesIO of utf-8 bytes), one record per line."""
        if isinstance(string, BytesIO):
            string = string.read().decode("utf-8")
        return cls([Record.from_string(line) for line in string.splitlines()])

    def filter_by_datetime(
        self,
        start_date: Optional[dt.datetime] = None,
        end_date: Optional[dt.datetime] = None,
    ) -> List[Record]:
        """Return the records with start_date <= record.datetime() < end_date.

        Either bound may be omitted, in which case that side is unbounded.
        """
        return [
            record
            for record in self.records
            if (start_date is None or record.datetime() >= start_date)
            and (end_date is None or record.datetime() < end_date)
        ]

    def to_df(self) -> pd.DataFrame:
        """Convert the records into a pandas DataFrame, one row per record."""
        # pandas is already imported at module level; no local import needed.
        return pd.DataFrame([record.to_dict() for record in self.records])
18 changes: 7 additions & 11 deletions src/isd/cli.py → isd/cli.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,9 @@
# type: ignore

import dataclasses
import itertools
import json

import click
from click import ClickException

import isd.io
from isd.batch import Batch


@click.group()
Expand All @@ -20,9 +16,9 @@ def main() -> None:
@click.option("-i", "--index", default=0)
def record(infile: str, index: int) -> None:
"""Prints a single record to standard output in JSON format."""
with isd.io.open(infile) as records:
record = next(itertools.islice(records, index, None), None)
if record:
print(json.dumps(dataclasses.asdict(record), indent=4))
else:
raise ClickException(f"No record with index {index}")
batch = Batch.from_path(infile)
try:
record_ = batch.records[index]
print(record_.to_json())
except IndexError:
raise ClickException(f"No record with index {index}")
File renamed without changes.
Loading

0 comments on commit 72b7181

Please sign in to comment.