diff --git a/docs/api/cuda_histogram.axis.rst b/docs/api/cuda_histogram.axis.rst new file mode 100644 index 0000000..1dab9f9 --- /dev/null +++ b/docs/api/cuda_histogram.axis.rst @@ -0,0 +1,7 @@ +cuda_histogram.axis package +=========================== + +.. automodule:: cuda_histogram.axis + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/cuda_histogram.hist.rst b/docs/api/cuda_histogram.hist.rst new file mode 100644 index 0000000..6705b43 --- /dev/null +++ b/docs/api/cuda_histogram.hist.rst @@ -0,0 +1,7 @@ +cuda_histogram.hist module +========================== + +.. automodule:: cuda_histogram.hist + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/cuda_histogram.hist_tools.rst b/docs/api/cuda_histogram.hist_tools.rst deleted file mode 100644 index 1ba2d3c..0000000 --- a/docs/api/cuda_histogram.hist_tools.rst +++ /dev/null @@ -1,8 +0,0 @@ -cuda_histogram.hist_tools module -================================ - -.. automodule:: cuda_histogram.hist_tools - :members: - :undoc-members: - :show-inheritance: - :private-members: diff --git a/docs/api/cuda_histogram.rst b/docs/api/cuda_histogram.rst index dd7d1ad..0707de1 100644 --- a/docs/api/cuda_histogram.rst +++ b/docs/api/cuda_histogram.rst @@ -1,10 +1,18 @@ cuda_histogram package ====================== +Subpackages +----------- + +.. toctree:: + :maxdepth: 3 + + cuda_histogram.axis + Submodules ---------- .. toctree:: :maxdepth: 3 - cuda_histogram.hist_tools + cuda_histogram.hist diff --git a/src/cuda_histogram/__init__.py b/src/cuda_histogram/__init__.py index 768e57f..135c5f4 100644 --- a/src/cuda_histogram/__init__.py +++ b/src/cuda_histogram/__init__.py @@ -8,15 +8,13 @@ from __future__ import annotations -from cuda_histogram.hist_tools import Bin, Cat, Hist, Interval, StringBin +from cuda_histogram import axis +from cuda_histogram.hist import Hist from ._version import version as __version__ __all__: list[str] = [ "Hist", - "Bin", - "Interval", - "Cat", - "StringBin", + "axis", "__version__", ] diff --git a/src/cuda_histogram/axis/__init__.py b/src/cuda_histogram/axis/__init__.py new file mode 100644 index 0000000..0613cc2 --- /dev/null +++ b/src/cuda_histogram/axis/__init__.py @@ -0,0 +1,719 @@ +from __future__ import annotations + +import functools +import numbers +import re +import warnings + +import awkward +import cupy +import numpy as np + +__all__: list[str] = [ + "Bin", + "Regular", + "Variable", + "Cat", + "Interval", + "StringBin", +] + +_replace_nans = cupy.ElementwiseKernel("T v", "T x", "x = isnan(x)?v:x", "replace_nans") + +_clip_bins = cupy.ElementwiseKernel( + "T Nbins, T lo, T hi, T id", + "T idx", + """ + const T floored = floor((id - lo) * float(Nbins) / (hi - lo)) + 1; + idx = floored < 0 ? 0 : floored; + idx = floored > Nbins ? Nbins + 1 : floored; + """, + "clip_bins", +) + + +def _overflow_behavior(overflow): + if overflow == "none": + return slice(1, -2) + elif overflow == "under": + return slice(None, -2) + elif overflow == "over": + return slice(1, -1) + elif overflow == "all": + return slice(None, -1) + elif overflow == "allnan": + return slice(None) + elif overflow == "justnan": + return slice(-1, None) + else: + raise ValueError(f"Unrecognized overflow behavior: {overflow}") + + +@functools.total_ordering +class Interval: + """Real number interval + + Totally ordered, assuming no overlap in intervals. + A special nan interval can be constructed, which is defined + as greater than ``[*, inf)`` + + Parameters + ---------- + lo : float + Bin lower bound, inclusive + hi : float + Bin upper bound, exclusive + """ + + def __init__(self, lo, hi, label=None): + self._lo = float(lo) + self._hi = float(hi) + self._label = label + + def __repr__(self): + return f"<{self.__class__.__name__} ({self!s}) instance at 0x{id(self):0x}>" + + def __str__(self): + if self._label is not None: + return self._label + if self.nan(): + return "(nanflow)" + return "{}{}, {})".format( + "(" if self._lo == -np.inf else "[", + self._lo, + self._hi, + ) + + def __hash__(self): + return hash((self._lo, self._hi)) + + def __lt__(self, other): + if other.nan() and not self.nan(): + return True + elif self.nan(): + return False + elif self._lo < other._lo: + if self._hi > other._lo: + raise ValueError( + f"Intervals {self!r} and {other!r} intersect! What are you doing?!" + ) + return True + return False + + def __eq__(self, other): + if not isinstance(other, Interval): + return False + if other.nan() and self.nan(): + return True + if self._lo == other._lo and self._hi == other._hi: # noqa: SIM103 + return True + return False + + def nan(self): + return np.isnan(self._hi) + + @property + def lo(self): + """Lower boundary of this bin, inclusive""" + return self._lo + + @property + def hi(self): + """Upper boundary of this bin, exclusive""" + return self._hi + + @property + def mid(self): + """Midpoint of this bin""" + return (self._hi + self._lo) / 2 + + @property + def label(self): + """Label of this bin, mutable""" + return self._label + + @label.setter + def label(self, lbl): + self._label = lbl + + +@functools.total_ordering +class StringBin: + """A string used to fill a sparse axis + + Totally ordered, lexicographically by name. + + Parameters + ---------- + name : str + Name of the bin, as used in `Hist.fill` calls + label : str + The `str` representation of this bin can be overridden by + a custom label. + """ + + def __init__(self, name, label=None): + if not isinstance(name, str): + raise TypeError( + f"StringBin only supports string categories, received a {name!r}" + ) + elif "*" in name: + raise ValueError( + "StringBin does not support character '*' as it conflicts with wildcard mapping." + ) + self._name = name + self._label = label + + def __repr__(self): + return f"<{self.__class__.__name__} ({self.name}) instance at 0x{id(self):0x}>" + + def __str__(self): + if self._label is not None: + return self._label + return self._name + + def __hash__(self): + return hash(self._name) + + def __lt__(self, other): + return self._name < other._name + + def __eq__(self, other): + if isinstance(other, StringBin): + return self._name == other._name + return False + + @property + def name(self): + """Name of this bin, *Immutable*""" + return self._name + + @property + def label(self): + """Label of this bin, mutable""" + return self._label + + @label.setter + def label(self, lbl): + self._label = lbl + + +class Axis: + """ + Axis: Base class for any type of axis + Derived classes should implement, at least, an equality override + """ + + def __init__(self, name, label): + if name == "weight": + raise ValueError( + "Cannot create axis: 'weight' is a reserved keyword for Hist.fill()" + ) + self._name = name + self._label = label + + def __repr__(self): + return f"<{self.__class__.__name__} (name={self._name}) instance at 0x{id(self):0x}>" + + @property + def name(self): + return self._name + + @property + def label(self): + return self._label + + @label.setter + def label(self, label): + self._label = label + + def __eq__(self, other): + if isinstance(other, Axis): + if self._name != other._name: # noqa: SIM103 + return False + # label doesn't matter + return True + elif isinstance(other, str): + # Convenient for testing axis in list by name + return not self._name != other + raise TypeError(f"Cannot compare an Axis with a {other!r}") + + +class SparseAxis(Axis): + """ + SparseAxis: ABC for a sparse axis + + Derived should implement: + **index(identifier)** - return a hashable object for indexing + + **__eq__(axis)** - axis has same definition (not necessarily same bins) + + **__getitem__(index)** - return an identifier + + **_ireduce(slice)** - return a list of hashes, slice is arbitrary + + What we really want here is a hashlist with some slice sugar on top + It is usually the case that the identifier is already hashable, + in which case index and __getitem__ are trivial, but this mechanism + may be useful if the size of the tuple of identifiers in a + sparse-binned histogram becomes too large + """ + + +class Cat(SparseAxis): + """A category axis with name and label + + Parameters + ---------- + name : str + is used as a keyword in histogram filling, immutable + label : str + describes the meaning of the axis, can be changed + sorting : {'identifier', 'placement', 'integral'}, optional + Axis sorting when listing identifiers. + + The number of categories is arbitrary, and can be filled sparsely + Identifiers are strings + """ + + def __init__(self, name, label, sorting="identifier"): + super().__init__(name, label) + # In all cases key == value.name + self._bins = {} + self._sorting = sorting + self._sorted = [] + + def index(self, identifier): + """Index of a identifier or label + + Parameters + ---------- + identifier : str or StringBin + The identifier to lookup + + Returns a `StringBin` corresponding to the given argument (trivial in the case + where a `StringBin` was passed) and saves a reference internally in the case where + the identifier was not seen before by this axis. + """ + if isinstance(identifier, StringBin): + index = identifier + else: + index = StringBin(identifier) + if index.name not in self._bins: + self._bins[index.name] = index + self._sorted.append(index.name) + if self._sorting == "identifier": + self._sorted.sort() + return self._bins[index.name] + + def __eq__(self, other): + # Sparse, so as long as name is the same + return super().__eq__(other) + + def __getitem__(self, index): + if not isinstance(index, StringBin): + raise TypeError(f"Expected a StringBin object, got: {index!r}") + identifier = index.name + if identifier not in self._bins: + raise KeyError("No identifier %r in this Category axis") + return identifier + + def _ireduce(self, the_slice): + out = None + if isinstance(the_slice, StringBin): + out = [the_slice.name] + elif isinstance(the_slice, re.Pattern): + out = [k for k in self._sorted if the_slice.match(k)] + elif isinstance(the_slice, str): + pattern = "^" + re.escape(the_slice).replace(r"\*", ".*") + "$" + m = re.compile(pattern) + out = [k for k in self._sorted if m.match(k)] + elif isinstance(the_slice, list): + if not all(k in self._sorted for k in the_slice): + warnings.warn( + f"Not all requested indices present in {self!r}", RuntimeWarning + ) + out = [k for k in self._sorted if k in the_slice] + elif isinstance(the_slice, slice): + if the_slice.step is not None: + raise IndexError("Not sure how to use slice step for categories...") + start, stop = 0, len(self._sorted) + if isinstance(the_slice.start, str): + start = self._sorted.index(the_slice.start) + else: + start = the_slice.start + if isinstance(the_slice.stop, str): + stop = self._sorted.index(the_slice.stop) + else: + stop = the_slice.stop + out = self._sorted[start:stop] + else: + raise IndexError(f"Cannot understand slice {the_slice!r} on axis {self!r}") + return [self._bins[k] for k in out] + + @property + def size(self): + """Number of bins""" + return len(self._bins) + + @property + def sorting(self): + """Sorting definition to adhere to + + See `Cat` constructor for possible values + """ + return self._sorting + + @sorting.setter + def sorting(self, newsorting): + if newsorting == "placement": + # not much we can do about already inserted values + pass + elif newsorting == "identifier": + self._sorted.sort() + elif newsorting == "integral": + # this will be checked in any Hist.identifiers() call accessing this axis + pass + else: + raise AttributeError(f"Invalid axis sorting type: {newsorting}") + self._sorting = newsorting + + def identifiers(self): + """List of `StringBin` identifiers""" + return [self._bins[k] for k in self._sorted] + + +class DenseAxis(Axis): + """ + DenseAxis: ABC for a fixed-size densely-indexed axis + + Derived should implement: + **index(identifier)** - return an index + + **__eq__(axis)** - axis has same definition and binning + + **__getitem__(index)** - return an identifier + + **_ireduce(slice)** - return a slice or list of indices, input slice to be interpred as values + + **reduced(islice)** - return a new axis with binning corresponding to the index slice (from _ireduce) + + TODO: hasoverflow(), not all dense axes might have an overflow concept, + currently it is implicitly assumed they do (as the only dense type is a numeric axis) + """ + + +class Bin(DenseAxis): + """A binned axis with name, label, and binning. + + Parameters + ---------- + name : str + is used as a keyword in histogram filling, immutable + label : str + describes the meaning of the axis, can be changed + n_or_arr : int or list or np.ndarray + Integer number of bins, if uniform binning. Otherwise, a list or + numpy 1D array of bin boundaries. + lo : float, optional + lower boundary of bin range, if uniform binning + hi : float, optional + upper boundary of bin range, if uniform binning + + This axis will generate frequencies for n+3 bins, special bin indices: + ``0 = underflow, n+1 = overflow, n+2 = nanflow`` + Bin boundaries are [lo, hi) + """ + + def __init__(self, name, label, n_or_arr, lo=None, hi=None): + super().__init__(name, label) + self._lazy_intervals = None + if isinstance(n_or_arr, (list, np.ndarray, cupy.ndarray)): + self._uniform = False + self._bins = cupy.array(n_or_arr, dtype="d") + if not all(np.sort(self._bins) == self._bins): + raise ValueError("Binning not sorted!") + self._lo = self._bins[0] + self._hi = self._bins[-1] + # to make searchsorted differentiate inf from nan + self._bins = cupy.append(self._bins, cupy.inf) + self._interval_bins = cupy.r_[-cupy.inf, self._bins, cupy.nan] + self._bin_names = np.full(self._interval_bins[:-1].size, None) + elif isinstance(n_or_arr, numbers.Integral): + if lo is None or hi is None: + raise TypeError( + "Interpreting n_or_arr as uniform binning, please specify lo and hi values" + ) + self._uniform = True + self._lo = lo + self._hi = hi + self._bins = n_or_arr + self._interval_bins = cupy.r_[ + -cupy.inf, + cupy.linspace(self._lo, self._hi, self._bins + 1), + cupy.inf, + cupy.nan, + ] + self._bin_names = np.full(self._interval_bins[:-1].size, None) + else: + raise TypeError( + f"Cannot understand n_or_arr (nbins or binning array) type {n_or_arr!r}" + ) + + @property + def _intervals(self): + if not hasattr(self, "_lazy_intervals") or self._lazy_intervals is None: + self._lazy_intervals = [ + Interval(low, high, bin) + for low, high, bin in zip( + self._interval_bins[:-1], self._interval_bins[1:], self._bin_names + ) + ] + return self._lazy_intervals + + def __getstate__(self): + if hasattr(self, "_lazy_intervals") and self._lazy_intervals is not None: + self._bin_names = np.array( + [interval.label for interval in self._lazy_intervals] + ) + self.__dict__.pop("_lazy_intervals", None) + return self.__dict__ + + def __setstate__(self, d): + if "_intervals" in d: # convert old hists to new serialization format + _old_intervals = d.pop("_intervals") + interval_bins = [i._lo for i in _old_intervals] + [_old_intervals[-1]._hi] + d["_interval_bins"] = cupy.array(interval_bins) + d["_bin_names"] = np.array([interval._label for interval in _old_intervals]) + if "_interval_bins" in d and "_bin_names" not in d: + d["_bin_names"] = np.full(d["_interval_bins"][:-1].size, None) + self.__dict__ = d + + def index(self, identifier): + """Index of a identifier or label + + Parameters + ---------- + identifier : float or Interval or np.ndarray + The identifier(s) to lookup. Supports vectorized + calls when a numpy 1D array of numbers is passed. + + Returns an integer corresponding to the index in the axis where the histogram would be filled. + The integer range includes flow bins: ``0 = underflow, n+1 = overflow, n+2 = nanflow`` + """ + isarray = isinstance(identifier, (awkward.Array, cupy.ndarray, np.ndarray)) + if isarray or isinstance(identifier, numbers.Number): + identifier = awkward.to_cupy(identifier) # cupy.asarray(identifier) + if self._uniform: + idx = None + if isarray: + idx = cupy.zeros_like(identifier) + _clip_bins(float(self._bins), self._lo, self._hi, identifier, idx) + else: + idx = np.clip( + np.floor( + (identifier - self._lo) + * float(self._bins) + / (self._hi - self._lo) + ) + + 1, + 0, + self._bins + 1, + ) + + if isinstance(idx, (cupy.ndarray, np.ndarray)): + _replace_nans( + self.size - 1, + idx if idx.dtype.kind == "f" else idx.astype(cupy.float32), + ) + idx = idx.astype(int) + elif np.isnan(idx): + idx = self.size - 1 + else: + idx = int(idx) + return idx + else: + return cupy.searchsorted(self._bins, identifier, side="right") + elif isinstance(identifier, Interval): + if identifier.nan(): + return self.size - 1 + for idx, interval in enumerate(self._intervals): + if ( + interval._lo <= identifier._lo + or cupy.isclose(interval._lo, identifier._lo) + ) and ( + interval._hi >= identifier._hi + or cupy.isclose(interval._hi, identifier._hi) + ): + return idx + raise ValueError( + f"Axis {self!r} has no interval that fully contains identifier {identifier!r}" + ) + raise TypeError("Request bin indices with a identifier or 1-D array only") + + def __eq__(self, other): + if isinstance(other, DenseAxis): + if not super().__eq__(other): + return False + if self._uniform != other._uniform: + return False + if self._uniform and self._bins != other._bins: + return False + if not self._uniform and not all(self._bins == other._bins): # noqa: SIM103 + return False + return True + return super().__eq__(other) + + def __getitem__(self, index): + return self._intervals[index] + + def _ireduce(self, the_slice): + if isinstance(the_slice, numbers.Number): + the_slice = slice(the_slice, the_slice) + elif isinstance(the_slice, Interval): + if the_slice.nan(): + return slice(-1, None) + lo = the_slice._lo if the_slice._lo > -np.inf else None + hi = the_slice._hi if the_slice._hi < np.inf else None + the_slice = slice(lo, hi) + if isinstance(the_slice, slice): + blo, bhi = None, None + if the_slice.start is not None: + if the_slice.start < self._lo: + raise ValueError( + f"Reducing along axis {self!r}: requested start {the_slice.start!r} exceeds bin boundaries (use open slicing, e.g. x[:stop])" + ) + if self._uniform: + blo_real = (the_slice.start - self._lo) * self._bins / ( + self._hi - self._lo + ) + 1 + blo = np.clip(np.round(blo_real).astype(int), 0, self._bins + 1) + if abs(blo - blo_real) > 1.0e-14: + warnings.warn( + f"Reducing along axis {self!r}: requested start {the_slice.start!r} between bin boundaries, no interpolation is performed", + RuntimeWarning, + ) + else: + if the_slice.start not in self._bins: + warnings.warn( + f"Reducing along axis {self!r}: requested start {the_slice.start!r} between bin boundaries, no interpolation is performed", + RuntimeWarning, + ) + blo = self.index(the_slice.start) + if the_slice.stop is not None: + if the_slice.stop > self._hi: + raise ValueError( + f"Reducing along axis {self!r}: requested stop {the_slice.stop!r} exceeds bin boundaries (use open slicing, e.g. x[start:])" + ) + if self._uniform: + bhi_real = (the_slice.stop - self._lo) * self._bins / ( + self._hi - self._lo + ) + 1 + bhi = np.clip(np.round(bhi_real).astype(int), 0, self._bins + 1) + if abs(bhi - bhi_real) > 1.0e-14: + warnings.warn( + f"Reducing along axis {self!r}: requested stop {the_slice.stop!r} between bin boundaries, no interpolation is performed", + RuntimeWarning, + ) + else: + if the_slice.stop not in self._bins: + warnings.warn( + f"Reducing along axis {self!r}: requested stop {the_slice.stop!r} between bin boundaries, no interpolation is performed", + RuntimeWarning, + ) + bhi = self.index(the_slice.stop) + # Assume null ranges (start==stop) mean we want the bin containing the value + if blo is not None and blo == bhi: + bhi += 1 + if the_slice.step is not None: + raise NotImplementedError( + "Step slicing can be interpreted as a rebin factor" + ) + return slice(blo, bhi, the_slice.step) + elif isinstance(the_slice, list) and all( + isinstance(v, Interval) for v in the_slice + ): + raise NotImplementedError("Slice histogram from list of intervals") + raise IndexError(f"Cannot understand slice {the_slice!r} on axis {self!r}") + + def reduced(self, islice): + """Return a new axis with reduced binning + + The new binning corresponds to the slice made on this axis. + Overflow will be taken care of by ``Hist.__getitem__`` + + Parameters + ---------- + islice : slice + ``islice.start`` and ``islice.stop`` should be None or within ``[1, ax.size() - 1]`` + This slice is usually as returned from ``Bin._ireduce`` + """ + if islice.step is not None: + raise NotImplementedError( + "Step slicing can be interpreted as a rebin factor" + ) + if islice.start is None and islice.stop is None: + return self + if self._uniform: + lo = self._lo + ilo = 0 + if islice.start is not None: + lo += (islice.start - 1) * (self._hi - self._lo) / self._bins + ilo = islice.start - 1 + hi = self._hi + ihi = self._bins + if islice.stop is not None: + hi = self._lo + (islice.stop - 1) * (self._hi - self._lo) / self._bins + ihi = islice.stop - 1 + bins = ihi - ilo + # TODO: remove this once satisfied it works + rbins = (hi - lo) * self._bins / (self._hi - self._lo) + assert abs(bins - rbins) < 1e-14, "%d %f %r" % (bins, rbins, self) + return Bin(self._name, self._label, bins, lo, hi) + else: + lo = None if islice.start is None else islice.start - 1 + hi = -1 if islice.stop is None else islice.stop + bins = self._bins[slice(lo, hi)] + return Bin(self._name, self._label, bins) + + @property + def size(self): + """Number of bins, including overflow (i.e. ``n + 3``)""" + if self._uniform: + return self._bins + 3 + # (inf added at constructor) + return len(self._bins) + 1 + + def edges(self, overflow="none"): + """Bin boundaries + + Parameters + ---------- + overflow : str + Create overflow and/or underflow bins by adding a bin of same width to each end. + See `Hist.sum` description for the allowed values. + """ + if self._uniform: + out = cupy.linspace(self._lo, self._hi, self._bins + 1) + else: + out = self._bins[:-1].copy() + out = cupy.r_[ + 2 * out[0] - out[1], out, 2 * out[-1] - out[-2], 3 * out[-1] - 2 * out[-2] + ] + return out[_overflow_behavior(overflow)] + + def centers(self, overflow="none"): + """Bin centers + + Parameters + ---------- + overflow : str + Create overflow and/or underflow bins by adding a bin of same width to each end. + See `Hist.sum` description for the allowed values. + """ + edges = self.edges(overflow) + return (edges[:-1] + edges[1:]) / 2 + + def identifiers(self, overflow="none"): + """List of `Interval` identifiers""" + return self._intervals[_overflow_behavior(overflow)] diff --git a/src/cuda_histogram/hist_tools.py b/src/cuda_histogram/hist.py similarity index 55% rename from src/cuda_histogram/hist_tools.py rename to src/cuda_histogram/hist.py index 12d5b45..1f28c2e 100644 --- a/src/cuda_histogram/hist_tools.py +++ b/src/cuda_histogram/hist.py @@ -1,9 +1,7 @@ from __future__ import annotations import copy -import functools import numbers -import re import warnings from abc import ABCMeta, abstractmethod from collections import namedtuple @@ -13,23 +11,22 @@ import cupy import numpy as np -MaybeSumSlice = namedtuple("MaybeSumSlice", ["start", "stop", "sum"]) +from cuda_histogram.axis import ( + Axis, + Bin, + Cat, + DenseAxis, + Interval, + SparseAxis, + _overflow_behavior, +) -_replace_nans = cupy.ElementwiseKernel("T v", "T x", "x = isnan(x)?v:x", "replace_nans") +__all__: list[str] = ["Hist"] -_clip_bins = cupy.ElementwiseKernel( - "T Nbins, T lo, T hi, T id", - "T idx", - """ - const T floored = floor((id - lo) * float(Nbins) / (hi - lo)) + 1; - idx = floored < 0 ? 0 : floored; - idx = floored > Nbins ? Nbins + 1 : floored; - """, - "clip_bins", -) +_MaybeSumSlice = namedtuple("_MaybeSumSlice", ["start", "stop", "sum"]) -def assemble_blocks(array, ndslice, depth=0): +def _assemble_blocks(array, ndslice, depth=0): """ Turns an n-dimensional slice of array (tuple of slices) into a nested list of numpy arrays that can be passed to np.block() @@ -38,7 +35,7 @@ def assemble_blocks(array, ndslice, depth=0): this function will add the range not in the slice to the appropriate (over/under)flow bins """ if depth == 0: - ndslice = [MaybeSumSlice(s.start, s.stop, False) for s in ndslice] + ndslice = [_MaybeSumSlice(s.start, s.stop, False) for s in ndslice] if depth == len(ndslice): slice_op = tuple(slice(s.start, s.stop) for s in ndslice) sum_op = tuple(i for i, s in enumerate(ndslice) if s.sum) @@ -46,708 +43,18 @@ def assemble_blocks(array, ndslice, depth=0): slist = [] newslice = ndslice[:] if ndslice[depth].start is not None: - newslice[depth] = MaybeSumSlice(None, ndslice[depth].start, True) - slist.append(assemble_blocks(array, newslice, depth + 1)) - newslice[depth] = MaybeSumSlice(ndslice[depth].start, ndslice[depth].stop, False) - slist.append(assemble_blocks(array, newslice, depth + 1)) + newslice[depth] = _MaybeSumSlice(None, ndslice[depth].start, True) + slist.append(_assemble_blocks(array, newslice, depth + 1)) + newslice[depth] = _MaybeSumSlice(ndslice[depth].start, ndslice[depth].stop, False) + slist.append(_assemble_blocks(array, newslice, depth + 1)) if ndslice[depth].stop is not None: - newslice[depth] = MaybeSumSlice(ndslice[depth].stop, -1, True) - slist.append(assemble_blocks(array, newslice, depth + 1)) - newslice[depth] = MaybeSumSlice(-1, None, False) - slist.append(assemble_blocks(array, newslice, depth + 1)) + newslice[depth] = _MaybeSumSlice(ndslice[depth].stop, -1, True) + slist.append(_assemble_blocks(array, newslice, depth + 1)) + newslice[depth] = _MaybeSumSlice(-1, None, False) + slist.append(_assemble_blocks(array, newslice, depth + 1)) return slist -def overflow_behavior(overflow): - if overflow == "none": - return slice(1, -2) - elif overflow == "under": - return slice(None, -2) - elif overflow == "over": - return slice(1, -1) - elif overflow == "all": - return slice(None, -1) - elif overflow == "allnan": - return slice(None) - elif overflow == "justnan": - return slice(-1, None) - else: - raise ValueError(f"Unrecognized overflow behavior: {overflow}") - - -@functools.total_ordering -class Interval: - """Real number interval - - Totally ordered, assuming no overlap in intervals. - A special nan interval can be constructed, which is defined - as greater than ``[*, inf)`` - - Parameters - ---------- - lo : float - Bin lower bound, inclusive - hi : float - Bin upper bound, exclusive - """ - - def __init__(self, lo, hi, label=None): - self._lo = float(lo) - self._hi = float(hi) - self._label = label - - def __repr__(self): - return f"<{self.__class__.__name__} ({self!s}) instance at 0x{id(self):0x}>" - - def __str__(self): - if self._label is not None: - return self._label - if self.nan(): - return "(nanflow)" - return "{}{}, {})".format( - "(" if self._lo == -np.inf else "[", - self._lo, - self._hi, - ) - - def __hash__(self): - return hash((self._lo, self._hi)) - - def __lt__(self, other): - if other.nan() and not self.nan(): - return True - elif self.nan(): - return False - elif self._lo < other._lo: - if self._hi > other._lo: - raise ValueError( - f"Intervals {self!r} and {other!r} intersect! What are you doing?!" - ) - return True - return False - - def __eq__(self, other): - if not isinstance(other, Interval): - return False - if other.nan() and self.nan(): - return True - if self._lo == other._lo and self._hi == other._hi: # noqa: SIM103 - return True - return False - - def nan(self): - return np.isnan(self._hi) - - @property - def lo(self): - """Lower boundary of this bin, inclusive""" - return self._lo - - @property - def hi(self): - """Upper boundary of this bin, exclusive""" - return self._hi - - @property - def mid(self): - """Midpoint of this bin""" - return (self._hi + self._lo) / 2 - - @property - def label(self): - """Label of this bin, mutable""" - return self._label - - @label.setter - def label(self, lbl): - self._label = lbl - - -@functools.total_ordering -class StringBin: - """A string used to fill a sparse axis - - Totally ordered, lexicographically by name. - - Parameters - ---------- - name : str - Name of the bin, as used in `Hist.fill` calls - label : str - The `str` representation of this bin can be overridden by - a custom label, which will be used preferentially in legends - produced by `hist.plot1d`, etc. - """ - - def __init__(self, name, label=None): - if not isinstance(name, str): - raise TypeError( - f"StringBin only supports string categories, received a {name!r}" - ) - elif "*" in name: - raise ValueError( - "StringBin does not support character '*' as it conflicts with wildcard mapping." - ) - self._name = name - self._label = label - - def __repr__(self): - return f"<{self.__class__.__name__} ({self.name}) instance at 0x{id(self):0x}>" - - def __str__(self): - if self._label is not None: - return self._label - return self._name - - def __hash__(self): - return hash(self._name) - - def __lt__(self, other): - return self._name < other._name - - def __eq__(self, other): - if isinstance(other, StringBin): - return self._name == other._name - return False - - @property - def name(self): - """Name of this bin, *Immutable*""" - return self._name - - @property - def label(self): - """Label of this bin, mutable""" - return self._label - - @label.setter - def label(self, lbl): - self._label = lbl - - -class Axis: - """ - Axis: Base class for any type of axis - Derived classes should implement, at least, an equality override - """ - - def __init__(self, name, label): - if name == "weight": - raise ValueError( - "Cannot create axis: 'weight' is a reserved keyword for Hist.fill()" - ) - self._name = name - self._label = label - - def __repr__(self): - return f"<{self.__class__.__name__} (name={self._name}) instance at 0x{id(self):0x}>" - - @property - def name(self): - return self._name - - @property - def label(self): - return self._label - - @label.setter - def label(self, label): - self._label = label - - def __eq__(self, other): - if isinstance(other, Axis): - if self._name != other._name: # noqa: SIM103 - return False - # label doesn't matter - return True - elif isinstance(other, str): - # Convenient for testing axis in list by name - return not self._name != other - raise TypeError(f"Cannot compare an Axis with a {other!r}") - - -class SparseAxis(Axis): - """ - SparseAxis: ABC for a sparse axis - - Derived should implement: - **index(identifier)** - return a hashable object for indexing - - **__eq__(axis)** - axis has same definition (not necessarily same bins) - - **__getitem__(index)** - return an identifier - - **_ireduce(slice)** - return a list of hashes, slice is arbitrary - - What we really want here is a hashlist with some slice sugar on top - It is usually the case that the identifier is already hashable, - in which case index and __getitem__ are trivial, but this mechanism - may be useful if the size of the tuple of identifiers in a - sparse-binned histogram becomes too large - """ - - -class Cat(SparseAxis): - """A category axis with name and label - - Parameters - ---------- - name : str - is used as a keyword in histogram filling, immutable - label : str - describes the meaning of the axis, can be changed - sorting : {'identifier', 'placement', 'integral'}, optional - Axis sorting when listing identifiers. Default 'placement' - Changing this setting can effect the order of stack plotting - in `hist.plot1d`. - - The number of categories is arbitrary, and can be filled sparsely - Identifiers are strings - """ - - def __init__(self, name, label, sorting="identifier"): - super().__init__(name, label) - # In all cases key == value.name - self._bins = {} - self._sorting = sorting - self._sorted = [] - - def index(self, identifier): - """Index of a identifier or label - - Parameters - ---------- - identifier : str or StringBin - The identifier to lookup - - Returns a `StringBin` corresponding to the given argument (trivial in the case - where a `StringBin` was passed) and saves a reference internally in the case where - the identifier was not seen before by this axis. - """ - if isinstance(identifier, StringBin): - index = identifier - else: - index = StringBin(identifier) - if index.name not in self._bins: - self._bins[index.name] = index - self._sorted.append(index.name) - if self._sorting == "identifier": - self._sorted.sort() - return self._bins[index.name] - - def __eq__(self, other): - # Sparse, so as long as name is the same - return super().__eq__(other) - - def __getitem__(self, index): - if not isinstance(index, StringBin): - raise TypeError(f"Expected a StringBin object, got: {index!r}") - identifier = index.name - if identifier not in self._bins: - raise KeyError("No identifier %r in this Category axis") - return identifier - - def _ireduce(self, the_slice): - out = None - if isinstance(the_slice, StringBin): - out = [the_slice.name] - elif isinstance(the_slice, re.Pattern): - out = [k for k in self._sorted if the_slice.match(k)] - elif isinstance(the_slice, str): - pattern = "^" + re.escape(the_slice).replace(r"\*", ".*") + "$" - m = re.compile(pattern) - out = [k for k in self._sorted if m.match(k)] - elif isinstance(the_slice, list): - if not all(k in self._sorted for k in the_slice): - warnings.warn( - f"Not all requested indices present in {self!r}", RuntimeWarning - ) - out = [k for k in self._sorted if k in the_slice] - elif isinstance(the_slice, slice): - if the_slice.step is not None: - raise IndexError("Not sure how to use slice step for categories...") - start, stop = 0, len(self._sorted) - if isinstance(the_slice.start, str): - start = self._sorted.index(the_slice.start) - else: - start = the_slice.start - if isinstance(the_slice.stop, str): - stop = self._sorted.index(the_slice.stop) - else: - stop = the_slice.stop - out = self._sorted[start:stop] - else: - raise IndexError(f"Cannot understand slice {the_slice!r} on axis {self!r}") - return [self._bins[k] for k in out] - - @property - def size(self): - """Number of bins""" - return len(self._bins) - - @property - def sorting(self): - """Sorting definition to adhere to - - See `Cat` constructor for possible values - """ - return self._sorting - - @sorting.setter - def sorting(self, newsorting): - if newsorting == "placement": - # not much we can do about already inserted values - pass - elif newsorting == "identifier": - self._sorted.sort() - elif newsorting == "integral": - # this will be checked in any Hist.identifiers() call accessing this axis - pass - else: - raise AttributeError(f"Invalid axis sorting type: {newsorting}") - self._sorting = newsorting - - def identifiers(self): - """List of `StringBin` identifiers""" - return [self._bins[k] for k in self._sorted] - - -class DenseAxis(Axis): - """ - DenseAxis: ABC for a fixed-size densely-indexed axis - - Derived should implement: - **index(identifier)** - return an index - - **__eq__(axis)** - axis has same definition and binning - - **__getitem__(index)** - return an identifier - - **_ireduce(slice)** - return a slice or list of indices, input slice to be interpred as values - - **reduced(islice)** - return a new axis with binning corresponding to the index slice (from _ireduce) - - TODO: hasoverflow(), not all dense axes might have an overflow concept, - currently it is implicitly assumed they do (as the only dense type is a numeric axis) - """ - - -class Bin(DenseAxis): - """A binned axis with name, label, and binning. - - Parameters - ---------- - name : str - is used as a keyword in histogram filling, immutable - label : str - describes the meaning of the axis, can be changed - n_or_arr : int or list or np.ndarray - Integer number of bins, if uniform binning. Otherwise, a list or - numpy 1D array of bin boundaries. - lo : float, optional - lower boundary of bin range, if uniform binning - hi : float, optional - upper boundary of bin range, if uniform binning - - This axis will generate frequencies for n+3 bins, special bin indices: - ``0 = underflow, n+1 = overflow, n+2 = nanflow`` - Bin boundaries are [lo, hi) - """ - - def __init__(self, name, label, n_or_arr, lo=None, hi=None): - super().__init__(name, label) - self._lazy_intervals = None - if isinstance(n_or_arr, (list, np.ndarray, cupy.ndarray)): - self._uniform = False - self._bins = cupy.array(n_or_arr, dtype="d") - if not all(np.sort(self._bins) == self._bins): - raise ValueError("Binning not sorted!") - self._lo = self._bins[0] - self._hi = self._bins[-1] - # to make searchsorted differentiate inf from nan - self._bins = cupy.append(self._bins, cupy.inf) - self._interval_bins = cupy.r_[-cupy.inf, self._bins, cupy.nan] - self._bin_names = np.full(self._interval_bins[:-1].size, None) - elif isinstance(n_or_arr, numbers.Integral): - if lo is None or hi is None: - raise TypeError( - "Interpreting n_or_arr as uniform binning, please specify lo and hi values" - ) - self._uniform = True - self._lo = lo - self._hi = hi - self._bins = n_or_arr - self._interval_bins = cupy.r_[ - -cupy.inf, - cupy.linspace(self._lo, self._hi, self._bins + 1), - cupy.inf, - cupy.nan, - ] - self._bin_names = np.full(self._interval_bins[:-1].size, None) - else: - raise TypeError( - f"Cannot understand n_or_arr (nbins or binning array) type {n_or_arr!r}" - ) - - @property - def _intervals(self): - if not hasattr(self, "_lazy_intervals") or self._lazy_intervals is None: - self._lazy_intervals = [ - Interval(low, high, bin) - for low, high, bin in zip( - self._interval_bins[:-1], self._interval_bins[1:], self._bin_names - ) - ] - return self._lazy_intervals - - def __getstate__(self): - if hasattr(self, "_lazy_intervals") and self._lazy_intervals is not None: - self._bin_names = np.array( - [interval.label for interval in self._lazy_intervals] - ) - self.__dict__.pop("_lazy_intervals", None) - return self.__dict__ - - def __setstate__(self, d): - if "_intervals" in d: # convert old hists to new serialization format - _old_intervals = d.pop("_intervals") - interval_bins = [i._lo for i in _old_intervals] + [_old_intervals[-1]._hi] - d["_interval_bins"] = cupy.array(interval_bins) - d["_bin_names"] = np.array([interval._label for interval in _old_intervals]) - if "_interval_bins" in d and "_bin_names" not in d: - d["_bin_names"] = np.full(d["_interval_bins"][:-1].size, None) - self.__dict__ = d - - def index(self, identifier): - """Index of a identifier or label - - Parameters - ---------- - identifier : float or Interval or np.ndarray - The identifier(s) to lookup. Supports vectorized - calls when a numpy 1D array of numbers is passed. - - Returns an integer corresponding to the index in the axis where the histogram would be filled. - The integer range includes flow bins: ``0 = underflow, n+1 = overflow, n+2 = nanflow`` - """ - isarray = isinstance(identifier, (awkward.Array, cupy.ndarray, np.ndarray)) - if isarray or isinstance(identifier, numbers.Number): - identifier = awkward.to_cupy(identifier) # cupy.asarray(identifier) - if self._uniform: - idx = None - if isarray: - idx = cupy.zeros_like(identifier) - _clip_bins(float(self._bins), self._lo, self._hi, identifier, idx) - else: - idx = np.clip( - np.floor( - (identifier - self._lo) - * float(self._bins) - / (self._hi - self._lo) - ) - + 1, - 0, - self._bins + 1, - ) - - if isinstance(idx, (cupy.ndarray, np.ndarray)): - _replace_nans( - self.size - 1, - idx if idx.dtype.kind == "f" else idx.astype(cupy.float32), - ) - idx = idx.astype(int) - elif np.isnan(idx): - idx = self.size - 1 - else: - idx = int(idx) - return idx - else: - return cupy.searchsorted(self._bins, identifier, side="right") - elif isinstance(identifier, Interval): - if identifier.nan(): - return self.size - 1 - for idx, interval in enumerate(self._intervals): - if ( - interval._lo <= identifier._lo - or cupy.isclose(interval._lo, identifier._lo) - ) and ( - interval._hi >= identifier._hi - or cupy.isclose(interval._hi, identifier._hi) - ): - return idx - raise ValueError( - f"Axis {self!r} has no interval that fully contains identifier {identifier!r}" - ) - raise TypeError("Request bin indices with a identifier or 1-D array only") - - def __eq__(self, other): - if isinstance(other, DenseAxis): - if not super().__eq__(other): - return False - if self._uniform != other._uniform: - return False - if self._uniform and self._bins != other._bins: - return False - if not self._uniform and not all(self._bins == other._bins): # noqa: SIM103 - return False - return True - return super().__eq__(other) - - def __getitem__(self, index): - return self._intervals[index] - - def _ireduce(self, the_slice): - if isinstance(the_slice, numbers.Number): - the_slice = slice(the_slice, the_slice) - elif isinstance(the_slice, Interval): - if the_slice.nan(): - return slice(-1, None) - lo = the_slice._lo if the_slice._lo > -np.inf else None - hi = the_slice._hi if the_slice._hi < np.inf else None - the_slice = slice(lo, hi) - if isinstance(the_slice, slice): - blo, bhi = None, None - if the_slice.start is not None: - if the_slice.start < self._lo: - raise ValueError( - f"Reducing along axis {self!r}: requested start {the_slice.start!r} exceeds bin boundaries (use open slicing, e.g. x[:stop])" - ) - if self._uniform: - blo_real = (the_slice.start - self._lo) * self._bins / ( - self._hi - self._lo - ) + 1 - blo = np.clip(np.round(blo_real).astype(int), 0, self._bins + 1) - if abs(blo - blo_real) > 1.0e-14: - warnings.warn( - f"Reducing along axis {self!r}: requested start {the_slice.start!r} between bin boundaries, no interpolation is performed", - RuntimeWarning, - ) - else: - if the_slice.start not in self._bins: - warnings.warn( - f"Reducing along axis {self!r}: requested start {the_slice.start!r} between bin boundaries, no interpolation is performed", - RuntimeWarning, - ) - blo = self.index(the_slice.start) - if the_slice.stop is not None: - if the_slice.stop > self._hi: - raise ValueError( - f"Reducing along axis {self!r}: requested stop {the_slice.stop!r} exceeds bin boundaries (use open slicing, e.g. x[start:])" - ) - if self._uniform: - bhi_real = (the_slice.stop - self._lo) * self._bins / ( - self._hi - self._lo - ) + 1 - bhi = np.clip(np.round(bhi_real).astype(int), 0, self._bins + 1) - if abs(bhi - bhi_real) > 1.0e-14: - warnings.warn( - f"Reducing along axis {self!r}: requested stop {the_slice.stop!r} between bin boundaries, no interpolation is performed", - RuntimeWarning, - ) - else: - if the_slice.stop not in self._bins: - warnings.warn( - f"Reducing along axis {self!r}: requested stop {the_slice.stop!r} between bin boundaries, no interpolation is performed", - RuntimeWarning, - ) - bhi = self.index(the_slice.stop) - # Assume null ranges (start==stop) mean we want the bin containing the value - if blo is not None and blo == bhi: - bhi += 1 - if the_slice.step is not None: - raise NotImplementedError( - "Step slicing can be interpreted as a rebin factor" - ) - return slice(blo, bhi, the_slice.step) - elif isinstance(the_slice, list) and all( - isinstance(v, Interval) for v in the_slice - ): - raise NotImplementedError("Slice histogram from list of intervals") - raise IndexError(f"Cannot understand slice {the_slice!r} on axis {self!r}") - - def reduced(self, islice): - """Return a new axis with reduced binning - - The new binning corresponds to the slice made on this axis. - Overflow will be taken care of by ``Hist.__getitem__`` - - Parameters - ---------- - islice : slice - ``islice.start`` and ``islice.stop`` should be None or within ``[1, ax.size() - 1]`` - This slice is usually as returned from ``Bin._ireduce`` - """ - if islice.step is not None: - raise NotImplementedError( - "Step slicing can be interpreted as a rebin factor" - ) - if islice.start is None and islice.stop is None: - return self - if self._uniform: - lo = self._lo - ilo = 0 - if islice.start is not None: - lo += (islice.start - 1) * (self._hi - self._lo) / self._bins - ilo = islice.start - 1 - hi = self._hi - ihi = self._bins - if islice.stop is not None: - hi = self._lo + (islice.stop - 1) * (self._hi - self._lo) / self._bins - ihi = islice.stop - 1 - bins = ihi - ilo - # TODO: remove this once satisfied it works - rbins = (hi - lo) * self._bins / (self._hi - self._lo) - assert abs(bins - rbins) < 1e-14, "%d %f %r" % (bins, rbins, self) - return Bin(self._name, self._label, bins, lo, hi) - else: - lo = None if islice.start is None else islice.start - 1 - hi = -1 if islice.stop is None else islice.stop - bins = self._bins[slice(lo, hi)] - return Bin(self._name, self._label, bins) - - @property - def size(self): - """Number of bins, including overflow (i.e. ``n + 3``)""" - if self._uniform: - return self._bins + 3 - # (inf added at constructor) - return len(self._bins) + 1 - - def edges(self, overflow="none"): - """Bin boundaries - - Parameters - ---------- - overflow : str - Create overflow and/or underflow bins by adding a bin of same width to each end. - See `Hist.sum` description for the allowed values. - """ - if self._uniform: - out = cupy.linspace(self._lo, self._hi, self._bins + 1) - else: - out = self._bins[:-1].copy() - out = cupy.r_[ - 2 * out[0] - out[1], out, 2 * out[-1] - out[-2], 3 * out[-1] - 2 * out[-2] - ] - return out[overflow_behavior(overflow)] - - def centers(self, overflow="none"): - """Bin centers - - Parameters - ---------- - overflow : str - Create overflow and/or underflow bins by adding a bin of same width to each end. - See `Hist.sum` description for the allowed values. - """ - edges = self.edges(overflow) - return (edges[:-1] + edges[1:]) / 2 - - def identifiers(self, overflow="none"): - """List of `Interval` identifiers""" - return self._intervals[overflow_behavior(overflow)] - - class AccumulatorABC(metaclass=ABCMeta): """Abstract base class for an accumulator @@ -1076,7 +383,7 @@ def __getitem__(self, keys): def dense_op(array): as_numpy = array.get() - blocked = np.block(assemble_blocks(as_numpy, dense_idx)) + blocked = np.block(_assemble_blocks(as_numpy, dense_idx)) return cupy.asarray(blocked) out = Hist(self._label, *new_dims, dtype=self._dtype) @@ -1212,7 +519,7 @@ def sum(self, *axes, **kwargs): if isinstance(axis, DenseAxis): idense = self._idense(axis) dense_sum_dim.append(idense) - dense_slice[idense] = overflow_behavior(overflow) + dense_slice[idense] = _overflow_behavior(overflow) elif isinstance(axis, SparseAxis): isparse = self._isparse(axis) sparse_drop.append(isparse) @@ -1439,7 +746,7 @@ def view_dim(arr): return arr else: return arr[ - tuple(overflow_behavior(overflow) for _ in range(self.dense_dim())) + tuple(_overflow_behavior(overflow) for _ in range(self.dense_dim())) ] out = {} @@ -1530,7 +837,7 @@ def identifiers(self, axis, overflow="none"): return axis.identifiers(overflow=overflow) def to_boost(self): - """Convert this coffea Hist object to a boost_histogram obbject""" + """Convert this cuda_histogram object to a boost_histogram object""" import boost_histogram newaxes = [] @@ -1599,7 +906,7 @@ def expandkey(key): return out def to_hist(self): - """Convert the cuda_histogram histogram to a hist object""" + """Convert this cuda_histogram object to a hist object""" import hist return hist.Hist(self.to_boost())