Skip to content

Commit

Permalink
Merge pull request #204 from guillermo-navas-palencia/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
guillermo-navas-palencia authored Oct 27, 2022
2 parents 8a1f8ed + c7db97b commit ceccfce
Show file tree
Hide file tree
Showing 15 changed files with 254 additions and 44 deletions.
2 changes: 1 addition & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ OptBinning requires
* numpy (>=1.16.1)
* ortools (>=7.2)
* pandas
* ropwr (>=0.2.0)
* ropwr (>=0.4.0)
* scikit-learn (>=0.22.0)
* scipy (>=1.6.0)

Expand Down
4 changes: 2 additions & 2 deletions doc/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@
author = 'Guillermo Navas-Palencia'

# The short X.Y version
version = '0.16.0'
version = '0.16.1'
# The full version, including alpha/beta/rc tags
release = '0.16.0'
release = '0.16.1'


# -- General configuration ---------------------------------------------------
Expand Down
6 changes: 6 additions & 0 deletions doc/source/outlier.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,9 @@ Outlier detection
:members:
:inherited-members:
:show-inheritance:


.. autoclass:: optbinning.binning.outlier.YQuantileDetector
:members:
:inherited-members:
:show-inheritance:
17 changes: 17 additions & 0 deletions doc/source/release_notes.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,23 @@
Release Notes
=============

Version 0.16.1 (2022-11-28)
---------------------------

New features:

- Outlier detector ``YQuantileDetector`` for continuous target (`Issue 203 <https://github.com/guillermo-navas-palencia/optbinning/issues/203>`_).

Improvements:

- Add support for solvers SCS and HiGHS in optimal piecewise binning classes.
- Unit testing outlier detector methods.

Bugfixes:

- Pass ``lb`` and ``ub`` as keyword arguments to RoPWR fit method (required since ropwr>=0.4.0).


Version 0.16.0 (2022-10-24)
---------------------------

Expand Down
2 changes: 1 addition & 1 deletion optbinning/_version.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
"""Version information."""

__version__ = "0.16.0"
__version__ = "0.16.1"
14 changes: 10 additions & 4 deletions optbinning/binning/binning_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -617,15 +617,17 @@ def plot(self, metric="woe", add_special=True, add_missing=True,
add_missing : bool (default=True)
Whether to add the missing values bin.
style: str, optional (default="bin")
style : str, optional (default="bin")
Plot style. style="bin" shows the standard binning plot. If
style="actual", show the plot with the actual scale, i.e, actual
bin widths.
show_bin_labels: bool (default=False)
show_bin_labels : bool (default=False)
Whether to show the bin label instead of the bin id on the x-axis.
For long labels (length > 27), labels are truncated.
.. versionadded:: 0.15.1
savefig : str or None (default=None)
Path to save the plot figure.
"""
Expand Down Expand Up @@ -1203,10 +1205,12 @@ def plot(self, add_special=True, add_missing=True, show_bin_labels=False,
add_missing : bool (default=True)
Whether to add the special values bin.
show_bin_labels: bool (default=False)
show_bin_labels : bool (default=False)
Whether to show the bin label instead of the bin id on the x-axis.
For long labels (length > 27), labels are truncated.
.. versionadded:: 0.15.1
savefig : str or None (default=None)
Path to save the plot figure.
"""
Expand Down Expand Up @@ -1660,10 +1664,12 @@ def plot(self, add_special=True, add_missing=True, style="bin",
style="actual", show the plot with the actual scale, i.e, actual
bin widths.
show_bin_labels: bool (default=False)
show_bin_labels : bool (default=False)
Whether to show the bin label instead of the bin id on the x-axis.
For long labels (length > 27), labels are truncated.
.. versionadded:: 0.15.1
savefig : str or None (default=None)
Path to save the plot figure.
"""
Expand Down
10 changes: 6 additions & 4 deletions optbinning/binning/continuous_binning.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,9 +118,10 @@ def _check_parameters(name, dtype, prebinning_method, max_n_prebins,
raise ValueError("gamma must be >= 0; got {}.".format(gamma))

if outlier_detector is not None:
if outlier_detector not in ("range", "zscore"):
if outlier_detector not in ("range", "zscore", "yquantile"):
raise ValueError('Invalid value for outlier_detector. Allowed '
'string values are "range" and "zscore".')
'string values are "range", "zscore" and '
'"yquantile".')

if outlier_params is not None:
if not isinstance(outlier_params, dict):
Expand Down Expand Up @@ -255,8 +256,9 @@ class ContinuousOptimalBinning(OptimalBinning):
outlier_detector : str or None, optional (default=None)
The outlier detection method. Supported methods are "range" to use
the interquartile range based method or "zcore" to use the modified
Z-score method.
the interquartile range based method, "zscore" to use the modified
Z-score method or "yquantile" to use the y-axis detector over
quantiles.
outlier_params : dict or None, optional (default=None)
Dictionary of parameters to pass to the outlier detection method.
Expand Down
82 changes: 77 additions & 5 deletions optbinning/binning/outlier.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,20 @@ def __init__(self):
# flag
self._is_fitted = False

def fit(self, x):
"""Fit univariate outlier detector.
def fit(self, x, y=None):
"""Fit outlier detector.
Parameters
----------
x : array-like, shape = (n_samples)
y : array-like, shape = (n_samples) or None (default=None)
Returns
-------
self : OutlierDetector
"""
self._fit(x)
self._fit(x, y)

return self

Expand Down Expand Up @@ -88,7 +90,7 @@ def __init__(self, interval_length=0.5, k=1.5, method="ETI"):
self.k = k
self.method = method

def _fit(self, x):
def _fit(self, x, y=None):
if self.method not in ("ETI", "HDI"):
raise ValueError('Invalid value for method. Allowed string '
'values are "ETI" and "HDI".')
Expand Down Expand Up @@ -144,7 +146,12 @@ class ModifiedZScoreDetector(BaseEstimator, OutlierDetector):
def __init__(self, threshold=3.5):
self.threshold = threshold

def _fit(self, x):
def _fit(self, x, y=None):
if (not isinstance(self.threshold, numbers.Number) or
self.threshold < 0):
raise ValueError("threshold must be a value >= 0; got {}".
format(self.threshold))

x = np.asarray(x)
median = np.median(x)
mad = np.median(np.abs(x - median))
Expand All @@ -153,3 +160,68 @@ def _fit(self, x):
self._support = np.abs(m_z_score) > self.threshold

self._is_fitted = True


class YQuantileDetector(BaseEstimator, OutlierDetector):
    """Outlier detector on the y-axis over quantiles.

    The samples are grouped into bins delimited by quantiles of ``x``, and a
    univariate outlier detector is fitted on the values of ``y`` within each
    bin. A sample is flagged as an outlier if the detector flags it within
    its own bin.

    Parameters
    ----------
    outlier_detector : str (default="zscore")
        The outlier detection method applied to ``y`` within each bin.
        Supported methods are "range" to use the interquartile range based
        method or "zscore" to use the modified Z-score method.

    outlier_params : dict or None, optional (default=None)
        Dictionary of parameters to pass to the outlier detection method.

    n_bins : int (default=5)
        The maximum number of bins to consider.
    """
    def __init__(self, outlier_detector="zscore", outlier_params=None,
                 n_bins=5):
        self.outlier_detector = outlier_detector
        self.outlier_params = outlier_params
        self.n_bins = n_bins

    def _fit(self, x, y=None):
        """Fit the per-bin outlier detector.

        Parameters
        ----------
        x : array-like, shape = (n_samples)
            Values used to build the quantile bins.

        y : array-like, shape = (n_samples)
            Values on which outliers are detected within each bin. It is
            required; the ``None`` default only mirrors the signature of the
            other detectors.

        Raises
        ------
        ValueError
            If ``outlier_detector`` or ``n_bins`` is invalid, or if ``y`` is
            None.

        TypeError
            If ``outlier_params`` is neither a dict nor None.
        """
        if self.outlier_detector not in ("range", "zscore"):
            raise ValueError('Invalid value for outlier_detector. Allowed '
                             'string values are "range" and "zscore".')

        if self.outlier_params is not None:
            if not isinstance(self.outlier_params, dict):
                raise TypeError("outlier_params must be a dict or None; "
                                "got {}.".format(self.outlier_params))

        if not isinstance(self.n_bins, numbers.Integral) or self.n_bins <= 0:
            raise ValueError("bins must be a positive integer; got {}."
                             .format(self.n_bins))

        # The public fit(x, y=None) forwards y unchanged, so fail early with
        # an explicit message instead of an obscure indexing error below.
        if y is None:
            raise ValueError("y is required to fit YQuantileDetector; "
                             "got None.")

        x = np.asarray(x)
        y = np.asarray(y)

        # Nothing to bin: no sample can be an outlier.
        if x.size == 0:
            self._support = np.zeros(0, dtype=bool)
            self._is_fitted = True
            return

        # Interior quantiles of x define the bin edges; duplicated quantiles
        # are merged, so the effective number of bins can be < self.n_bins.
        q = np.linspace(0, 1, self.n_bins + 1)
        splits = np.unique(np.quantile(x, q))[1:-1]
        n_bins = len(splits) + 1
        indices = np.digitize(x, splits, right=False)

        self._support = np.zeros(x.size, dtype=bool)
        idx_support = np.arange(x.size)

        if self.outlier_detector == "zscore":
            detector = ModifiedZScoreDetector()
        elif self.outlier_detector == "range":
            detector = RangeDetector()

        if self.outlier_params is not None:
            detector.set_params(**self.outlier_params)

        for i in range(n_bins):
            mask_x = indices == i

            # Interpolated quantiles may fall between data points and leave
            # a bin without samples; there is nothing to detect there.
            if not mask_x.any():
                continue

            detector.fit(y[mask_x])
            mask_out = detector.get_support()

            # Map the outliers found within the bin back to their positions
            # in the original arrays.
            idx_out = idx_support[mask_x][mask_out]
            self._support[idx_out] = True

        self._is_fitted = True
12 changes: 7 additions & 5 deletions optbinning/binning/piecewise/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,9 +132,10 @@ def _check_parameters(name, estimator, objective, degree, continuous,
'values are "all" and "consecutive".')

if outlier_detector is not None:
if outlier_detector not in ("range", "zscore"):
if outlier_detector not in ("range", "zscore", "yquantile"):
raise ValueError('Invalid value for outlier_detector. Allowed '
'string values are "range" and "zscore".')
'string values are "range", "zscore" and '
'"yquantile".')

if outlier_params is not None:
if not isinstance(outlier_params, dict):
Expand Down Expand Up @@ -171,9 +172,10 @@ def _check_parameters(name, estimator, objective, degree, continuous,
raise ValueError("split_digist must be an integer in [0, 8]; "
"got {}.".format(split_digits))

if solver not in ("auto", "ecos", "osqp", "direct"):
if solver not in ("auto", "ecos", "osqp", "direct", "scs", "highs"):
raise ValueError('Invalid value for solver. Allowed string '
'values are "auto", "ecos", "osqp" and "direct".')
'values are "auto", "ecos", "osqp", "direct", '
'"scs" and "highs".')

if not isinstance(h_epsilon, numbers.Number) or h_epsilon < 1.0:
raise ValueError("h_epsilon must a number >= 1.0; got {}."
Expand Down Expand Up @@ -453,7 +455,7 @@ def _fit_binning(self, x, y, prediction, lb, ub):
self.solver, self.h_epsilon, self.quantile, self.regularization,
self.reg_l1, self.reg_l1, self.verbose)

optimizer.fit(x_subsamples, pred_subsamples, splits, lb, ub)
optimizer.fit(x_subsamples, pred_subsamples, splits, lb=lb, ub=ub)

self._c = optimizer.coef_

Expand Down
9 changes: 6 additions & 3 deletions optbinning/binning/piecewise/binning.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,9 @@ class OptimalPWBinning(BasePWBinning):
outlier_detector : str or None, optional (default=None)
The outlier detection method. Supported methods are "range" to use
the interquartile range based method or "zcore" to use the modified
Z-score method.
the interquartile range based method, "zscore" to use the modified
Z-score method or "yquantile" to use the y-axis detector over
quantiles.
outlier_params : dict or None, optional (default=None)
Dictionary of parameters to pass to the outlier detection method.
Expand All @@ -142,7 +143,9 @@ class OptimalPWBinning(BasePWBinning):
<https://github.com/embotech/ecos>`_, `"osqp"
<https://github.com/oxfordcontrol/osqp>`_, "direct", to choose the
direct solver, and "auto", to choose the most appropriate solver for
the problem.
the problem. Version 0.16.1 added support to solvers
`"scs" <https://github.com/cvxgrp/scs>`_ and `"highs"
<https://github.com/ERGO-Code/HiGHS>`_.
h_epsilon: float (default=1.35)
The parameter h_epsilon used when ``objective="huber"``, controls the
Expand Down
18 changes: 6 additions & 12 deletions optbinning/binning/piecewise/continuous_binning.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,9 @@ class ContinuousOptimalPWBinning(BasePWBinning):
outlier_detector : str or None, optional (default=None)
The outlier detection method. Supported methods are "range" to use
the interquartile range based method or "zcore" to use the modified
Z-score method.
the interquartile range based method, "zscore" to use the modified
Z-score method or "yquantile" to use the y-axis detector over
quantiles.
outlier_params : dict or None, optional (default=None)
Dictionary of parameters to pass to the outlier detection method.
Expand All @@ -132,7 +133,9 @@ class ContinuousOptimalPWBinning(BasePWBinning):
<https://github.com/embotech/ecos>`_, `"osqp"
<https://github.com/oxfordcontrol/osqp>`_, "direct", to choose the
direct solver, and "auto", to choose the most appropriate solver for
the problem.
the problem. Version 0.16.1 added support to solvers
`"scs" <https://github.com/cvxgrp/scs>`_ and `"highs"
<https://github.com/ERGO-Code/HiGHS>`_.
h_epsilon: float (default=1.35)
The parameter h_epsilon used when ``objective="huber"``, controls the
Expand Down Expand Up @@ -340,15 +343,6 @@ def _fit(self, x, y, lb, ub, check_input):

time_postprocessing = time.perf_counter()

# Compute n_records and sum for special and missing
# self._n_records_special = len(y_special)
# self._sum_special = np.sum(y_special)
# self._n_zeros_special = np.count_nonzero(y_special == 0)
# if len(y_special):
# self._std_special = np.std(y_special)
# self._min_target_special = np.min(y_special)
# self._max_target_special = np.max(y_special)

[self._n_records_special, self._sum_special, self._n_zeros_special,
self._std_special, self._min_target_special,
self._max_target_special] = target_info_special_continuous(
Expand Down
19 changes: 14 additions & 5 deletions optbinning/binning/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

from .outlier import ModifiedZScoreDetector
from .outlier import RangeDetector
from .outlier import YQuantileDetector


def categorical_transform(x, y):
Expand Down Expand Up @@ -78,8 +79,9 @@ def split_data(dtype, x, y, special_codes=None, cat_cutoff=None,
outlier_detector : str or None (default=None)
The outlier detection method. Supported methods are "range" to use
the interquartile range based method or "zcore" to use the modified
Z-score method.
the interquartile range based method, "zscore" to use the modified
Z-score method or "yquantile" to use the y-axis detector over
quantiles.
outlier_params : dict or None (default=None)
Dictionary of parameters to pass to the outlier detection method.
Expand Down Expand Up @@ -139,9 +141,10 @@ def split_data(dtype, x, y, special_codes=None, cat_cutoff=None,
Others data sample weight.
"""
if outlier_detector is not None:
if outlier_detector not in ("range", "zscore"):
if outlier_detector not in ("range", "zscore", "yquantile"):
raise ValueError('Invalid value for outlier_detector. Allowed '
'string values are "range" and "zscore".')
'string values are "range", "zscore" and '
'"yquantile".')

if outlier_params is not None:
if not isinstance(outlier_params, dict):
Expand Down Expand Up @@ -229,11 +232,17 @@ def split_data(dtype, x, y, special_codes=None, cat_cutoff=None,
detector = RangeDetector()
elif outlier_detector == "zscore":
detector = ModifiedZScoreDetector()
elif outlier_detector == "yquantile":
detector = YQuantileDetector()

if outlier_params is not None:
detector.set_params(**outlier_params)

mask_outlier = detector.fit(x_clean).get_support()
if outlier_detector in ("range", "zscore"):
mask_outlier = detector.fit(x_clean).get_support()
else:
mask_outlier = detector.fit(x_clean, y_clean).get_support()

x_clean = x_clean[~mask_outlier]
y_clean = y_clean[~mask_outlier]
sw_clean = sw_clean[~mask_outlier]
Expand Down
Loading

0 comments on commit ceccfce

Please sign in to comment.