diff --git a/README.rst b/README.rst index b27e7e0..3200a87 100644 --- a/README.rst +++ b/README.rst @@ -227,7 +227,7 @@ Print overview information about the options settings, problem statistics, and t .. code-block:: text - optbinning (Version 0.16.0) + optbinning (Version 0.17.0) Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0 Begin options @@ -395,7 +395,7 @@ and the number of selected variables after the binning process. .. code-block:: text - optbinning (Version 0.16.0) + optbinning (Version 0.17.0) Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0 Begin options diff --git a/doc/source/conf.py b/doc/source/conf.py index 8214b63..385a06b 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -22,9 +22,9 @@ author = 'Guillermo Navas-Palencia' # The short X.Y version -version = '0.16.1' +version = '0.17.0' # The full version, including alpha/beta/rc tags -release = '0.16.1' +release = '0.17.0' # -- General configuration --------------------------------------------------- diff --git a/doc/source/release_notes.rst b/doc/source/release_notes.rst index 8422665..f2aeef1 100644 --- a/doc/source/release_notes.rst +++ b/doc/source/release_notes.rst @@ -1,12 +1,27 @@ Release Notes ============= -Version 0.16.1 (2022-11-28) +Version 0.17.0 (2022-11-06) --------------------------- New features: - - Outlier detector``YQuantileDetector`` for continuous target (`Issue 203 `_). + - Optimize formulation of minimum difference constraints for all optimal binning classes and support these constraints regardless of the monotonic trend (`Issue 201 `_). + + - Implementation of sample weight for ``ContinuousOptimalBinning`` (`Issue 131 `_). + + +Bugfixes: + + - Fix ``ContinuousOptimalBinning`` prebinning step when no prebinning splits were generated (`Issue 205 `_). + + +Version 0.16.1 (2022-10-28) +--------------------------- + +New features: + + - Outlier detector ``YQuantileDetector`` for continuous target (`Issue 203 `_).
Improvements diff --git a/doc/source/tutorials/tutorial_binary.ipynb b/doc/source/tutorials/tutorial_binary.ipynb index 30954bf..6208dce 100644 --- a/doc/source/tutorials/tutorial_binary.ipynb +++ b/doc/source/tutorials/tutorial_binary.ipynb @@ -1135,7 +1135,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Name : mean radius \n", @@ -1166,7 +1166,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Name : mean radius \n", @@ -1213,7 +1213,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Begin options\n", @@ -2091,7 +2091,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Name : mean texture \n", @@ -2373,7 +2373,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Name : NAME_INCOME_TYPE \n", @@ -2564,7 +2564,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Name : AverageMInFile \n", diff --git a/doc/source/tutorials/tutorial_binary_large_scale.ipynb b/doc/source/tutorials/tutorial_binary_large_scale.ipynb index a7ee354..17f71e1 100644 --- a/doc/source/tutorials/tutorial_binary_large_scale.ipynb +++ b/doc/source/tutorials/tutorial_binary_large_scale.ipynb @@ -134,7 +134,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Name : REGION_POPULATION_RELATIVE \n", @@ -309,7 +309,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Name : REGION_POPULATION_RELATIVE \n", diff --git a/doc/source/tutorials/tutorial_binary_localsolver.ipynb b/doc/source/tutorials/tutorial_binary_localsolver.ipynb index 5f68ccb..d7b6696 100644 --- a/doc/source/tutorials/tutorial_binary_localsolver.ipynb +++ b/doc/source/tutorials/tutorial_binary_localsolver.ipynb @@ -272,7 +272,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Name : REGION_POPULATION_RELATIVE \n", @@ -398,7 +398,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Name : REGION_POPULATION_RELATIVE \n", @@ -499,7 +499,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, 
Apache License 2.0\n", "\n", " Name : REGION_POPULATION_RELATIVE \n", @@ -589,7 +589,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The current solution is IV = 0.03514331, compared to the LocalSolver solver solution 0.03776231. Let us increase the time limit to 200 seconds." + "The current solution is IV = 0.03737164, compared to the LocalSolver solver solution 0.03776231. Let us increase the time limit to 200 seconds." ] }, { @@ -669,7 +669,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Name : REGION_POPULATION_RELATIVE \n", diff --git a/doc/source/tutorials/tutorial_binary_under_uncertainty.ipynb b/doc/source/tutorials/tutorial_binary_under_uncertainty.ipynb index 299252a..f908ea7 100644 --- a/doc/source/tutorials/tutorial_binary_under_uncertainty.ipynb +++ b/doc/source/tutorials/tutorial_binary_under_uncertainty.ipynb @@ -303,7 +303,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Begin options\n", diff --git a/doc/source/tutorials/tutorial_binning_2d.ipynb b/doc/source/tutorials/tutorial_binning_2d.ipynb index 13a1893..c9d045d 100644 --- a/doc/source/tutorials/tutorial_binning_2d.ipynb +++ b/doc/source/tutorials/tutorial_binning_2d.ipynb @@ -1192,7 +1192,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Begin options\n", @@ -2606,7 +2606,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Name : ExternalRiskEstimate-AverageMInFile\n", @@ -2681,7 +2681,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Name : ExternalRiskEstimate-AverageMInFile\n", @@ -3065,7 +3065,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Name : ExternalRiskEstimate-AverageMInFile\n", @@ -3481,7 +3481,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Name : ExternalRiskEstimate-AverageMInFile\n", @@ -3558,7 +3558,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Name : ExternalRiskEstimate-AverageMInFile\n", @@ -3950,7 +3950,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Name : ExternalRiskEstimate-AverageMInFile\n", diff --git a/doc/source/tutorials/tutorial_binning_process_FICO_xAI.ipynb b/doc/source/tutorials/tutorial_binning_process_FICO_xAI.ipynb index 3074cb8..e78cb18 100644 
--- a/doc/source/tutorials/tutorial_binning_process_FICO_xAI.ipynb +++ b/doc/source/tutorials/tutorial_binning_process_FICO_xAI.ipynb @@ -463,7 +463,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Begin options\n", diff --git a/doc/source/tutorials/tutorial_binning_process_sklearn_pipeline.ipynb b/doc/source/tutorials/tutorial_binning_process_sklearn_pipeline.ipynb index bb8f1cf..fb07ad9 100644 --- a/doc/source/tutorials/tutorial_binning_process_sklearn_pipeline.ipynb +++ b/doc/source/tutorials/tutorial_binning_process_sklearn_pipeline.ipynb @@ -204,7 +204,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Statistics\n", diff --git a/doc/source/tutorials/tutorial_binning_process_telco_churn.ipynb b/doc/source/tutorials/tutorial_binning_process_telco_churn.ipynb index c949aff..98816e4 100644 --- a/doc/source/tutorials/tutorial_binning_process_telco_churn.ipynb +++ b/doc/source/tutorials/tutorial_binning_process_telco_churn.ipynb @@ -402,7 +402,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Begin options\n", diff --git a/doc/source/tutorials/tutorial_continuous.ipynb b/doc/source/tutorials/tutorial_continuous.ipynb index 7f8eb33..ea1c402 100644 --- a/doc/source/tutorials/tutorial_continuous.ipynb +++ b/doc/source/tutorials/tutorial_continuous.ipynb @@ -1022,7 +1022,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Name : INDUS \n", diff --git a/doc/source/tutorials/tutorial_counterfactual_binary_target.ipynb b/doc/source/tutorials/tutorial_counterfactual_binary_target.ipynb index e003d65..3b359ec 100644 --- a/doc/source/tutorials/tutorial_counterfactual_binary_target.ipynb +++ b/doc/source/tutorials/tutorial_counterfactual_binary_target.ipynb @@ -303,7 +303,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Begin options\n", @@ -751,7 +751,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Status : OPTIMAL \n", @@ -871,7 +871,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Status : OPTIMAL \n", @@ -990,7 +990,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Status : OPTIMAL \n", @@ -1111,7 +1111,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Status : OPTIMAL \n", @@ 
-1294,7 +1294,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Status : FEASIBLE \n", @@ -1450,7 +1450,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Status : OPTIMAL \n", @@ -1606,7 +1606,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Status : OPTIMAL \n", @@ -1893,7 +1893,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Status : OPTIMAL \n", @@ -2030,7 +2030,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Status : OPTIMAL \n", @@ -2169,7 +2169,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Status : OPTIMAL \n", diff --git a/doc/source/tutorials/tutorial_counterfactual_continuous_target.ipynb b/doc/source/tutorials/tutorial_counterfactual_continuous_target.ipynb index 62656df..bce3639 100644 --- a/doc/source/tutorials/tutorial_counterfactual_continuous_target.ipynb +++ b/doc/source/tutorials/tutorial_counterfactual_continuous_target.ipynb @@ -348,7 +348,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Status : OPTIMAL \n", diff --git a/doc/source/tutorials/tutorial_piecewise_binary.ipynb b/doc/source/tutorials/tutorial_piecewise_binary.ipynb index 3e0e548..23a648b 100644 --- a/doc/source/tutorials/tutorial_piecewise_binary.ipynb +++ b/doc/source/tutorials/tutorial_piecewise_binary.ipynb @@ -548,7 +548,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Name : mean radius \n", @@ -579,7 +579,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Name : mean radius \n", @@ -624,7 +624,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Begin options\n", @@ -1360,7 +1360,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Name : mean texture \n", @@ -1579,7 +1579,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", 
"\n", " Name : AverageMInFile \n", diff --git a/doc/source/tutorials/tutorial_scorecard_binary_target.ipynb b/doc/source/tutorials/tutorial_scorecard_binary_target.ipynb index 39a4015..80b88f3 100644 --- a/doc/source/tutorials/tutorial_scorecard_binary_target.ipynb +++ b/doc/source/tutorials/tutorial_scorecard_binary_target.ipynb @@ -478,7 +478,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Begin options\n", diff --git a/doc/source/tutorials/tutorial_scorecard_continuous_target.ipynb b/doc/source/tutorials/tutorial_scorecard_continuous_target.ipynb index 35a910c..3bc8453 100644 --- a/doc/source/tutorials/tutorial_scorecard_continuous_target.ipynb +++ b/doc/source/tutorials/tutorial_scorecard_continuous_target.ipynb @@ -267,7 +267,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Begin options\n", diff --git a/doc/source/tutorials/tutorial_scorecard_monitoring.ipynb b/doc/source/tutorials/tutorial_scorecard_monitoring.ipynb index 11d90be..8c6cc33 100644 --- a/doc/source/tutorials/tutorial_scorecard_monitoring.ipynb +++ b/doc/source/tutorials/tutorial_scorecard_monitoring.ipynb @@ -208,7 +208,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Begin options\n", @@ -338,7 +338,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Begin options\n", @@ -1322,7 +1322,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Begin options\n", diff --git a/doc/source/tutorials/tutorial_sketch_binary.ipynb b/doc/source/tutorials/tutorial_sketch_binary.ipynb index f05fa7a..dcb5431 100644 --- a/doc/source/tutorials/tutorial_sketch_binary.ipynb +++ b/doc/source/tutorials/tutorial_sketch_binary.ipynb @@ -199,7 +199,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Begin options\n", @@ -912,7 +912,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Begin options\n", @@ -1399,7 +1399,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "optbinning (Version 0.16.0)\n", + "optbinning (Version 0.17.0)\n", "Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0\n", "\n", " Name : EXT_SOURCE_3 \n", diff --git a/optbinning/_version.py b/optbinning/_version.py index dbc8259..7486f15 100644 --- a/optbinning/_version.py +++ b/optbinning/_version.py @@ -1,3 +1,3 @@ """Version information.""" -__version__ = "0.16.1" +__version__ = "0.17.0" diff --git a/optbinning/binning/binning.py b/optbinning/binning/binning.py index eb97085..d94667d 100644 --- a/optbinning/binning/binning.py +++ 
b/optbinning/binning/binning.py @@ -331,9 +331,9 @@ class OptimalBinning(BaseOptimalBinning): monotonic constraint is disabled. min_event_rate_diff : float, optional (default=0) - The minimum event rate difference between consecutives bins. This - option currently only applies when ``monotonic_trend`` is "ascending", - "descending", "peak_heuristic" or "valley_heuristic". + The minimum event rate difference between consecutives bins. For solver + "ls", this option currently only applies when ``monotonic_trend`` is + "ascending", "descending", "peak_heuristic" or "valley_heuristic". max_pvalue : float or None, optional (default=None) The maximum p-value among bins. The Z-test is used to detect bins diff --git a/optbinning/binning/binning_statistics.py b/optbinning/binning/binning_statistics.py index 2497800..be6543d 100644 --- a/optbinning/binning/binning_statistics.py +++ b/optbinning/binning/binning_statistics.py @@ -165,7 +165,7 @@ def target_info_special_multiclass(special_codes, x, y, classes): return n_event -def target_info_special_continuous(special_codes, x, y): +def target_info_special_continuous(special_codes, x, y, sw): if isinstance(special_codes, dict): n_records_special = [] sum_special = [] @@ -185,27 +185,34 @@ def target_info_special_continuous(special_codes, x, y): sl = s if isinstance(s, (list, np.ndarray)) else [s] mask = xt.isin(sl).values - n_records = np.count_nonzero(mask) + n_records = np.sum(sw[mask]) n_records_special.append(n_records) - sum_special.append(np.sum(y[mask])) - n_zeros_special.append(np.count_nonzero(y[mask] == 0)) + + ymask = sw[mask] * y[mask] + sum_special.append(np.sum(ymask)) + n_zeros_special.append(np.count_nonzero(ymask == 0)) if n_records: - std_special.append(np.std(y[mask])) - min_target_special.append(np.min(y[mask])) - max_target_special.append(np.max(y[mask])) + std_special.append(np.std(ymask)) + min_target_special.append(np.min(ymask)) + max_target_special.append(np.max(ymask)) else: std_special.append(0) min_target_special.append(0) max_target_special.append(0) else: - n_records_special = len(y) - sum_special = np.sum(y) - n_zeros_special = np.count_nonzero(y == 0) + if len(sw): + sw_y = sw * y + else: + sw_y = y + + n_records_special = np.sum(sw) + sum_special = np.sum(sw_y) + n_zeros_special = np.count_nonzero(sw_y == 0) if len(y): - std_special = np.std(y) - min_target_special = np.min(y) - max_target_special = np.max(y) + std_special = np.std(sw_y) + min_target_special = np.min(sw_y) + max_target_special = np.max(sw_y) else: std_special = None min_target_special = None diff --git a/optbinning/binning/continuous_binning.py b/optbinning/binning/continuous_binning.py index 47dc487..b96fa3a 100644 --- a/optbinning/binning/continuous_binning.py +++ b/optbinning/binning/continuous_binning.py @@ -235,9 +235,7 @@ class ContinuousOptimalBinning(OptimalBinning): is disabled. min_mean_diff : float, optional (default=0) - The minimum mean difference between consecutives bins. This - option currently only applies when ``monotonic_trend`` is "ascending" - or "descending". + The minimum mean difference between consecutives bins. max_pvalue : float or None, optional (default=None) The maximum p-value among bins. The T-test is used to detect bins @@ -397,7 +395,7 @@ def __init__(self, name="", dtype="numerical", prebinning_method="cart", self._is_fitted = False - def fit(self, x, y, check_input=False): + def fit(self, x, y, sample_weight=None, check_input=False): """Fit the optimal binning according to the given training data.
Parameters @@ -408,6 +406,11 @@ def fit(self, x, y, check_input=False): y : array-like, shape = (n_samples,) Target vector relative to x. + sample_weight : array-like of shape (n_samples,) (default=None) + Array of weights that are assigned to individual samples. + If not provided, then each sample is given unit weight. + Only applied if ``prebinning_method="cart"``. + check_input : bool (default=False) Whether to check input arrays. @@ -416,10 +419,11 @@ def fit(self, x, y, check_input=False): self : ContinuousOptimalBinning Fitted optimal binning. """ - return self._fit(x, y, check_input) + return self._fit(x, y, sample_weight, check_input) - def fit_transform(self, x, y, metric="mean", metric_special=0, - metric_missing=0, show_digits=2, check_input=False): + def fit_transform(self, x, y, sample_weight=None, metric="mean", + metric_special=0, metric_missing=0, show_digits=2, + check_input=False): """Fit the optimal binning according to the given training data, then transform it. @@ -431,6 +435,11 @@ def fit_transform(self, x, y, metric="mean", metric_special=0, y : array-like, shape = (n_samples,) Target vector relative to x. + sample_weight : array-like of shape (n_samples,) (default=None) + Array of weights that are assigned to individual samples. + If not provided, then each sample is given unit weight. + Only applied if ``prebinning_method="cart"``. + metric : str (default="mean"): The metric used to transform the input vector. Supported metrics are "mean" to choose the mean, "indices" to assign the @@ -459,7 +468,7 @@ def fit_transform(self, x, y, metric="mean", metric_special=0, x_new : numpy array, shape = (n_samples,) Transformed array. """ - return self.fit(x, y, check_input).transform( + return self.fit(x, y, sample_weight, check_input).transform( x, metric, metric_special, metric_missing, show_digits, check_input) @@ -516,7 +525,7 @@ def transform(self, x, metric="mean", metric_special=0, metric_missing=0, metric_missing, self.user_splits, show_digits, check_input) - def _fit(self, x, y, check_input): + def _fit(self, x, y, sample_weight, check_input): time_init = time.perf_counter() if self.verbose: @@ -538,10 +547,11 @@ def _fit(self, x, y, check_input): time_preprocessing = time.perf_counter() [x_clean, y_clean, x_missing, y_missing, x_special, y_special, - y_others, categories, cat_others, _, _, _, _] = split_data( + y_others, categories, cat_others, sw_clean, sw_missing, sw_special, + sw_others] = split_data( self.dtype, x, y, self.special_codes, self.cat_cutoff, self.user_splits, check_input, self.outlier_detector, - self.outlier_params) + self.outlier_params, None, None, None, sample_weight) self._time_preprocessing = time.perf_counter() - time_preprocessing @@ -612,9 +622,9 @@ def _fit(self, x, y, check_input): user_splits = user_splits[sorted_idx] else: [categories, user_splits, x_clean, y_clean, y_others, - cat_others, _, _, sorted_idx + cat_others, sw_clean, sw_others, sorted_idx ] = preprocessing_user_splits_categorical( - self.user_splits, x_clean, y_clean, None) + self.user_splits, x_clean, y_clean, sw_clean) if self.user_splits_fixed is not None: self.user_splits_fixed = np.asarray( @@ -623,11 +633,13 @@ def _fit(self, x, y, check_input): [splits, n_records, sums, ssums, stds, min_t, max_t, n_zeros] = self._prebinning_refinement( user_splits, x_clean, y_clean, y_missing, x_special, - y_special, y_others) + y_special, y_others, sw_clean, sw_missing, sw_special, + sw_others) else: [splits, n_records, sums, ssums, stds, min_t, max_t, n_zeros] = 
self._fit_prebinning( - x_clean, y_clean, y_missing, x_special, y_special, y_others) + x_clean, y_clean, y_missing, x_special, y_special, y_others, + None, sw_clean, sw_missing, sw_special, sw_others) self._n_prebins = len(n_records) @@ -654,8 +666,14 @@ def _fit(self, x, y, check_input): time_postprocessing = time.perf_counter() if not len(splits): - n_records = n_records.sum() - sums = sums.sum() + n_records = np.sum(sw_clean) + sw_y_clean = sw_clean * y_clean + sums = np.sum(sw_y_clean) + ssums = np.sum(sw_y_clean ** 2) + n_zeros = np.count_nonzero(sw_y_clean == 0) + stds = np.std(sw_y_clean) + min_t = np.min(sw_y_clean) + max_t = np.max(sw_y_clean) [self._n_records, self._sums, self._stds, self._min_target, self._max_target, self._n_zeros] = continuous_bin_info( @@ -808,24 +826,19 @@ def _fit_optimizer(self, splits, n_records, sums, ssums, stds): .format(self._time_solver)) def _prebinning_refinement(self, splits_prebinning, x, y, y_missing, - x_special, y_special, y_others, sw_clean=None, - sw_missing=None, sw_special=None, - sw_others=None): - n_splits = len(splits_prebinning) - - if not n_splits: - return splits_prebinning, np.array([]), np.array([]) - - if self.split_digits is not None: - splits_prebinning = np.round(splits_prebinning, self.split_digits) + x_special, y_special, y_others, sw_clean, + sw_missing, sw_special, sw_others): # Compute n_records, sum and std for special, missing and others [self._n_records_special, self._sum_special, self._n_zeros_special, self._std_special, self._min_target_special, self._max_target_special] = target_info_special_continuous( - self.special_codes, x_special, y_special) + self.special_codes, x_special, y_special, sw_special) - self._n_records_missing = len(y_missing) + if len(sw_missing): + y_missing *= sw_missing + + self._n_records_missing = np.sum(sw_missing) self._sum_missing = np.sum(y_missing) self._n_zeros_missing = np.count_nonzero(y_missing == 0) if len(y_missing): @@ -834,20 +847,33 @@ def _prebinning_refinement(self, splits_prebinning, x, y, y_missing, self._max_target_missing = np.max(y_missing) if len(y_others): - self._n_records_cat_others = len(y_others) + if len(sw_others): + y_others *= sw_others + + self._n_records_cat_others = np.sum(sw_others) self._sum_cat_others = np.sum(y_others) self._std_cat_others = np.std(y_others) self._min_target_others = np.min(y_others) self._max_target_others = np.max(y_others) self._n_zeros_others = np.count_nonzero(y_others == 0) + n_splits = len(splits_prebinning) + + if not n_splits: + return (splits_prebinning, np.array([]), np.array([]), + np.array([]), np.array([]), np.array([]), np.array([]), + np.array([])) + + if self.split_digits is not None: + splits_prebinning = np.round(splits_prebinning, self.split_digits) + (splits_prebinning, n_records, sums, ssums, stds, min_t, max_t, - n_zeros) = self._compute_prebins(splits_prebinning, x, y) + n_zeros) = self._compute_prebins(splits_prebinning, x, y, sw_clean) return (splits_prebinning, n_records, sums, ssums, stds, min_t, max_t, n_zeros) - def _compute_prebins(self, splits_prebinning, x, y): + def _compute_prebins(self, splits_prebinning, x, y, sw): n_splits = len(splits_prebinning) if not n_splits: return splits_prebinning, np.array([]), np.array([]) @@ -870,8 +896,8 @@ def _compute_prebins(self, splits_prebinning, x, y): # Compute prebin information for i in range(n_bins): mask = (indices == i) - n_records[i] = np.count_nonzero(mask) - ymask = y[mask] + n_records[i] = np.sum(sw[mask]) + ymask = sw[mask] * y[mask] sums[i] = np.sum(ymask) 
ssums[i] = np.sum(ymask ** 2) n_zeros[i] = np.count_nonzero(ymask == 0) @@ -914,7 +940,7 @@ def _compute_prebins(self, splits_prebinning, x, y): .format(np.count_nonzero(mask_remove))) (splits_prebinning, n_records, sums, ssums, stds, min_t, max_t, - n_zeros) = self._compute_prebins(splits, x, y) + n_zeros) = self._compute_prebins(splits, x, y, sw) return (splits_prebinning, n_records, sums, ssums, stds, min_t, max_t, n_zeros) diff --git a/optbinning/binning/continuous_cp.py b/optbinning/binning/continuous_cp.py index c673285..9eba467 100644 --- a/optbinning/binning/continuous_cp.py +++ b/optbinning/binning/continuous_cp.py @@ -42,8 +42,10 @@ def __init__(self, monotonic_trend, min_n_bins, max_n_bins, min_bin_size, def build_model(self, n_records, sums, ssums, trend_change): # Parameters M = int(1e6) - U, V, pvalue_violation_indices = continuous_model_data( - n_records, sums, ssums, self.max_pvalue, self.max_pvalue_policy, M) + [U, V, pvalue_violation_indices, + min_diff_violation_indices] = continuous_model_data( + n_records, sums, ssums, self.max_pvalue, self.max_pvalue_policy, + self.min_mean_diff, M) n = len(n_records) @@ -115,6 +117,9 @@ def build_model(self, n_records, sums, ssums, trend_change): # Constraint: max-pvalue self.add_max_pvalue_constraint(model, x, pvalue_violation_indices) + # Constraint: min diff + self.add_min_diff_constraint(model, x, min_diff_violation_indices) + # Constraint: fixed splits self.add_constraint_fixed_splits(model, n, x) @@ -123,7 +128,6 @@ def build_model(self, n_records, sums, ssums, trend_change): self._n = n def add_constraint_monotonic_ascending(self, model, n, U, x, M): - min_mean_diff = int(M * self.min_mean_diff) for i in range(1, n): for z in range(i): model.Add( @@ -131,11 +135,10 @@ def add_constraint_monotonic_ascending(self, model, n, U, x, M): for j in range(z)]) + U[z][z] * x[z, z] - U[i][i] * x[i, i] - sum([(U[i][j] - U[i][j + 1]) * x[i, j] - for j in range(i)]) + min_mean_diff <= 0 + for j in range(i)]) <= 0 ).OnlyEnforceIf([x[z, z], x[i, i]]) def add_constraint_monotonic_descending(self, model, n, U, x, M): - min_mean_diff = int(M * self.min_mean_diff) for i in range(1, n): for z in range(i): model.Add( @@ -143,7 +146,7 @@ def add_constraint_monotonic_descending(self, model, n, U, x, M): for j in range(i)]) + U[i][i] * x[i, i] - U[z][z] * x[z, z] - sum([(U[z][j] - U[z][j+1]) * x[z, j] - for j in range(z)]) + min_mean_diff <= 0 + for j in range(z)]) <= 0 ).OnlyEnforceIf([x[z, z], x[i, i]]) def add_constraint_monotonic_concave(self, model, n, U, x): @@ -213,7 +216,6 @@ def add_constraint_monotonic_valley(self, model, n, U, x, y): U[i][i] * x[i, i] >= 0).OnlyEnforceIf([x[z, z], x[i, i]]) def add_constraint_monotonic_peak_heuristic(self, model, n, U, x, tc, M): - min_mean_diff = int(M * self.min_mean_diff) for i in range(1, tc): for z in range(i): model.Add( @@ -221,7 +223,7 @@ def add_constraint_monotonic_peak_heuristic(self, model, n, U, x, tc, M): for j in range(z)]) + U[z][z] * x[z, z] - U[i][i] * x[i, i] - sum([(U[i][j] - U[i][j + 1]) * x[i, j] - for j in range(i)]) + min_mean_diff <= 0 + for j in range(i)]) <= 0 ).OnlyEnforceIf([x[z, z], x[i, i]]) for i in range(tc, n): @@ -231,11 +233,10 @@ def add_constraint_monotonic_peak_heuristic(self, model, n, U, x, tc, M): for j in range(i)]) + U[i][i] * x[i, i] - U[z][z] * x[z, z] - sum([(U[z][j] - U[z][j+1]) * x[z, j] - for j in range(z)]) + min_mean_diff <= 0 + for j in range(z)]) <= 0 ).OnlyEnforceIf([x[z, z], x[i, i]]) def add_constraint_monotonic_valley_heuristic(self, model, n, U, x, 
tc, M): - min_mean_diff = int(M * self.min_mean_diff) for i in range(1, tc): for z in range(i): model.Add( @@ -243,7 +244,7 @@ def add_constraint_monotonic_valley_heuristic(self, model, n, U, x, tc, M): for j in range(i)]) + U[i][i] * x[i, i] - U[z][z] * x[z, z] - sum([(U[z][j] - U[z][j+1]) * x[z, j] - for j in range(z)]) + min_mean_diff <= 0 + for j in range(z)]) <= 0 ).OnlyEnforceIf([x[z, z], x[i, i]]) for i in range(tc, n): @@ -253,5 +254,5 @@ def add_constraint_monotonic_valley_heuristic(self, model, n, U, x, tc, M): for j in range(z)]) + U[z][z] * x[z, z] - U[i][i] * x[i, i] - sum([(U[i][j] - U[i][j + 1]) * x[i, j] - for j in range(i)]) + min_mean_diff <= 0 + for j in range(i)]) <= 0 ).OnlyEnforceIf([x[z, z], x[i, i]]) diff --git a/optbinning/binning/cp.py b/optbinning/binning/cp.py index 0da9205..c9a092c 100644 --- a/optbinning/binning/cp.py +++ b/optbinning/binning/cp.py @@ -49,9 +49,10 @@ def __init__(self, monotonic_trend, min_n_bins, max_n_bins, min_bin_size, def build_model(self, divergence, n_nonevent, n_event, trend_change): # Parameters M = int(1e6) - D, V, pvalue_violation_indices = model_data(divergence, n_nonevent, - n_event, self.max_pvalue, - self.max_pvalue_policy, M) + (D, V, pvalue_violation_indices, + min_diff_violation_indices) = model_data( + divergence, n_nonevent, n_event, self.max_pvalue, + self.max_pvalue_policy, self.min_event_rate_diff, M) n = len(n_nonevent) n_records = n_nonevent + n_event @@ -159,6 +160,9 @@ def build_model(self, divergence, n_nonevent, n_event, trend_change): # Constraint: max-pvalue self.add_max_pvalue_constraint(model, x, pvalue_violation_indices) + # Constraint: min diff + self.add_min_diff_constraint(model, x, min_diff_violation_indices) + # Constraint: fixed splits self.add_constraint_fixed_splits(model, n, x) @@ -169,8 +173,10 @@ def build_model(self, divergence, n_nonevent, n_event, trend_change): def build_model_scenarios(self, n_nonevent, n_event, w): # Parameters M = int(1e6) - D, V, pvalue_violation_indices = multiclass_model_data( - n_nonevent, n_event, self.max_pvalue, self.max_pvalue_policy, M) + (D, V, pvalue_violation_indices, + min_diff_violation_indices) = multiclass_model_data( + n_nonevent, n_event, self.max_pvalue, self.max_pvalue_policy, + self.min_event_rate_diff, M) n = len(n_nonevent) n_records = n_nonevent + n_event @@ -240,6 +246,11 @@ def build_model_scenarios(self, n_nonevent, n_event, w): self.add_max_pvalue_constraint(model, x, pvalue_violation_indices[s]) + # Constraint: min diff + for s in range(n_scenarios): + self.add_min_diff_constraint(model, x, + min_diff_violation_indices[s]) + # Constraint: fixed splits self.add_constraint_fixed_splits(model, n, x) @@ -378,7 +389,6 @@ def add_constraint_min_max_bin_size_scenarios(self, model, n, x, model.Add(bin_size <= self.max_bin_size[s] * x[i, i]) def add_constraint_monotonic_ascending(self, model, n, D, x, M): - min_event_rate_diff = int(M * self.min_event_rate_diff) for i in range(1, n): for z in range(i): model.Add( @@ -386,8 +396,7 @@ def add_constraint_monotonic_ascending(self, model, n, D, x, M): for j in range(z)]) + D[z][z] * x[z, z] - M - (D[i][i] - M) * x[i, i] - sum([(D[i][j] - D[i][j + 1]) * x[i, j] - for j in range(i)]) + - min_event_rate_diff * (x[i, i] + x[z, z] - 1) <= 0) + for j in range(i)]) <= 0) # Preprocessing if self.min_event_rate_diff == 0: @@ -399,7 +408,6 @@ def add_constraint_monotonic_ascending(self, model, n, D, x, M): model.Add(x[i+j, i+j] == 0) def add_constraint_monotonic_descending(self, model, n, D, x, M): - min_event_rate_diff 
= int(M * self.min_event_rate_diff) for i in range(1, n): for z in range(i): model.Add( @@ -407,8 +415,7 @@ def add_constraint_monotonic_descending(self, model, n, D, x, M): for j in range(i)]) + D[i][i] * x[i, i] - M - (D[z][z] - M) * x[z, z] - sum([(D[z][j] - D[z][j+1]) * x[z, j] - for j in range(z)]) + - min_event_rate_diff * (x[i, i] + x[z, z] - 1) <= 0) + for j in range(z)]) <= 0) # Preprocessing if self.min_event_rate_diff == 0: @@ -484,7 +491,6 @@ def add_constraint_monotonic_valley(self, model, n, D, x, y, M): D[i][i] * x[i, i] >= 0) def add_constraint_monotonic_peak_heuristic(self, model, n, D, x, tc, M): - min_event_rate_diff = int(M * self.min_event_rate_diff) for i in range(1, tc): for z in range(i): model.Add( @@ -492,8 +498,7 @@ def add_constraint_monotonic_peak_heuristic(self, model, n, D, x, tc, M): for j in range(z)]) + D[z][z] * x[z, z] - M - (D[i][i] - M) * x[i, i] - sum([(D[i][j] - D[i][j + 1]) * x[i, j] - for j in range(i)]) + - min_event_rate_diff * (x[i, i] + x[z, z] - 1) <= 0) + for j in range(i)]) <= 0) # Preprocessing if self.min_event_rate_diff == 0: @@ -511,8 +516,7 @@ def add_constraint_monotonic_peak_heuristic(self, model, n, D, x, tc, M): for j in range(i)]) + D[i][i] * x[i, i] - M - (D[z][z] - M) * x[z, z] - sum([(D[z][j] - D[z][j+1]) * x[z, j] - for j in range(z)]) + - min_event_rate_diff * (x[i, i] + x[z, z] - 1) <= 0) + for j in range(z)]) <= 0) # Preprocessing if self.min_event_rate_diff == 0: @@ -524,7 +528,6 @@ def add_constraint_monotonic_peak_heuristic(self, model, n, D, x, tc, M): model.Add(x[i+j, i+j] == 0) def add_constraint_monotonic_valley_heuristic(self, model, n, D, x, tc, M): - min_event_rate_diff = int(M * self.min_event_rate_diff) for i in range(1, tc): for z in range(i): model.Add( @@ -532,8 +535,7 @@ def add_constraint_monotonic_valley_heuristic(self, model, n, D, x, tc, M): for j in range(i)]) + D[i][i] * x[i, i] - M - (D[z][z] - M) * x[z, z] - sum([(D[z][j] - D[z][j+1]) * x[z, j] - for j in range(z)]) + - min_event_rate_diff * (x[i, i] + x[z, z] - 1) <= 0) + for j in range(z)]) <= 0) # Preprocessing if self.min_event_rate_diff == 0: @@ -551,8 +553,7 @@ def add_constraint_monotonic_valley_heuristic(self, model, n, D, x, tc, M): for j in range(z)]) + D[z][z] * x[z, z] - M - (D[i][i] - M) * x[i, i] - sum([(D[i][j] - D[i][j + 1]) * x[i, j] - for j in range(i)]) + - min_event_rate_diff * (x[i, i] + x[z, z] - 1) <= 0) + for j in range(i)]) <= 0) # Preprocessing if self.min_event_rate_diff == 0: @@ -568,6 +569,11 @@ def add_max_pvalue_constraint(self, model, x, pvalue_violation_indices): model.AddImplication(x[ind1[0], ind1[1]], x[ind2[0], ind2[1]].Not()) + def add_min_diff_constraint(self, model, x, min_diff_violation_indices): + for ind1, ind2 in min_diff_violation_indices: + model.AddImplication(x[ind1[0], ind1[1]], + x[ind2[0], ind2[1]].Not()) + def add_constraint_fixed_splits(self, model, n, x): if self.user_splits_fixed is not None: for i in range(n - 1): diff --git a/optbinning/binning/distributed/binning_sketch.py b/optbinning/binning/distributed/binning_sketch.py index df65875..f741d86 100644 --- a/optbinning/binning/distributed/binning_sketch.py +++ b/optbinning/binning/distributed/binning_sketch.py @@ -302,9 +302,7 @@ class OptimalBinningSketch(BaseSketch, BaseEstimator): monotonic constraint is disabled. min_event_rate_diff : float, optional (default=0) - The minimum event rate difference between consecutives bins. 
This - option currently only applies when ``monotonic_trend`` is "ascending", - "descending", "peak_heuristic" or "valley_heuristic". + The minimum event rate difference between consecutives bins. max_pvalue : float or None, optional (default=None) The maximum p-value among bins. The Z-test is used to detect bins diff --git a/optbinning/binning/ls.py b/optbinning/binning/ls.py index 54fe44e..080399a 100644 --- a/optbinning/binning/ls.py +++ b/optbinning/binning/ls.py @@ -49,9 +49,9 @@ def __init__(self, monotonic_trend, min_n_bins, max_n_bins, min_bin_size, def build_model(self, divergence, n_nonevent, n_event, trend_change): # Parameters M = int(1e6) - D, V, NE, E, pvalue_violation_indices = model_data( + D, V, NE, E, _, _ = model_data( divergence, n_nonevent, n_event, self.max_pvalue, - self.max_pvalue_policy, M, True) + self.max_pvalue_policy, self.min_event_rate_diff, M, True) n = len(n_nonevent) diff --git a/optbinning/binning/mip.py b/optbinning/binning/mip.py index f0990d4..6a0c233 100644 --- a/optbinning/binning/mip.py +++ b/optbinning/binning/mip.py @@ -47,9 +47,11 @@ def __init__(self, monotonic_trend, min_n_bins, max_n_bins, min_bin_size, def build_model(self, divergence, n_nonevent, n_event, trend_change): # Parameters - D, V, pvalue_violation_indices = model_data(divergence, n_nonevent, - n_event, self.max_pvalue, - self.max_pvalue_policy) + [D, V, pvalue_violation_indices, + min_diff_violation_indices] = model_data( + divergence, n_nonevent, n_event, self.max_pvalue, + self.max_pvalue_policy, self.min_event_rate_diff) + n = len(n_nonevent) n_records = n_nonevent + n_event @@ -165,6 +167,9 @@ def build_model(self, divergence, n_nonevent, n_event, trend_change): # Constraint: max-pvalue self.add_max_pvalue_constraint(solver, x, pvalue_violation_indices) + # Constraint: min diff + self.add_min_diff_constraint(solver, x, min_diff_violation_indices) + # Constraint: fixed splits self.add_constraint_fixed_splits(solver, n, x) @@ -281,8 +286,7 @@ def add_constraint_monotonic_ascending(self, solver, n, D, x): for j in range(z)]) + D[z][z] * x[z, z] - 1 - (D[i][i] - 1) * x[i, i] - solver.Sum([(D[i][j] - D[i][j + 1]) * x[i, j] - for j in range(i)]) + - self.min_event_rate_diff * (x[i, i] + x[z, z] - 1) <= 0) + for j in range(i)]) <= 0) # Preprocessing if self.min_event_rate_diff == 0: @@ -301,8 +305,7 @@ def add_constraint_monotonic_descending(self, solver, n, D, x): for j in range(i)]) + D[i][i] * x[i, i] - 1 - (D[z][z] - 1) * x[z, z] - solver.Sum([(D[z][j] - D[z][j+1]) * x[z, j] - for j in range(z)]) + - self.min_event_rate_diff * (x[i, i] + x[z, z] - 1) <= 0) + for j in range(z)]) <= 0) # Preprocessing if self.min_event_rate_diff == 0: @@ -389,8 +392,7 @@ def add_constraint_monotonic_peak_heuristic(self, solver, n, D, x, tc): for j in range(z)]) + D[z][z] * x[z, z] - 1 - (D[i][i] - 1) * x[i, i] - solver.Sum([(D[i][j] - D[i][j + 1]) * x[i, j] - for j in range(i)]) + - self.min_event_rate_diff * (x[i, i] + x[z, z] - 1) <= 0) + for j in range(i)]) <= 0) # Preprocessing if self.min_event_rate_diff == 0: @@ -408,8 +410,7 @@ def add_constraint_monotonic_peak_heuristic(self, solver, n, D, x, tc): for j in range(i)]) + D[i][i] * x[i, i] - 1 - (D[z][z] - 1) * x[z, z] - solver.Sum([(D[z][j] - D[z][j+1]) * x[z, j] - for j in range(z)]) + - self.min_event_rate_diff * (x[i, i] + x[z, z] - 1) <= 0) + for j in range(z)]) <= 0) # Preprocessing if self.min_event_rate_diff == 0: @@ -428,8 +429,7 @@ def add_constraint_monotonic_valley_heuristic(self, solver, n, D, x, tc): for j in range(i)]) + D[i][i] 
* x[i, i] - 1 - (D[z][z] - 1) * x[z, z] - solver.Sum([(D[z][j] - D[z][j+1]) * x[z, j] - for j in range(z)]) + - self.min_event_rate_diff * (x[i, i] + x[z, z] - 1) <= 0) + for j in range(z)]) <= 0) # Preprocessing if self.min_event_rate_diff == 0: @@ -447,8 +447,7 @@ def add_constraint_monotonic_valley_heuristic(self, solver, n, D, x, tc): for j in range(z)]) + D[z][z] * x[z, z] - 1 - (D[i][i] - 1) * x[i, i] - solver.Sum([(D[i][j] - D[i][j + 1]) * x[i, j] - for j in range(i)]) + - self.min_event_rate_diff * (x[i, i] + x[z, z] - 1) <= 0) + for j in range(i)]) <= 0) # Preprocessing if self.min_event_rate_diff == 0: @@ -463,6 +462,10 @@ def add_max_pvalue_constraint(self, solver, x, pvalue_violation_indices): for ind1, ind2 in pvalue_violation_indices: solver.Add(x[ind1[0], ind1[1]] + x[ind2[0], ind2[1]] <= 1) + def add_min_diff_constraint(self, solver, x, min_diff_violation_indices): + for ind1, ind2 in min_diff_violation_indices: + solver.Add(x[ind1[0], ind1[1]] + x[ind2[0], ind2[1]] <= 1) + def add_constraint_fixed_splits(self, solver, n, x): if self.user_splits_fixed is not None: for i in range(n - 1): diff --git a/optbinning/binning/model_data.py b/optbinning/binning/model_data.py index a9beeb7..64a123c 100644 --- a/optbinning/binning/model_data.py +++ b/optbinning/binning/model_data.py @@ -93,8 +93,22 @@ def find_pvalue_violation_indices_continuous(n, U, S, R, max_pvalue, return pvalue_violation_indices +def find_min_diff_violation_indices(n, X, min_diff): + min_diff_violation_indices = [] + + for i in range(n - 1): + for k in range(i + 1): + x = X[i][k] + for j in range(i + 1, n): + x2 = X[j][i + 1] + if abs(x - x2) < min_diff: + min_diff_violation_indices.append(([i, k], [j, i+1])) + + return min_diff_violation_indices + + def model_data(divergence, n_nonevent, n_event, max_pvalue, max_pvalue_policy, - scale=None, return_nonevent_event=False): + min_event_rate_diff, scale=None, return_nonevent_event=False): n = len(n_nonevent) t_n_event = n_event.sum() @@ -143,20 +157,32 @@ def model_data(divergence, n_nonevent, n_event, max_pvalue, max_pvalue_policy, else: pvalue_violation_indices = [] + if min_event_rate_diff > 0: + if scale is not None: + min_diff = int(min_event_rate_diff * scale) + else: + min_diff = min_event_rate_diff + + min_diff_violation_indices = find_min_diff_violation_indices( + n, D, min_diff) + else: + min_diff_violation_indices = [] + if return_nonevent_event: - return D, V, NE, E, pvalue_violation_indices + return D, V, NE, E, pvalue_violation_indices, min_diff_violation_indices - return D, V, pvalue_violation_indices + return D, V, pvalue_violation_indices, min_diff_violation_indices def multiclass_model_data(n_nonevent, n_event, max_pvalue, max_pvalue_policy, - scale=None): + min_event_rate_diff, scale=None): n, n_classes = n_nonevent.shape DD = [] - PV = [] VV = [] + PV = [] + MD = [] for c in range(n_classes): t_n_event = n_event[:, c].sum() @@ -197,15 +223,27 @@ def multiclass_model_data(n_nonevent, n_event, max_pvalue, max_pvalue_policy, else: pvalue_violation_indices = [] + if min_event_rate_diff > 0: + if scale is not None: + min_diff = int(min_event_rate_diff * scale) + else: + min_diff = min_event_rate_diff + + min_diff_violation_indices = find_min_diff_violation_indices( + n, D, min_diff) + else: + min_diff_violation_indices = [] + DD.append(D) VV.append(V) PV.append(pvalue_violation_indices) + MD.append(min_diff_violation_indices) - return DD, VV, PV + return DD, VV, PV, MD def continuous_model_data(n_records, sums, ssums, max_pvalue, - max_pvalue_policy, scale=None): + max_pvalue_policy, min_mean_diff, scale=None): n =
len(n_records) @@ -250,4 +288,15 @@ def continuous_model_data(n_records, sums, ssums, max_pvalue, else: pvalue_violation_indices = [] - return U, V, pvalue_violation_indices + if min_mean_diff > 0: + if scale is not None: + min_diff = int(min_mean_diff * scale) + else: + min_diff = min_mean_diff + + min_diff_violation_indices = find_min_diff_violation_indices( + n, U, min_diff) + else: + min_diff_violation_indices = [] + + return U, V, pvalue_violation_indices, min_diff_violation_indices diff --git a/optbinning/binning/multiclass_binning.py b/optbinning/binning/multiclass_binning.py index 9f0db75..5399af2 100644 --- a/optbinning/binning/multiclass_binning.py +++ b/optbinning/binning/multiclass_binning.py @@ -32,10 +32,11 @@ def _check_parameters(name, prebinning_method, solver, max_n_prebins, min_prebin_size, min_n_bins, max_n_bins, min_bin_size, - max_bin_size, monotonic_trend, max_pvalue, - max_pvalue_policy, outlier_detector, outlier_params, - user_splits, user_splits_fixed, special_codes, - split_digits, mip_solver, time_limit, verbose): + max_bin_size, monotonic_trend, min_event_rate_diff, + max_pvalue, max_pvalue_policy, outlier_detector, + outlier_params, user_splits, user_splits_fixed, + special_codes, split_digits, mip_solver, time_limit, + verbose): if not isinstance(name, str): raise TypeError("name must be a string.") @@ -110,6 +111,11 @@ def _check_parameters(name, prebinning_method, solver, max_n_prebins, raise ValueError("Invalid value for monotonic trend; got {}." .format(monotonic_trend)) + if (not isinstance(min_event_rate_diff, numbers.Number) or + not 0. <= min_event_rate_diff <= 1.0): + raise ValueError("min_event_rate_diff must be in [0, 1]; got {}." + .format(min_event_rate_diff)) + if max_pvalue is not None: if (not isinstance(max_pvalue, numbers.Number) or not 0. < max_pvalue <= 1.0): @@ -234,6 +240,11 @@ class MulticlassOptimalBinning(OptimalBinning): "peak_heuristic", "valley_heuristic" and None, one for each class. If None, then the monotonic constraint is disabled. + min_event_rate_diff : float, optional (default=0) + The minimum event rate difference between consecutives bins. + + .. versionadded:: 0.17.0 + max_pvalue : float or None, optional (default=None) The maximum p-value among bins. The Z-test is used to detect bins not satisfying the p-value constraint. 
@@ -296,7 +307,8 @@ class MulticlassOptimalBinning(OptimalBinning): def __init__(self, name="", prebinning_method="cart", solver="cp", max_n_prebins=20, min_prebin_size=0.05, min_n_bins=None, max_n_bins=None, min_bin_size=None, - max_bin_size=None, monotonic_trend="auto", max_pvalue=None, + max_bin_size=None, monotonic_trend="auto", + min_event_rate_diff=0, max_pvalue=None, max_pvalue_policy="consecutive", outlier_detector=None, outlier_params=None, user_splits=None, user_splits_fixed=None, special_codes=None, split_digits=None, mip_solver="bop", @@ -316,6 +328,7 @@ def __init__(self, name="", prebinning_method="cart", solver="cp", self.max_bin_size = max_bin_size self.monotonic_trend = monotonic_trend + self.min_event_rate_diff = min_event_rate_diff self.max_pvalue = max_pvalue self.max_pvalue_policy = max_pvalue_policy @@ -729,14 +742,18 @@ def _fit_optimizer(self, splits, n_nonevent, n_event): if self.solver == "cp": optimizer = MulticlassBinningCP(monotonic, self.min_n_bins, self.max_n_bins, min_bin_size, - max_bin_size, self.max_pvalue, + max_bin_size, + self.min_event_rate_diff, + self.max_pvalue, self.max_pvalue_policy, self.user_splits_fixed, self.time_limit) else: optimizer = MulticlassBinningMIP(monotonic, self.min_n_bins, self.max_n_bins, min_bin_size, - max_bin_size, self.max_pvalue, + max_bin_size, + self.min_event_rate_diff, + self.max_pvalue, self.max_pvalue_policy, self.mip_solver, self.user_splits_fixed, diff --git a/optbinning/binning/multiclass_cp.py b/optbinning/binning/multiclass_cp.py index 4fea8a6..8ab70cb 100644 --- a/optbinning/binning/multiclass_cp.py +++ b/optbinning/binning/multiclass_cp.py @@ -14,8 +14,8 @@ class MulticlassBinningCP(BinningCP): def __init__(self, monotonic_trend, min_n_bins, max_n_bins, min_bin_size, - max_bin_size, max_pvalue, max_pvalue_policy, - user_splits_fixed, time_limit): + max_bin_size, min_event_rate_diff, max_pvalue, + max_pvalue_policy, user_splits_fixed, time_limit): self.monotonic_trend = monotonic_trend @@ -24,13 +24,12 @@ def __init__(self, monotonic_trend, min_n_bins, max_n_bins, min_bin_size, self.min_bin_size = min_bin_size self.max_bin_size = max_bin_size + self.min_event_rate_diff = min_event_rate_diff self.max_pvalue = max_pvalue self.max_pvalue_policy = max_pvalue_policy self.user_splits_fixed = user_splits_fixed self.time_limit = time_limit - self.min_event_rate_diff = 0 - self.solver_ = None self._model = None @@ -40,8 +39,10 @@ def __init__(self, monotonic_trend, min_n_bins, max_n_bins, min_bin_size, def build_model(self, n_nonevent, n_event, trend_changes): # Parameters M = int(1e6) - D, V, pvalue_violation_indices = multiclass_model_data( - n_nonevent, n_event, self.max_pvalue, self.max_pvalue_policy, M) + (D, V, pvalue_violation_indices, + min_diff_violation_indices) = multiclass_model_data( + n_nonevent, n_event, self.max_pvalue, self.max_pvalue_policy, + self.min_event_rate_diff, M) n = len(n_nonevent) n_records = n_nonevent + n_event @@ -101,11 +102,16 @@ def build_model(self, n_nonevent, n_event, trend_changes): self.add_constraint_monotonic_valley_heuristic( model, n, D[c], x, trend_changes[c], M) - # constraint: max-pvalue + # Constraint: max-pvalue for c in range(n_classes): self.add_max_pvalue_constraint(model, x, pvalue_violation_indices[c]) + # Constraint: min diff + for c in range(n_classes): + self.add_min_diff_constraint(model, x, + min_diff_violation_indices[c]) + # Constraint: fixed splits self.add_constraint_fixed_splits(model, n, x) diff --git a/optbinning/binning/multiclass_mip.py 
b/optbinning/binning/multiclass_mip.py index 4fc13c5..e55617a 100644 --- a/optbinning/binning/multiclass_mip.py +++ b/optbinning/binning/multiclass_mip.py @@ -14,8 +14,8 @@ class MulticlassBinningMIP(BinningMIP): def __init__(self, monotonic_trend, min_n_bins, max_n_bins, min_bin_size, - max_bin_size, max_pvalue, max_pvalue_policy, mip_solver, - user_splits_fixed, time_limit): + max_bin_size, min_event_rate_diff, max_pvalue, + max_pvalue_policy, mip_solver, user_splits_fixed, time_limit): self.monotonic_trend = monotonic_trend @@ -24,6 +24,7 @@ def __init__(self, monotonic_trend, min_n_bins, max_n_bins, min_bin_size, self.min_bin_size = min_bin_size self.max_bin_size = max_bin_size + self.min_event_rate_diff = min_event_rate_diff self.max_pvalue = max_pvalue self.max_pvalue_policy = max_pvalue_policy @@ -31,8 +32,6 @@ def __init__(self, monotonic_trend, min_n_bins, max_n_bins, min_bin_size, self.user_splits_fixed = user_splits_fixed self.time_limit = time_limit - self.min_event_rate_diff = 0 - self.solver_ = None self._n = None @@ -40,8 +39,10 @@ def __init__(self, monotonic_trend, min_n_bins, max_n_bins, min_bin_size, def build_model(self, n_nonevent, n_event, trend_changes): # Parameters - D, V, pvalue_violation_indices = multiclass_model_data( - n_nonevent, n_event, self.max_pvalue, self.max_pvalue_policy) + (D, V, pvalue_violation_indices, + min_diff_violation_indices) = multiclass_model_data( + n_nonevent, n_event, self.max_pvalue, self.max_pvalue_policy, + self.min_event_rate_diff) n = len(n_nonevent) n_records = n_nonevent + n_event @@ -106,11 +107,16 @@ def build_model(self, n_nonevent, n_event, trend_changes): self.add_constraint_monotonic_valley_heuristic( solver, n, D[c], x, trend_changes[c]) - # constraint: max-pvalue + # Constraint: max-pvalue for c in range(n_classes): self.add_max_pvalue_constraint(solver, x, pvalue_violation_indices[c]) + # Constraint: min diff + for c in range(n_classes): + self.add_min_diff_constraint(solver, x, + min_diff_violation_indices[c]) + # Constraint: fixed splits self.add_constraint_fixed_splits(solver, n, x) diff --git a/optbinning/binning/piecewise/continuous_binning.py b/optbinning/binning/piecewise/continuous_binning.py index d3ddfee..718a6be 100644 --- a/optbinning/binning/piecewise/continuous_binning.py +++ b/optbinning/binning/piecewise/continuous_binning.py @@ -303,7 +303,8 @@ def _fit(self, x, y, lb, ub, check_input): time_preprocessing = time.perf_counter() [x_clean, y_clean, x_missing, y_missing, x_special, y_special, - _, _, _, _, _, _, _] = self._fit_preprocessing(x, y, check_input) + _, _, _, _, _, sw_special, _] = self._fit_preprocessing( + x, y, check_input) self._time_preprocessing = time.perf_counter() - time_preprocessing @@ -346,7 +347,7 @@ def _fit(self, x, y, lb, ub, check_input): [self._n_records_special, self._sum_special, self._n_zeros_special, self._std_special, self._min_target_special, self._max_target_special] = target_info_special_continuous( - self.special_codes, x_special, y_special) + self.special_codes, x_special, y_special, sw_special) self._n_records_missing = len(y_missing) self._sum_missing = np.sum(y_missing) diff --git a/optbinning/binning/uncertainty/binning_scenarios.py b/optbinning/binning/uncertainty/binning_scenarios.py index e7da565..0ee5579 100644 --- a/optbinning/binning/uncertainty/binning_scenarios.py +++ b/optbinning/binning/uncertainty/binning_scenarios.py @@ -220,9 +220,7 @@ class SBOptimalBinning(OptimalBinning): the monotonic constraint is disabled. 
min_event_rate_diff : float, optional (default=0) - The minimum event rate difference between consecutives bins. This - option currently only applies when ``monotonic_trend`` is "ascending", - "descending", "peak_heuristic" or "valley_heuristic". + The minimum event rate difference between consecutives bins. max_pvalue : float or None, optional (default=None) The maximum p-value among bins. The Z-test is used to detect bins diff --git a/optbinning/options.py b/optbinning/options.py index b26133d..1789aa7 100644 --- a/optbinning/options.py +++ b/optbinning/options.py @@ -50,6 +50,7 @@ "min_bin_size": None, "max_bin_size": None, "monotonic_trend": "auto", + "min_event_rate_diff": 0, "max_pvalue": None, "max_pvalue_policy": "consecutive", "user_splits": None, diff --git a/tests/test_binning.py b/tests/test_binning.py index 83008e7..83fd9d0 100644 --- a/tests/test_binning.py +++ b/tests/test_binning.py @@ -436,6 +436,19 @@ def test_numerical_prebinning_kwargs(): assert optb_kwargs.binning_table.iv == approx(4.37337682, rel=1e-6) +def test_min_event_rate_diff(): + min_event_rate_diff = 0.01 + + for solver, mip_solver in (('cp', 'bop'), ('mip', 'bop'), ('mip', 'cbc')): + optb = OptimalBinning(solver=solver, mip_solver=mip_solver, + min_event_rate_diff=min_event_rate_diff) + optb.fit(x, y) + + event_rate = optb.binning_table.build()['Event rate'].values[:-3] + min_diff = np.absolute(event_rate[1:] - event_rate[:-1]) + assert np.all(min_diff >= min_event_rate_diff) + + def test_numerical_default_transform(): optb = OptimalBinning() with raises(NotFittedError): diff --git a/tests/test_continuous_binning.py b/tests/test_continuous_binning.py index 0629d9b..f83ddd6 100644 --- a/tests/test_continuous_binning.py +++ b/tests/test_continuous_binning.py @@ -221,6 +221,18 @@ def test_numerical_max_pvalue(): rel=1e-6) +def test_min_mean_diff(): + min_mean_diff = 2 + + optb = ContinuousOptimalBinning( + monotonic_trend=None, min_mean_diff=min_mean_diff) + optb.fit(x, y) + + mean = optb.binning_table.build()['Mean'].values[:-3] + min_diff = np.absolute(mean[1:] - mean[:-1]) + assert np.all(min_diff >= min_mean_diff) + + def test_auto_modes(): x = df["INDUS"].values
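
For context on the constraint changes above: the minimum difference constraints are now precomputed as violation index pairs in model_data and enforced by the cp and mip solvers regardless of the monotonic trend. Below is a minimal usage sketch, not an official example; it assumes the scikit-learn breast cancer data and hypothetical variable names, and mirrors the new test_min_event_rate_diff test.

import numpy as np
from sklearn.datasets import load_breast_cancer
from optbinning import OptimalBinning

# Illustrative data: binary target, single numerical feature.
data = load_breast_cancer()
x = data.data[:, 0]   # "mean radius"
y = data.target

# Require consecutive bins to differ by at least 5 percentage points in event rate.
optb = OptimalBinning(name="mean radius", dtype="numerical", solver="cp",
                      min_event_rate_diff=0.05)
optb.fit(x, y)

# Drop the trailing Special/Missing/Totals rows, as the new test does.
event_rate = optb.binning_table.build()["Event rate"].values[:-3]
print(np.abs(np.diff(event_rate)))   # every gap should be >= 0.05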
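
Similarly, ContinuousOptimalBinning now accepts sample_weight in fit and fit_transform, and min_mean_diff is honored independently of the monotonic trend. The sketch below uses synthetic data and hypothetical names; per the new docstring, the weights are only applied when prebinning_method="cart" (the default).

import numpy as np
from optbinning import ContinuousOptimalBinning

rng = np.random.RandomState(0)
x = rng.uniform(0, 10, size=1000)
y = 3 * x + rng.normal(scale=2, size=1000)        # continuous target
sample_weight = rng.uniform(0.5, 2.0, size=1000)  # per-sample weights

# Ask for bin means that differ by at least 2 between consecutive bins.
optb = ContinuousOptimalBinning(name="x", dtype="numerical", min_mean_diff=2)
optb.fit(x, y, sample_weight=sample_weight)

table = optb.binning_table.build()
print(table)                       # weighted counts, sums and means per bin
print(table["Mean"].values[:-3])   # bin means, excluding Special/Missing/Totals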