From 2b7811b4ec85ef1b6bc4f0c00c49b501ca5b8e1e Mon Sep 17 00:00:00 2001 From: eromoe Date: Thu, 11 Apr 2024 21:34:14 +0800 Subject: [PATCH 1/6] fix retrain on sequence dataset --- python-package/lightgbm/basic.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index ee55b642ffa0..81ba2fb697c3 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -1192,15 +1192,18 @@ def predict( predict_type=predict_type, ) elif isinstance(data, list): - try: - data = np.array(data) - except BaseException as err: - raise ValueError("Cannot convert data list to numpy array.") from err + if isinstance(data[0], Sequence): + data = np.concatenate([i[:] for i in data]) + else: + try: + data = np.array(data) + except BaseException as err: + raise ValueError('Cannot convert data list to numpy array.') from err preds, nrow = self.__pred_for_np2d( mat=data, start_iteration=start_iteration, num_iteration=num_iteration, - predict_type=predict_type, + predict_type=predict_type ) elif isinstance(data, dt_DataTable): preds, nrow = self.__pred_for_np2d( From a07800c1d97ee279292463a9132dac60052640f7 Mon Sep 17 00:00:00 2001 From: eromoe Date: Sun, 14 Jul 2024 14:46:15 +0800 Subject: [PATCH 2/6] add testing for incremental training on Dataset with lgb.Sequence --- python-package/lightgbm/basic.py | 5 +- tests/python_package_test/test_sequence.py | 59 ++++++++++++++++++++++ 2 files changed, 63 insertions(+), 1 deletion(-) create mode 100644 tests/python_package_test/test_sequence.py diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 81ba2fb697c3..e305bb2665ae 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -1193,7 +1193,10 @@ def predict( ) elif isinstance(data, list): if isinstance(data[0], Sequence): - data = np.concatenate([i[:] for i in data]) + try: + data = np.concatenate([i[:] for i in 
data]) + except BaseException as err: + raise ValueError('Cannot convert Sequence list to numpy array.') from err else: try: data = np.array(data) diff --git a/tests/python_package_test/test_sequence.py b/tests/python_package_test/test_sequence.py new file mode 100644 index 000000000000..44574fd8c154 --- /dev/null +++ b/tests/python_package_test/test_sequence.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python +# -*- coding:utf-8 -*- +# Author: mithril +# Created Date: 2024-07-14 14:18:46 +# Last Modified: 2024-07-14 14:44:50 + + +import numpy as np +import pytest +import sklearn.datasets +import lightgbm as lgb + + +class PartitionSequence(lgb.Sequence): + def __init__(self, data:np.ndarray, batch_size=4096): + self.data = data + self.batch_size = batch_size + + def __getitem__(self, idx): + return self.data[idx] + + def __len__(self): + return len(self.data) + + +def test_list_of_sequence(): + X, y = sklearn.datasets.load_breast_cancer(return_X_y=True) + X_seq = list() + y_seq = list() + for i in range(2): + X_seq.append(PartitionSequence(X, 200)) + y_seq.append(y) + + y = np.concatenate(y_seq) + + dataset = lgb.Dataset(X_seq, label=y, free_raw_data=False) + + params = { + "objective": "binary", + "metric": "auc", + "min_data": 10, + "num_leaves": 10, + "verbose": -1, + "num_threads": 1, + "max_bin": 255, + "gpu_use_dp": True, + } + + model1 = lgb.train( + params, + dataset, + keep_training_booster=True, + ) + + model2 = lgb.train( + params, + dataset, + init_model=model1, + ) \ No newline at end of file From 3ac186ccee852a0102e23fea23c7735ee186f0e0 Mon Sep 17 00:00:00 2001 From: eromoe Date: Mon, 15 Jul 2024 14:38:13 +0800 Subject: [PATCH 3/6] Update python-package/lightgbm/basic.py Co-authored-by: James Lamb --- python-package/lightgbm/basic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index e305bb2665ae..933768afaf34 100644 --- a/python-package/lightgbm/basic.py +++ 
b/python-package/lightgbm/basic.py @@ -1206,7 +1206,7 @@ def predict( mat=data, start_iteration=start_iteration, num_iteration=num_iteration, - predict_type=predict_type + predict_type=predict_type, ) elif isinstance(data, dt_DataTable): preds, nrow = self.__pred_for_np2d( From ecd57463080a61a46d8d293bec8a79e5ef8be372 Mon Sep 17 00:00:00 2001 From: eromoe Date: Mon, 15 Jul 2024 14:39:15 +0800 Subject: [PATCH 4/6] Update tests/python_package_test/test_sequence.py Co-authored-by: James Lamb --- tests/python_package_test/test_sequence.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/python_package_test/test_sequence.py b/tests/python_package_test/test_sequence.py index 44574fd8c154..0b5932dc5d1e 100644 --- a/tests/python_package_test/test_sequence.py +++ b/tests/python_package_test/test_sequence.py @@ -37,13 +37,9 @@ def test_list_of_sequence(): params = { "objective": "binary", - "metric": "auc", "min_data": 10, "num_leaves": 10, "verbose": -1, - "num_threads": 1, - "max_bin": 255, - "gpu_use_dp": True, } model1 = lgb.train( From b3bcf3725dc00284502f256d714303aafa91e0fd Mon Sep 17 00:00:00 2001 From: eromoe Date: Mon, 15 Jul 2024 15:01:44 +0800 Subject: [PATCH 5/6] move sequence test to test_basic --- tests/python_package_test/test_basic.py | 34 +++++++++++++ tests/python_package_test/test_sequence.py | 55 ---------------------- 2 files changed, 34 insertions(+), 55 deletions(-) delete mode 100644 tests/python_package_test/test_sequence.py diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index 7177623be02d..06c3398c299d 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -217,6 +217,40 @@ def test_sequence_get_data(num_seq): np.testing.assert_array_equal(subset_data.get_data(), X[sorted(used_indices)]) +def test_retrain_list_of_sequence(): + X, y = load_breast_cancer(return_X_y=True) + seqs = _create_sequence_from_ndarray(X, 2, 100) + + seq_ds = 
lgb.Dataset(seqs, label=y, free_raw_data=False) + + params = { + "objective": "binary", + "num_boost_round": 20, + "min_data": 10, + "num_leaves": 10, + "verbose": -1, + } + + model1 = lgb.train( + params, + seq_ds, + keep_training_booster=True, + ) + + assert model1.current_iteration() == 20 + assert model1.num_trees() == 20 + + model2 = lgb.train( + params, + seq_ds, + init_model=model1, + ) + + assert model2.current_iteration() == 20 + assert model2.num_trees() == 20 + + assert seq_ds.get_data() == seqs + def test_chunked_dataset(): X_train, X_test, y_train, y_test = train_test_split( *load_breast_cancer(return_X_y=True), test_size=0.1, random_state=2 diff --git a/tests/python_package_test/test_sequence.py b/tests/python_package_test/test_sequence.py deleted file mode 100644 index 0b5932dc5d1e..000000000000 --- a/tests/python_package_test/test_sequence.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python -# -*- coding:utf-8 -*- -# Author: mithril -# Created Date: 2024-07-14 14:18:46 -# Last Modified: 2024-07-14 14:44:50 - - -import numpy as np -import pytest -import sklearn.datasets -import lightgbm as lgb - - -class PartitionSequence(lgb.Sequence): - def __init__(self, data:np.ndarray, batch_size=4096): - self.data = data - self.batch_size = batch_size - - def __getitem__(self, idx): - return self.data[idx] - - def __len__(self): - return len(self.data) - - -def test_list_of_sequence(): - X, y = sklearn.datasets.load_breast_cancer(return_X_y=True) - X_seq = list() - y_seq = list() - for i in range(2): - X_seq.append(PartitionSequence(X, 200)) - y_seq.append(y) - - y = np.concatenate(y_seq) - - dataset = lgb.Dataset(X_seq, label=y, free_raw_data=False) - - params = { - "objective": "binary", - "min_data": 10, - "num_leaves": 10, - "verbose": -1, - } - - model1 = lgb.train( - params, - dataset, - keep_training_booster=True, - ) - - model2 = lgb.train( - params, - dataset, - init_model=model1, - ) \ No newline at end of file From 
48f062caac777d55b5307789ff680cb298baea22 Mon Sep 17 00:00:00 2001 From: eromoe Date: Mon, 15 Jul 2024 15:09:15 +0800 Subject: [PATCH 6/6] add test for seq_ds --- tests/python_package_test/test_basic.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index 06c3398c299d..932a89107992 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -223,6 +223,10 @@ def test_retrain_list_of_sequence(): seq_ds = lgb.Dataset(seqs, label=y, free_raw_data=False) + assert sum([len(s) for s in seq_ds.get_data()]) == X.shape[0] + assert len(seq_ds.get_feature_name()) == X.shape[1] + assert seq_ds.get_data() == seqs + params = { "objective": "binary", "num_boost_round": 20, @@ -249,7 +253,6 @@ def test_retrain_list_of_sequence(): assert model2.current_iteration() == 20 assert model2.num_trees() == 20 - assert seq_ds.get_data() == seqs def test_chunked_dataset(): X_train, X_test, y_train, y_test = train_test_split(