Rework debounce handling in Model.predict and add a regression test.

openwakeword/model.py (predict):
  * The per-label prediction buffer is appended to AFTER the patience/debounce
    filtering rather than before it, so the buffer holds the filtered scores
    and the current frame is not part of its own look-back window.
  * Filtering is skipped for predictions that are already 0.0.
  * `patience` and `debounce_time` are handled as mutually exclusive branches
    (if/elif).
  * The debounce window is computed as
    ceil(debounce_time / (n_prepared_samples / 16000)) frames instead of
    assuming a fixed 80 ms per frame, and is only applied to models that have
    an entry in `threshold`.

tests/test_models.py:
  * Adds test_models_with_debounce, which runs the same clip with
    debounce_time=0 and debounce_time=1.25 and expects more than one
    above-threshold frame without debounce and exactly one with it.

NOTE(review): leading whitespace on context lines was reconstructed from the
Python nesting; if a hunk is rejected, retry with `git apply
--ignore-whitespace`. The blob ids on the `index` lines come from the original
patch and do not reflect the `clip_path` cleanup in the test hunk.
NOTE(review): the first predict_clip call passes the threshold key
"alexa_v0.1" while the second passes "alexa"; the key looks unused when
debounce_time=0, but confirm which label is intended.

diff --git a/openwakeword/model.py b/openwakeword/model.py
index 8f2ef42..6029963 100755
--- a/openwakeword/model.py
+++ b/openwakeword/model.py
@@ -327,11 +327,10 @@ def predict(self, x: np.ndarray, patience: dict = {},
                             )[0][-1]
                             predictions[cls] = verifier_prediction
 
-        # Update prediction buffer, and zero predictions for first 5 frames during model initialization
+        # Zero predictions for first 5 frames during model initialization
         for cls in predictions.keys():
             if len(self.prediction_buffer[cls]) < 5:
                 predictions[cls] = 0.0
-            self.prediction_buffer[cls].append(predictions[cls])
 
         # Get timing information
         if timing:
@@ -346,14 +345,22 @@ def predict(self, x: np.ndarray, patience: dict = {},
                 raise ValueError("Error! The `patience` and `debounce_time` arguments cannot be used together!")
             for mdl in predictions.keys():
                 parent_model = self.get_parent_model_from_label(mdl)
-                if parent_model in patience.keys():
-                    scores = np.array(self.prediction_buffer[mdl])[-patience[parent_model]:]
-                    if (scores >= threshold[parent_model]).sum() < patience[parent_model]:
-                        predictions[mdl] = 0.0
-                if debounce_time > 0:
-                    n_frames = int(debounce_time*1000/80)
-                    if (np.array(self.prediction_buffer[mdl])[-n_frames:] >= threshold[parent_model]).sum() > 0:
-                        predictions[mdl] = 0.0
+                if predictions[mdl] != 0.0:
+                    if parent_model in patience.keys():
+                        scores = np.array(self.prediction_buffer[mdl])[-patience[parent_model]:]
+                        if (scores >= threshold[parent_model]).sum() < patience[parent_model]:
+                            predictions[mdl] = 0.0
+                    elif debounce_time > 0:
+                        if parent_model in threshold.keys():
+                            n_frames = int(np.ceil(debounce_time/(n_prepared_samples/16000)))
+                            recent_predictions = np.array(self.prediction_buffer[mdl])[-n_frames:]
+                            if predictions[mdl] >= threshold[parent_model] and \
+                               (recent_predictions >= threshold[parent_model]).sum() > 0:
+                                predictions[mdl] = 0.0
+
+        # Update prediction buffer
+        for mdl in predictions.keys():
+            self.prediction_buffer[mdl].append(predictions[mdl])
 
         # (optionally) get voice activity detection scores and update model scores
         if self.vad_threshold > 0:
diff --git a/tests/test_models.py b/tests/test_models.py
index e728065..fb6defd 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -208,6 +208,25 @@ def test_models_with_speex_noise_cancellation(self):
             )
             assert 1 == 1
 
+    def test_models_with_debounce(self):
+        # Load model with defaults
+        owwModel = openwakeword.Model()
+
+        # Get test clip
+        clip_path = os.path.join("tests", "data", "alexa_test.wav")
+
+        # Predict with chunks of 1280 with and without debounce
+        predictions = owwModel.predict_clip(clip_path, debounce_time=0,
+                                            threshold={"alexa_v0.1": 0.5})
+        scores = np.array([i['alexa'] for i in predictions])
+
+        predictions = owwModel.predict_clip(clip_path, debounce_time=1.25,
+                                            threshold={"alexa": 0.5})
+        scores_with_debounce = np.array([i['alexa'] for i in predictions])
+        print(scores, scores_with_debounce)
+        assert (scores >= 0.5).sum() > 1
+        assert (scores_with_debounce >= 0.5).sum() == 1
+
     def test_models_with_vad(self):
         # Load model with defaults
         owwModel = openwakeword.Model(vad_threshold=0.5)