From 001e8d97a0c0efc84a39545d4f4d6459aacf758e Mon Sep 17 00:00:00 2001 From: Sam Morris Date: Tue, 12 Mar 2024 17:08:11 +0000 Subject: [PATCH 01/18] Improved logging when plugins are cancelled for taking too long Fixes: https://github.com/freeipa/freeipa-healthcheck/issues/322 Signed-off-by: Sam Morris --- src/ipahealthcheck/core/core.py | 50 ++++++++++++++++++++++++++------- 1 file changed, 40 insertions(+), 10 deletions(-) diff --git a/src/ipahealthcheck/core/core.py b/src/ipahealthcheck/core/core.py index 1b71fd0a..ad0e0277 100644 --- a/src/ipahealthcheck/core/core.py +++ b/src/ipahealthcheck/core/core.py @@ -45,9 +45,19 @@ def find_plugins(name, registry): def run_plugin(plugin, available=(), timeout=constants.DEFAULT_TIMEOUT): + plugin_name = F"{plugin.__class__.__module__}:{plugin.__class__.__name__}" + timed_out = [] def signal_handler(signum, frame): - raise TimeoutError('Request timed out') - + # Our exception will likely be caught by code called by the plugin, + # which may not log the traceback, or may even catch the exception + # itself! So while we throw the exception in order to interrupt the + # plugin code, we also stash a copy of it so that we can log it after + # the plugin returns. + timed_out.append(TimeoutError(f"Health check {plugin_name} cancelled after after {timeout} sec")) + logger.error("--- %s ---", timed_out[0]) + traceback.print_stack() + logger.error("--- The following messages were logged by the plugin after it was cancelled. They may not indicate the reason why the plugin took too long to run. ---") + raise timed_out[0] # manually calculate duration when we create results of our own start = datetime.now(tz=timezone.utc) signal.signal(signal.SIGALRM, signal_handler) @@ -58,16 +68,36 @@ def signal_handler(signum, frame): # Treat no result as success, fudge start time result = Result(plugin, constants.SUCCESS, start=start) yield result - except TimeoutError as e: - yield Result(plugin, constants.ERROR, exception=str(e), - start=start) except Exception as e: - logger.debug('Exception raised: %s', e) - logger.debug(traceback.format_exc()) - yield Result(plugin, constants.CRITICAL, exception=str(e), - traceback=traceback.format_exc(), - start=start) + if e is TimeoutError: + # The plugin code _may_ have raised our original TimeoutError + # however we also have to deal with cases where it did not raise + # anything! We do that in the finally block below; we don't + # therefore need to do anything here. + pass + else: + # We've got no way to know whether the plugin's own exception was + # related to our TimeoutError; if it was then we will yield a + # result here based on the plugin's own exception, and _also_ later + # on in the finally block. + logger.log('Exception raised in health check %r: %s', plugin_name, e, level=constants.CRITICAL, exc_info=True) + yield Result(plugin, constants.CRITICAL, exception=str(e), + traceback=traceback.format_exc(), + start=start) finally: + if timed_out: + logger.error("--- Increasing the timeout in /etc/ipahealthcheck/ipahealthcheck.conf may allow this healthcheck to complete before the deadline. ---") + # If the plugin already handled our TimeoutError properly (i.e, + # logged it and raised its own exception with a sensible error + # message) then there is no need to yield our own error result + # here; however we can't rely on the plugin doing so. It may have + # caught our TimeoutError and logged something misleading; in that + # case let's err on the side of caution and return an additional + # result. + yield Result(plugin, constants.ERROR, exception=str(timed_out[0]), + start=start) + else: + logging.debug("Plugin %s complete after %s sec", plugin_name, datetime.utcnow() - start) signal.alarm(0) From 47473e0ed7ced817ba59da0810edc11108271c47 Mon Sep 17 00:00:00 2001 From: Sam Morris Date: Tue, 12 Mar 2024 17:19:21 +0000 Subject: [PATCH 02/18] appease flake8 --- src/ipahealthcheck/core/core.py | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/src/ipahealthcheck/core/core.py b/src/ipahealthcheck/core/core.py index ad0e0277..71e9665e 100644 --- a/src/ipahealthcheck/core/core.py +++ b/src/ipahealthcheck/core/core.py @@ -47,17 +47,25 @@ def find_plugins(name, registry): def run_plugin(plugin, available=(), timeout=constants.DEFAULT_TIMEOUT): plugin_name = F"{plugin.__class__.__module__}:{plugin.__class__.__name__}" timed_out = [] + def signal_handler(signum, frame): # Our exception will likely be caught by code called by the plugin, # which may not log the traceback, or may even catch the exception # itself! So while we throw the exception in order to interrupt the # plugin code, we also stash a copy of it so that we can log it after # the plugin returns. - timed_out.append(TimeoutError(f"Health check {plugin_name} cancelled after after {timeout} sec")) + timed_out.append(TimeoutError( + f"Health check {plugin_name} cancelled after after {timeout} sec" + )) logger.error("--- %s ---", timed_out[0]) traceback.print_stack() - logger.error("--- The following messages were logged by the plugin after it was cancelled. They may not indicate the reason why the plugin took too long to run. ---") + logger.error( + "--- The following messages were logged by the plugin after it" + " was cancelled. They may not indicate the reason why the plugin" + " took too long to run. ---" + ) raise timed_out[0] + # manually calculate duration when we create results of our own start = datetime.now(tz=timezone.utc) signal.signal(signal.SIGALRM, signal_handler) @@ -78,15 +86,22 @@ def signal_handler(signum, frame): else: # We've got no way to know whether the plugin's own exception was # related to our TimeoutError; if it was then we will yield a - # result here based on the plugin's own exception, and _also_ later - # on in the finally block. - logger.log('Exception raised in health check %r: %s', plugin_name, e, level=constants.CRITICAL, exc_info=True) + # result here based on the plugin's own exception, and _also_ + # later on in the finally block. + logger.log( + 'Exception raised in health check %r: %s', + plugin_name, e, level=constants.CRITICAL, exc_info=True + ) yield Result(plugin, constants.CRITICAL, exception=str(e), traceback=traceback.format_exc(), start=start) finally: if timed_out: - logger.error("--- Increasing the timeout in /etc/ipahealthcheck/ipahealthcheck.conf may allow this healthcheck to complete before the deadline. ---") + logger.error( + "--- Increasing the timeout in" + " /etc/ipahealthcheck/ipahealthcheck.conf may allow this" + " healthcheck to complete before the deadline. ---" + ) # If the plugin already handled our TimeoutError properly (i.e, # logged it and raised its own exception with a sensible error # message) then there is no need to yield our own error result @@ -97,7 +112,10 @@ def signal_handler(signum, frame): yield Result(plugin, constants.ERROR, exception=str(timed_out[0]), start=start) else: - logging.debug("Plugin %s complete after %s sec", plugin_name, datetime.utcnow() - start) + logging.debug( + "Plugin %s complete after %s sec", + plugin_name, datetime.utcnow() - start + ) signal.alarm(0) From e4512583fc37c7dc0faeb272ac9237d54c34a3a7 Mon Sep 17 00:00:00 2001 From: Sam Morris Date: Tue, 12 Mar 2024 17:27:22 +0000 Subject: [PATCH 03/18] use new way to get utcnow --- src/ipahealthcheck/core/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ipahealthcheck/core/core.py b/src/ipahealthcheck/core/core.py index 71e9665e..8658d9e1 100644 --- a/src/ipahealthcheck/core/core.py +++ b/src/ipahealthcheck/core/core.py @@ -114,7 +114,7 @@ def signal_handler(signum, frame): else: logging.debug( "Plugin %s complete after %s sec", - plugin_name, datetime.utcnow() - start + plugin_name, datetime.now(tz=timezone.utc) - start ) signal.alarm(0) From 5f54da6b103d8ae608c60f665a0cb2a4a06c137e Mon Sep 17 00:00:00 2001 From: Sam Morris Date: Tue, 12 Mar 2024 17:32:19 +0000 Subject: [PATCH 04/18] fix logging exceptions raised by plugins --- src/ipahealthcheck/core/core.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ipahealthcheck/core/core.py b/src/ipahealthcheck/core/core.py index 8658d9e1..cf624fc2 100644 --- a/src/ipahealthcheck/core/core.py +++ b/src/ipahealthcheck/core/core.py @@ -88,9 +88,9 @@ def signal_handler(signum, frame): # related to our TimeoutError; if it was then we will yield a # result here based on the plugin's own exception, and _also_ # later on in the finally block. - logger.log( - 'Exception raised in health check %r: %s', - plugin_name, e, level=constants.CRITICAL, exc_info=True + logger.exception( + "Exception raised in health check %r", + plugin_name ) yield Result(plugin, constants.CRITICAL, exception=str(e), traceback=traceback.format_exc(), From d6f6f205c8feafe3af66adcb7edb0fdf4c6db5d0 Mon Sep 17 00:00:00 2001 From: Sam Morris Date: Tue, 12 Mar 2024 19:12:42 +0000 Subject: [PATCH 05/18] fixup --- src/ipahealthcheck/core/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ipahealthcheck/core/core.py b/src/ipahealthcheck/core/core.py index cf624fc2..3ce50e2a 100644 --- a/src/ipahealthcheck/core/core.py +++ b/src/ipahealthcheck/core/core.py @@ -55,7 +55,7 @@ def signal_handler(signum, frame): # plugin code, we also stash a copy of it so that we can log it after # the plugin returns. timed_out.append(TimeoutError( - f"Health check {plugin_name} cancelled after after {timeout} sec" + f"Health check {plugin_name} cancelled after {timeout} sec" )) logger.error("--- %s ---", timed_out[0]) traceback.print_stack() @@ -77,7 +77,7 @@ def signal_handler(signum, frame): result = Result(plugin, constants.SUCCESS, start=start) yield result except Exception as e: - if e is TimeoutError: + if isinstance(e, TimeoutError): # The plugin code _may_ have raised our original TimeoutError # however we also have to deal with cases where it did not raise # anything! We do that in the finally block below; we don't From c55e55648c3552ee2a2e2ff37fb11cbc5728aa79 Mon Sep 17 00:00:00 2001 From: Sam Morris Date: Tue, 12 Mar 2024 19:12:58 +0000 Subject: [PATCH 06/18] Fix Rest expected behaviour WRT number of exceptions raised for each plugin behaviour --- tests/test_plugins.py | 87 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 79 insertions(+), 8 deletions(-) diff --git a/tests/test_plugins.py b/tests/test_plugins.py index d6ec9528..eafaddaf 100644 --- a/tests/test_plugins.py +++ b/tests/test_plugins.py @@ -13,11 +13,55 @@ def test_timeout(): """ Test that timeouts are detected. """ - class plugin1(Plugin): + class plugin_slow_passing_exception_up(Plugin): + ''' + Some plugins will not catch unexpected exceptions, and they will be + passed up to the caller. + ''' def check(self): time.sleep(5) - class plugin2(Plugin): + class plugin_slow_catching_exception_and_ignoring(Plugin): + ''' + Some plugins will catch unexpected exceptions and handle them in + unpredictable ways, such as ignoring them. + ''' + def check(self): + try: + time.sleep(5) + except: + pass + return # ugly syntax to let us yield no results + yield + + class plugin_slow_catching_exception_and_handling(Plugin): + ''' + Some plugins will catch unexpected exceptions but handle them + sanely, such as by yielding an error result. + + In this case, the user will get two failure results; one from + the plugin itself, and one from the exception handler. + ''' + def check(self): + try: + time.sleep(5) + except Exception as e: + yield Result(self, constants.ERROR, key='test', msg='fail', exception=str(e)) + + class plugin_slow_raising_exception(Plugin): + ''' + Some plugins will catch unexpected exceptions and handle them in + unpredictable ways, such as raising their own exception. + + The user will get one failure result, from the exception handler. + ''' + def check(self): + try: + time.sleep(5) + except: + raise Exception("I didn't expect an exception to be thrown") + + class plugin_fast(Plugin): def check(self): yield Result(self, constants.SUCCESS, key='test', msg='pass') @@ -25,16 +69,43 @@ def check(self): r = Registry() # Register the plugins - r(plugin1) - r(plugin2) + r(plugin_slow_passing_exception_up) + r(plugin_slow_catching_exception_and_ignoring) + r(plugin_slow_catching_exception_and_handling) + r(plugin_slow_raising_exception) + r(plugin_fast) # Collect the results results = run_plugins(r.get_plugins(), (), None, None, {}, timeout=1) - assert len(results.results) == 2 + assert 7 == len(results.results) + assert results.results[0].kw.get('exception') == 'Health check test_plugins:plugin_slow_passing_exception_up cancelled after 1 sec' assert results.results[0].result == constants.ERROR - assert results.results[0].kw.get('exception') == 'Request timed out' + assert not results.results[0].kw.get('traceback') + + assert results.results[1].kw.get('exception') == 'Health check test_plugins:plugin_slow_catching_exception_and_ignoring cancelled after 1 sec' + assert results.results[1].result == constants.ERROR + assert not results.results[1].kw.get('traceback') + + assert results.results[2].kw.get('exception') == 'Health check test_plugins:plugin_slow_catching_exception_and_handling cancelled after 1 sec' + assert results.results[2].result == constants.ERROR + assert results.results[2].kw.get('msg') == 'fail' + assert not results.results[2].kw.get('traceback') + + assert results.results[3].kw.get('exception')== 'Health check test_plugins:plugin_slow_catching_exception_and_handling cancelled after 1 sec' + assert results.results[3].result == constants.ERROR + assert not results.results[3].kw.get('traceback') + + assert results.results[4].kw.get('exception') == "I didn't expect an exception to be thrown" + assert results.results[4].result == constants.CRITICAL + assert results.results[4].kw.get('traceback') + + assert results.results[5].kw.get('exception') == 'Health check test_plugins:plugin_slow_raising_exception cancelled after 1 sec' + assert results.results[5].result == constants.ERROR + assert not results.results[5].kw.get('traceback') - assert results.results[1].result == constants.SUCCESS - assert results.results[1].kw.get('msg') == 'pass' + assert results.results[6].kw.get('msg') == 'pass' + assert not results.results[6].kw.get('exception') + assert results.results[6].result == constants.SUCCESS + assert not results.results[6].kw.get('traceback') From e82e074fdfd428b1d0706b3b40015764f1d96c51 Mon Sep 17 00:00:00 2001 From: Sam Morris Date: Tue, 12 Mar 2024 19:43:13 +0000 Subject: [PATCH 07/18] appease flake8 --- tests/test_plugins.py | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/tests/test_plugins.py b/tests/test_plugins.py index eafaddaf..e8407064 100644 --- a/tests/test_plugins.py +++ b/tests/test_plugins.py @@ -29,7 +29,7 @@ class plugin_slow_catching_exception_and_ignoring(Plugin): def check(self): try: time.sleep(5) - except: + except Exception: pass return # ugly syntax to let us yield no results yield @@ -46,7 +46,11 @@ def check(self): try: time.sleep(5) except Exception as e: - yield Result(self, constants.ERROR, key='test', msg='fail', exception=str(e)) + yield Result( + self, constants.ERROR, + key='test', msg='fail', + exception=str(e) + ) class plugin_slow_raising_exception(Plugin): ''' @@ -58,7 +62,7 @@ class plugin_slow_raising_exception(Plugin): def check(self): try: time.sleep(5) - except: + except Exception: raise Exception("I didn't expect an exception to be thrown") class plugin_fast(Plugin): @@ -80,28 +84,38 @@ def check(self): assert 7 == len(results.results) - assert results.results[0].kw.get('exception') == 'Health check test_plugins:plugin_slow_passing_exception_up cancelled after 1 sec' + assert results.results[0].kw.get('exception') == 'Health check' \ + ' test_plugins:plugin_slow_passing_exception_up' \ + ' cancelled after 1 sec' assert results.results[0].result == constants.ERROR assert not results.results[0].kw.get('traceback') - assert results.results[1].kw.get('exception') == 'Health check test_plugins:plugin_slow_catching_exception_and_ignoring cancelled after 1 sec' + assert results.results[1].kw.get('exception') == 'Health check' \ + ' test_plugins:plugin_slow_catching_exception_and_ignoring' \ + ' cancelled after 1 sec' assert results.results[1].result == constants.ERROR assert not results.results[1].kw.get('traceback') - assert results.results[2].kw.get('exception') == 'Health check test_plugins:plugin_slow_catching_exception_and_handling cancelled after 1 sec' + assert results.results[2].kw.get('exception') == 'Health check' \ + ' test_plugins:plugin_slow_catching_exception_and_handling' \ + ' cancelled after 1 sec' assert results.results[2].result == constants.ERROR assert results.results[2].kw.get('msg') == 'fail' assert not results.results[2].kw.get('traceback') - assert results.results[3].kw.get('exception')== 'Health check test_plugins:plugin_slow_catching_exception_and_handling cancelled after 1 sec' + assert results.results[3].kw.get('exception') == 'Health check' \ + ' test_plugins:plugin_slow_catching_exception_and_handling' \ + ' cancelled after 1 sec' assert results.results[3].result == constants.ERROR assert not results.results[3].kw.get('traceback') - assert results.results[4].kw.get('exception') == "I didn't expect an exception to be thrown" + assert results.results[4].kw.get('exception') == "I didn't expect an" \ + " exception to be thrown" assert results.results[4].result == constants.CRITICAL assert results.results[4].kw.get('traceback') - assert results.results[5].kw.get('exception') == 'Health check test_plugins:plugin_slow_raising_exception cancelled after 1 sec' + assert results.results[5].kw.get('exception') == 'Health check' \ + ' test_plugins:plugin_slow_raising_exception cancelled after 1 sec' assert results.results[5].result == constants.ERROR assert not results.results[5].kw.get('traceback') From 5265f8cfc7fe9907a9a546386a394f3fabfa09cb Mon Sep 17 00:00:00 2001 From: Sam Morris Date: Wed, 13 Mar 2024 02:50:14 +0000 Subject: [PATCH 08/18] Use a dedicated key so that we can disambiguate failures caused by the timeout mechanism itself from failures caused by healthcheck plugins catching the TimeoutException and returning its own failures --- src/ipahealthcheck/core/core.py | 2 +- tests/test_plugins.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/ipahealthcheck/core/core.py b/src/ipahealthcheck/core/core.py index 3ce50e2a..135dc47d 100644 --- a/src/ipahealthcheck/core/core.py +++ b/src/ipahealthcheck/core/core.py @@ -110,7 +110,7 @@ def signal_handler(signum, frame): # case let's err on the side of caution and return an additional # result. yield Result(plugin, constants.ERROR, exception=str(timed_out[0]), - start=start) + key="healthcheck_timeout", start=start) else: logging.debug( "Plugin %s complete after %s sec", diff --git a/tests/test_plugins.py b/tests/test_plugins.py index e8407064..5f4d44e1 100644 --- a/tests/test_plugins.py +++ b/tests/test_plugins.py @@ -88,38 +88,38 @@ def check(self): ' test_plugins:plugin_slow_passing_exception_up' \ ' cancelled after 1 sec' assert results.results[0].result == constants.ERROR - assert not results.results[0].kw.get('traceback') + assert results.results[0].kw.get('key') == 'healthcheck_timeout' assert results.results[1].kw.get('exception') == 'Health check' \ ' test_plugins:plugin_slow_catching_exception_and_ignoring' \ ' cancelled after 1 sec' assert results.results[1].result == constants.ERROR - assert not results.results[1].kw.get('traceback') + assert results.results[1].kw.get('key') == 'healthcheck_timeout' assert results.results[2].kw.get('exception') == 'Health check' \ ' test_plugins:plugin_slow_catching_exception_and_handling' \ ' cancelled after 1 sec' assert results.results[2].result == constants.ERROR assert results.results[2].kw.get('msg') == 'fail' - assert not results.results[2].kw.get('traceback') + assert results.results[2].kw.get('key') == 'test' assert results.results[3].kw.get('exception') == 'Health check' \ ' test_plugins:plugin_slow_catching_exception_and_handling' \ ' cancelled after 1 sec' assert results.results[3].result == constants.ERROR - assert not results.results[3].kw.get('traceback') + assert results.results[3].kw.get('key') == 'healthcheck_timeout' assert results.results[4].kw.get('exception') == "I didn't expect an" \ " exception to be thrown" assert results.results[4].result == constants.CRITICAL - assert results.results[4].kw.get('traceback') + assert not results.results[4].kw.get('key') assert results.results[5].kw.get('exception') == 'Health check' \ ' test_plugins:plugin_slow_raising_exception cancelled after 1 sec' assert results.results[5].result == constants.ERROR - assert not results.results[5].kw.get('traceback') + assert results.results[5].kw.get('key') == 'healthcheck_timeout' assert results.results[6].kw.get('msg') == 'pass' assert not results.results[6].kw.get('exception') assert results.results[6].result == constants.SUCCESS - assert not results.results[6].kw.get('traceback') + assert results.results[6].kw.get('key') == 'test' From 72f3281a41fb18adbfba345d7095a6d38f655486 Mon Sep 17 00:00:00 2001 From: Sam Morris Date: Wed, 13 Mar 2024 09:20:48 +0000 Subject: [PATCH 09/18] simplify --- src/ipahealthcheck/core/core.py | 36 ++++++++++++++++----------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/src/ipahealthcheck/core/core.py b/src/ipahealthcheck/core/core.py index 135dc47d..5f33a59e 100644 --- a/src/ipahealthcheck/core/core.py +++ b/src/ipahealthcheck/core/core.py @@ -76,25 +76,25 @@ def signal_handler(signum, frame): # Treat no result as success, fudge start time result = Result(plugin, constants.SUCCESS, start=start) yield result + except TimeoutError: + # The plugin code _may_ have raised our original TimeoutError however + # we also have to deal with cases where it did not raise anything! In + # the finally block below we handle such cases, and since we already + # stashed the TimeoutError that we raised in timed_out[0], we don't + # need to handle it here. + pass except Exception as e: - if isinstance(e, TimeoutError): - # The plugin code _may_ have raised our original TimeoutError - # however we also have to deal with cases where it did not raise - # anything! We do that in the finally block below; we don't - # therefore need to do anything here. - pass - else: - # We've got no way to know whether the plugin's own exception was - # related to our TimeoutError; if it was then we will yield a - # result here based on the plugin's own exception, and _also_ - # later on in the finally block. - logger.exception( - "Exception raised in health check %r", - plugin_name - ) - yield Result(plugin, constants.CRITICAL, exception=str(e), - traceback=traceback.format_exc(), - start=start) + # We've got no way to know whether the plugin's own exception was + # related to our TimeoutError; if it was then we will yield a + # result here based on the plugin's own exception, and _also_ + # later on in the finally block. + logger.exception( + "Exception raised in health check %r", + plugin_name + ) + yield Result(plugin, constants.CRITICAL, exception=str(e), + traceback=traceback.format_exc(), + start=start) finally: if timed_out: logger.error( From 63386e7aef3ff7a829c2d8a7073251afab9e9ee5 Mon Sep 17 00:00:00 2001 From: Sam Morris Date: Wed, 13 Mar 2024 09:21:11 +0000 Subject: [PATCH 10/18] should a healthcheck plugin timing out be critical? --- src/ipahealthcheck/core/core.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ipahealthcheck/core/core.py b/src/ipahealthcheck/core/core.py index 5f33a59e..822d63f0 100644 --- a/src/ipahealthcheck/core/core.py +++ b/src/ipahealthcheck/core/core.py @@ -109,6 +109,7 @@ def signal_handler(signum, frame): # caught our TimeoutError and logged something misleading; in that # case let's err on the side of caution and return an additional # result. + # TODO: suggest that timeouts are CRITICAL yield Result(plugin, constants.ERROR, exception=str(timed_out[0]), key="healthcheck_timeout", start=start) else: From ab79e14bd1f6b6df4e4ea35a2d063f7056b6fb0a Mon Sep 17 00:00:00 2001 From: Sam Morris Date: Wed, 13 Mar 2024 09:21:41 +0000 Subject: [PATCH 11/18] typo --- src/ipahealthcheck/core/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ipahealthcheck/core/core.py b/src/ipahealthcheck/core/core.py index 822d63f0..4c973639 100644 --- a/src/ipahealthcheck/core/core.py +++ b/src/ipahealthcheck/core/core.py @@ -109,7 +109,7 @@ def signal_handler(signum, frame): # caught our TimeoutError and logged something misleading; in that # case let's err on the side of caution and return an additional # result. - # TODO: suggest that timeouts are CRITICAL + # TODO: suggest that timeouts are CRITICAL? yield Result(plugin, constants.ERROR, exception=str(timed_out[0]), key="healthcheck_timeout", start=start) else: From 35a813a57f404a33bae0f3715db6fcfc3aae536e Mon Sep 17 00:00:00 2001 From: Sam Morris Date: Wed, 13 Mar 2024 09:53:45 +0000 Subject: [PATCH 12/18] provide a traceback to our timeout errors --- src/ipahealthcheck/core/core.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/ipahealthcheck/core/core.py b/src/ipahealthcheck/core/core.py index 4c973639..b68663a6 100644 --- a/src/ipahealthcheck/core/core.py +++ b/src/ipahealthcheck/core/core.py @@ -46,7 +46,7 @@ def find_plugins(name, registry): def run_plugin(plugin, available=(), timeout=constants.DEFAULT_TIMEOUT): plugin_name = F"{plugin.__class__.__module__}:{plugin.__class__.__name__}" - timed_out = [] + timed_out = {} def signal_handler(signum, frame): # Our exception will likely be caught by code called by the plugin, @@ -54,17 +54,18 @@ def signal_handler(signum, frame): # itself! So while we throw the exception in order to interrupt the # plugin code, we also stash a copy of it so that we can log it after # the plugin returns. - timed_out.append(TimeoutError( + timed_out["exception"] = TimeoutError( f"Health check {plugin_name} cancelled after {timeout} sec" - )) - logger.error("--- %s ---", timed_out[0]) + ) + logger.error("--- %s ---", timed_out["exception"]) traceback.print_stack() + timed_out["stack"] = traceback.format_stack() logger.error( "--- The following messages were logged by the plugin after it" " was cancelled. They may not indicate the reason why the plugin" " took too long to run. ---" ) - raise timed_out[0] + raise timed_out["exception"] # manually calculate duration when we create results of our own start = datetime.now(tz=timezone.utc) @@ -110,8 +111,10 @@ def signal_handler(signum, frame): # case let's err on the side of caution and return an additional # result. # TODO: suggest that timeouts are CRITICAL? - yield Result(plugin, constants.ERROR, exception=str(timed_out[0]), - key="healthcheck_timeout", start=start) + yield Result(plugin, constants.ERROR, + exception=str(timed_out["exception"]), + key="healthcheck_timeout", start=start, + traceback=timed_out["stack"]) else: logging.debug( "Plugin %s complete after %s sec", From 9eaab1fd16f3bd4ddf3c4cf7faea9817ec6c2261 Mon Sep 17 00:00:00 2001 From: Sam Morris Date: Wed, 13 Mar 2024 17:26:35 +0000 Subject: [PATCH 13/18] Use constent terminology - call them 'checks' --- src/ipahealthcheck/core/core.py | 9 +++++---- tests/test_plugins.py | 10 +++++----- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/ipahealthcheck/core/core.py b/src/ipahealthcheck/core/core.py index b68663a6..829a7174 100644 --- a/src/ipahealthcheck/core/core.py +++ b/src/ipahealthcheck/core/core.py @@ -55,13 +55,13 @@ def signal_handler(signum, frame): # plugin code, we also stash a copy of it so that we can log it after # the plugin returns. timed_out["exception"] = TimeoutError( - f"Health check {plugin_name} cancelled after {timeout} sec" + f"Check {plugin_name} cancelled after {timeout} sec" ) logger.error("--- %s ---", timed_out["exception"]) traceback.print_stack() timed_out["stack"] = traceback.format_stack() logger.error( - "--- The following messages were logged by the plugin after it" + "--- The following messages were logged by the check after it" " was cancelled. They may not indicate the reason why the plugin" " took too long to run. ---" ) @@ -71,6 +71,7 @@ def signal_handler(signum, frame): start = datetime.now(tz=timezone.utc) signal.signal(signal.SIGALRM, signal_handler) signal.alarm(timeout) + try: for result in plugin.check(): if result is None: @@ -90,7 +91,7 @@ def signal_handler(signum, frame): # result here based on the plugin's own exception, and _also_ # later on in the finally block. logger.exception( - "Exception raised in health check %r", + "Exception raised in check %r", plugin_name ) yield Result(plugin, constants.CRITICAL, exception=str(e), @@ -117,7 +118,7 @@ def signal_handler(signum, frame): traceback=timed_out["stack"]) else: logging.debug( - "Plugin %s complete after %s sec", + "--- Check %s complete after %s sec ---", plugin_name, datetime.now(tz=timezone.utc) - start ) signal.alarm(0) diff --git a/tests/test_plugins.py b/tests/test_plugins.py index 5f4d44e1..f7d7eb5b 100644 --- a/tests/test_plugins.py +++ b/tests/test_plugins.py @@ -84,26 +84,26 @@ def check(self): assert 7 == len(results.results) - assert results.results[0].kw.get('exception') == 'Health check' \ + assert results.results[0].kw.get('exception') == 'Check' \ ' test_plugins:plugin_slow_passing_exception_up' \ ' cancelled after 1 sec' assert results.results[0].result == constants.ERROR assert results.results[0].kw.get('key') == 'healthcheck_timeout' - assert results.results[1].kw.get('exception') == 'Health check' \ + assert results.results[1].kw.get('exception') == 'Check' \ ' test_plugins:plugin_slow_catching_exception_and_ignoring' \ ' cancelled after 1 sec' assert results.results[1].result == constants.ERROR assert results.results[1].kw.get('key') == 'healthcheck_timeout' - assert results.results[2].kw.get('exception') == 'Health check' \ + assert results.results[2].kw.get('exception') == 'Check' \ ' test_plugins:plugin_slow_catching_exception_and_handling' \ ' cancelled after 1 sec' assert results.results[2].result == constants.ERROR assert results.results[2].kw.get('msg') == 'fail' assert results.results[2].kw.get('key') == 'test' - assert results.results[3].kw.get('exception') == 'Health check' \ + assert results.results[3].kw.get('exception') == 'Check' \ ' test_plugins:plugin_slow_catching_exception_and_handling' \ ' cancelled after 1 sec' assert results.results[3].result == constants.ERROR @@ -114,7 +114,7 @@ def check(self): assert results.results[4].result == constants.CRITICAL assert not results.results[4].kw.get('key') - assert results.results[5].kw.get('exception') == 'Health check' \ + assert results.results[5].kw.get('exception') == 'Check' \ ' test_plugins:plugin_slow_raising_exception cancelled after 1 sec' assert results.results[5].result == constants.ERROR assert results.results[5].kw.get('key') == 'healthcheck_timeout' From 14294d0c84fc96c649276d0902735f925032917a Mon Sep 17 00:00:00 2001 From: Sam Morris Date: Wed, 13 Mar 2024 17:26:59 +0000 Subject: [PATCH 14/18] timeout errors are critical --- src/ipahealthcheck/core/core.py | 3 +-- tests/test_plugins.py | 8 ++++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/ipahealthcheck/core/core.py b/src/ipahealthcheck/core/core.py index 829a7174..c4abd2f5 100644 --- a/src/ipahealthcheck/core/core.py +++ b/src/ipahealthcheck/core/core.py @@ -111,8 +111,7 @@ def signal_handler(signum, frame): # caught our TimeoutError and logged something misleading; in that # case let's err on the side of caution and return an additional # result. - # TODO: suggest that timeouts are CRITICAL? - yield Result(plugin, constants.ERROR, + yield Result(plugin, constants.CRITICAL, exception=str(timed_out["exception"]), key="healthcheck_timeout", start=start, traceback=timed_out["stack"]) diff --git a/tests/test_plugins.py b/tests/test_plugins.py index f7d7eb5b..e21c84ab 100644 --- a/tests/test_plugins.py +++ b/tests/test_plugins.py @@ -87,13 +87,13 @@ def check(self): assert results.results[0].kw.get('exception') == 'Check' \ ' test_plugins:plugin_slow_passing_exception_up' \ ' cancelled after 1 sec' - assert results.results[0].result == constants.ERROR + assert results.results[0].result == constants.CRITICAL assert results.results[0].kw.get('key') == 'healthcheck_timeout' assert results.results[1].kw.get('exception') == 'Check' \ ' test_plugins:plugin_slow_catching_exception_and_ignoring' \ ' cancelled after 1 sec' - assert results.results[1].result == constants.ERROR + assert results.results[1].result == constants.CRITICAL assert results.results[1].kw.get('key') == 'healthcheck_timeout' assert results.results[2].kw.get('exception') == 'Check' \ @@ -106,7 +106,7 @@ def check(self): assert results.results[3].kw.get('exception') == 'Check' \ ' test_plugins:plugin_slow_catching_exception_and_handling' \ ' cancelled after 1 sec' - assert results.results[3].result == constants.ERROR + assert results.results[3].result == constants.CRITICAL assert results.results[3].kw.get('key') == 'healthcheck_timeout' assert results.results[4].kw.get('exception') == "I didn't expect an" \ @@ -116,7 +116,7 @@ def check(self): assert results.results[5].kw.get('exception') == 'Check' \ ' test_plugins:plugin_slow_raising_exception cancelled after 1 sec' - assert results.results[5].result == constants.ERROR + assert results.results[5].result == constants.CRITICAL assert results.results[5].kw.get('key') == 'healthcheck_timeout' assert results.results[6].kw.get('msg') == 'pass' From 71f20f0a7d42227f9861102f8cb9bc4065149198 Mon Sep 17 00:00:00 2001 From: Sam Morris Date: Wed, 13 Mar 2024 17:28:33 +0000 Subject: [PATCH 15/18] Use -- markers --- with existing log entries that delimit output from checks --- src/ipahealthcheck/core/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ipahealthcheck/core/core.py b/src/ipahealthcheck/core/core.py index c4abd2f5..8b4d3ba6 100644 --- a/src/ipahealthcheck/core/core.py +++ b/src/ipahealthcheck/core/core.py @@ -193,7 +193,7 @@ def run_service_plugins(plugins, source, check): ) continue - logger.debug('Calling check %s', plugin) + logger.debug('--- Calling check %s ---', plugin) for result in plugin.check(): # always run the service checks so dependencies work if result is not None and result.result == constants.SUCCESS: @@ -228,7 +228,7 @@ def run_plugins(plugins, available, source, check, if not source_or_check_matches(plugin, source, check): continue - logger.debug("Calling check %s", plugin) + logger.debug("--- Calling check %s ---", plugin) if not set(plugin.requires).issubset(available): logger.debug('Skipping %s:%s because %s service(s) not running', plugin.__class__.__module__, From 176dc85531f005880a59c275af8a96bc2f8c78c4 Mon Sep 17 00:00:00 2001 From: Sam Morris Date: Mon, 18 Mar 2024 19:41:30 +0000 Subject: [PATCH 16/18] actaully chain the inner exception --- tests/test_plugins.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_plugins.py b/tests/test_plugins.py index e21c84ab..ce1fc02e 100644 --- a/tests/test_plugins.py +++ b/tests/test_plugins.py @@ -62,8 +62,8 @@ class plugin_slow_raising_exception(Plugin): def check(self): try: time.sleep(5) - except Exception: - raise Exception("I didn't expect an exception to be thrown") + except Exception as e: + raise Exception("I didn't expect an exception to be thrown") from e class plugin_fast(Plugin): def check(self): From 494d7a3199d96d0682574462692745187528369d Mon Sep 17 00:00:00 2001 From: Sam Morris Date: Tue, 19 Mar 2024 08:11:12 +0000 Subject: [PATCH 17/18] pyflake --- tests/test_plugins.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_plugins.py b/tests/test_plugins.py index ce1fc02e..53d45796 100644 --- a/tests/test_plugins.py +++ b/tests/test_plugins.py @@ -63,7 +63,9 @@ def check(self): try: time.sleep(5) except Exception as e: - raise Exception("I didn't expect an exception to be thrown") from e + raise Exception( + "I didn't expect an exception to be thrown") from e + ) class plugin_fast(Plugin): def check(self): From 350cf6b433ce1f89d3ef19e3c400576b47c4b3ef Mon Sep 17 00:00:00 2001 From: Sam Morris Date: Tue, 19 Mar 2024 08:14:01 +0000 Subject: [PATCH 18/18] syntax --- tests/test_plugins.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_plugins.py b/tests/test_plugins.py index 53d45796..337b3961 100644 --- a/tests/test_plugins.py +++ b/tests/test_plugins.py @@ -64,8 +64,8 @@ def check(self): time.sleep(5) except Exception as e: raise Exception( - "I didn't expect an exception to be thrown") from e - ) + "I didn't expect an exception to be thrown" + ) from e class plugin_fast(Plugin): def check(self):