Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improved logging when plugins are cancelled for taking too long #323

Open
wants to merge 18 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 60 additions & 8 deletions src/ipahealthcheck/core/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,29 +45,81 @@ def find_plugins(name, registry):


def run_plugin(plugin, available=(), timeout=constants.DEFAULT_TIMEOUT):
plugin_name = F"{plugin.__class__.__module__}:{plugin.__class__.__name__}"
timed_out = {}

def signal_handler(signum, frame):
raise TimeoutError('Request timed out')
# Our exception will likely be caught by code called by the plugin,
# which may not log the traceback, or may even catch the exception
# itself! So while we throw the exception in order to interrupt the
# plugin code, we also stash a copy of it so that we can log it after
# the plugin returns.
timed_out["exception"] = TimeoutError(
f"Check {plugin_name} cancelled after {timeout} sec"
)
logger.error("--- %s ---", timed_out["exception"])
traceback.print_stack()
timed_out["stack"] = traceback.format_stack()
logger.error(
"--- The following messages were logged by the check after it"
" was cancelled. They may not indicate the reason why the plugin"
" took too long to run. ---"
)
raise timed_out["exception"]

# manually calculate duration when we create results of our own
start = datetime.now(tz=timezone.utc)
signal.signal(signal.SIGALRM, signal_handler)
signal.alarm(timeout)

try:
for result in plugin.check():
if result is None:
# Treat no result as success, fudge start time
result = Result(plugin, constants.SUCCESS, start=start)
yield result
except TimeoutError as e:
yield Result(plugin, constants.ERROR, exception=str(e),
start=start)
except TimeoutError:
# The plugin code _may_ have raised our original TimeoutError however
# we also have to deal with cases where it did not raise anything! In
# the finally block below we handle such cases, and since we already
# stashed the TimeoutError that we raised in timed_out[0], we don't
# need to handle it here.
pass
except Exception as e:
logger.debug('Exception raised: %s', e)
logger.debug(traceback.format_exc())
# We've got no way to know whether the plugin's own exception was
# related to our TimeoutError; if it was then we will yield a
# result here based on the plugin's own exception, and _also_
# later on in the finally block.
logger.exception(
"Exception raised in check %r",
plugin_name
)
yield Result(plugin, constants.CRITICAL, exception=str(e),
traceback=traceback.format_exc(),
start=start)
finally:
if timed_out:
logger.error(
"--- Increasing the timeout in"
" /etc/ipahealthcheck/ipahealthcheck.conf may allow this"
" healthcheck to complete before the deadline. ---"
)
# If the plugin already handled our TimeoutError properly (i.e,
# logged it and raised its own exception with a sensible error
# message) then there is no need to yield our own error result
# here; however we can't rely on the plugin doing so. It may have
# caught our TimeoutError and logged something misleading; in that
# case let's err on the side of caution and return an additional
# result.
yield Result(plugin, constants.CRITICAL,
exception=str(timed_out["exception"]),
key="healthcheck_timeout", start=start,
traceback=timed_out["stack"])
else:
logging.debug(
"--- Check %s complete after %s sec ---",
plugin_name, datetime.now(tz=timezone.utc) - start
)
signal.alarm(0)


Expand Down Expand Up @@ -141,7 +193,7 @@ def run_service_plugins(plugins, source, check):
)
continue

logger.debug('Calling check %s', plugin)
logger.debug('--- Calling check %s ---', plugin)
for result in plugin.check():
# always run the service checks so dependencies work
if result is not None and result.result == constants.SUCCESS:
Expand Down Expand Up @@ -176,7 +228,7 @@ def run_plugins(plugins, available, source, check,
if not source_or_check_matches(plugin, source, check):
continue

logger.debug("Calling check %s", plugin)
logger.debug("--- Calling check %s ---", plugin)
if not set(plugin.requires).issubset(available):
logger.debug('Skipping %s:%s because %s service(s) not running',
plugin.__class__.__module__,
Expand Down
105 changes: 96 additions & 9 deletions tests/test_plugins.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,28 +13,115 @@ def test_timeout():
"""
Test that timeouts are detected.
"""
class plugin1(Plugin):
class plugin_slow_passing_exception_up(Plugin):
'''
Some plugins will not catch unexpected exceptions, and they will be
passed up to the caller.
'''
def check(self):
time.sleep(5)

class plugin2(Plugin):
class plugin_slow_catching_exception_and_ignoring(Plugin):
'''
Some plugins will catch unexpected exceptions and handle them in
unpredictable ways, such as ignoring them.
'''
def check(self):
try:
time.sleep(5)
except Exception:
pass
return # ugly syntax to let us yield no results
yield

class plugin_slow_catching_exception_and_handling(Plugin):
'''
Some plugins will catch unexpected exceptions but handle them
sanely, such as by yielding an error result.

In this case, the user will get two failure results; one from
the plugin itself, and one from the exception handler.
'''
def check(self):
try:
time.sleep(5)
except Exception as e:
yield Result(
self, constants.ERROR,
key='test', msg='fail',
exception=str(e)
)

class plugin_slow_raising_exception(Plugin):
'''
Some plugins will catch unexpected exceptions and handle them in
unpredictable ways, such as raising their own exception.

The user will get one failure result, from the exception handler.
'''
def check(self):
try:
time.sleep(5)
except Exception as e:
raise Exception(
"I didn't expect an exception to be thrown"
) from e

class plugin_fast(Plugin):
def check(self):
yield Result(self, constants.SUCCESS, key='test', msg='pass')

# Create a registry
r = Registry()

# Register the plugins
r(plugin1)
r(plugin2)
r(plugin_slow_passing_exception_up)
r(plugin_slow_catching_exception_and_ignoring)
r(plugin_slow_catching_exception_and_handling)
r(plugin_slow_raising_exception)
r(plugin_fast)

# Collect the results
results = run_plugins(r.get_plugins(), (), None, None, {}, timeout=1)

assert len(results.results) == 2
assert 7 == len(results.results)

assert results.results[0].kw.get('exception') == 'Check' \
' test_plugins:plugin_slow_passing_exception_up' \
' cancelled after 1 sec'
assert results.results[0].result == constants.CRITICAL
assert results.results[0].kw.get('key') == 'healthcheck_timeout'

assert results.results[1].kw.get('exception') == 'Check' \
' test_plugins:plugin_slow_catching_exception_and_ignoring' \
' cancelled after 1 sec'
assert results.results[1].result == constants.CRITICAL
assert results.results[1].kw.get('key') == 'healthcheck_timeout'

assert results.results[2].kw.get('exception') == 'Check' \
' test_plugins:plugin_slow_catching_exception_and_handling' \
' cancelled after 1 sec'
assert results.results[2].result == constants.ERROR
assert results.results[2].kw.get('msg') == 'fail'
assert results.results[2].kw.get('key') == 'test'

assert results.results[3].kw.get('exception') == 'Check' \
' test_plugins:plugin_slow_catching_exception_and_handling' \
' cancelled after 1 sec'
assert results.results[3].result == constants.CRITICAL
assert results.results[3].kw.get('key') == 'healthcheck_timeout'

assert results.results[4].kw.get('exception') == "I didn't expect an" \
" exception to be thrown"
assert results.results[4].result == constants.CRITICAL
assert not results.results[4].kw.get('key')

assert results.results[0].result == constants.ERROR
assert results.results[0].kw.get('exception') == 'Request timed out'
assert results.results[5].kw.get('exception') == 'Check' \
' test_plugins:plugin_slow_raising_exception cancelled after 1 sec'
assert results.results[5].result == constants.CRITICAL
assert results.results[5].kw.get('key') == 'healthcheck_timeout'

assert results.results[1].result == constants.SUCCESS
assert results.results[1].kw.get('msg') == 'pass'
assert results.results[6].kw.get('msg') == 'pass'
assert not results.results[6].kw.get('exception')
assert results.results[6].result == constants.SUCCESS
assert results.results[6].kw.get('key') == 'test'
Loading