-
Notifications
You must be signed in to change notification settings - Fork 551
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[k8s] Add retries for pod and node fetching to handle transient errors #4543
Open
romilbhardwaj
wants to merge
5
commits into
master
Choose a base branch
from
k8s-add-api-retries
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -7,6 +7,7 @@ | |||||
import re | ||||||
import shutil | ||||||
import subprocess | ||||||
import time | ||||||
import typing | ||||||
from typing import Any, Dict, List, Optional, Set, Tuple, Union | ||||||
from urllib.parse import urlparse | ||||||
|
@@ -105,6 +106,75 @@ | |||||
|
||||||
logger = sky_logging.init_logger(__name__) | ||||||
|
||||||
# Default retry settings for Kubernetes API calls | ||||||
DEFAULT_MAX_RETRIES = 3 | ||||||
DEFAULT_RETRY_INTERVAL_SECONDS = 1 | ||||||
|
||||||
|
||||||
def _retry_on_error(max_retries=DEFAULT_MAX_RETRIES, | ||||||
retry_interval=DEFAULT_RETRY_INTERVAL_SECONDS, | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
resource_type: Optional[str] = None): | ||||||
"""Decorator to retry Kubernetes API calls on transient failures. | ||||||
|
||||||
Args: | ||||||
max_retries: Maximum number of retry attempts | ||||||
retry_interval: Initial seconds to wait between retries | ||||||
resource_type: Type of resource being accessed (e.g. 'node', 'pod'). | ||||||
Used to provide more specific error messages. | ||||||
""" | ||||||
|
||||||
def decorator(func): | ||||||
|
||||||
@functools.wraps(func) | ||||||
def wrapper(*args, **kwargs): | ||||||
last_exception = None | ||||||
backoff = common_utils.Backoff(initial_backoff=retry_interval, | ||||||
max_backoff_factor=3) | ||||||
|
||||||
for attempt in range(max_retries): | ||||||
try: | ||||||
return func(*args, **kwargs) | ||||||
except (kubernetes.max_retry_error(), | ||||||
kubernetes.api_exception(), | ||||||
kubernetes.config_exception()) as e: | ||||||
last_exception = e | ||||||
# Don't retry on permanent errors like 401 (Unauthorized) | ||||||
# or 403 (Forbidden) | ||||||
if (isinstance(e, kubernetes.api_exception()) and | ||||||
e.status in (401, 403)): | ||||||
raise | ||||||
if attempt < max_retries - 1: | ||||||
sleep_time = backoff.current_backoff() | ||||||
logger.debug(f'Kubernetes API call {func.__name__} ' | ||||||
f'failed with {str(e)}. Retrying in ' | ||||||
f'{sleep_time:.1f}s...') | ||||||
time.sleep(sleep_time) | ||||||
continue | ||||||
|
||||||
# Format error message based on the type of exception | ||||||
resource_msg = f' when trying to get {resource_type} info' \ | ||||||
if resource_type else '' | ||||||
debug_cmd = f' To debug, run: kubectl get {resource_type}s' \ | ||||||
if resource_type else '' | ||||||
|
||||||
if isinstance(last_exception, kubernetes.max_retry_error()): | ||||||
error_msg = f'Timed out{resource_msg} from Kubernetes cluster.' | ||||||
elif isinstance(last_exception, kubernetes.api_exception()): | ||||||
error_msg = (f'Kubernetes API error{resource_msg}: ' | ||||||
f'{str(last_exception)}') | ||||||
else: | ||||||
error_msg = (f'Kubernetes configuration error{resource_msg}: ' | ||||||
f'{str(last_exception)}') | ||||||
|
||||||
raise exceptions.ResourcesUnavailableError( | ||||||
f'{error_msg}' | ||||||
f' Please check if the cluster is healthy and retry.' | ||||||
f'{debug_cmd}') from last_exception | ||||||
|
||||||
return wrapper | ||||||
|
||||||
return decorator | ||||||
|
||||||
|
||||||
class GPULabelFormatter: | ||||||
"""Base class to define a GPU label formatter for a Kubernetes cluster | ||||||
|
@@ -445,6 +515,7 @@ def detect_accelerator_resource( | |||||
|
||||||
|
||||||
@functools.lru_cache(maxsize=10) | ||||||
@_retry_on_error(resource_type='node') | ||||||
def get_kubernetes_nodes(context: Optional[str] = None) -> List[Any]: | ||||||
"""Gets the kubernetes nodes in the context. | ||||||
|
||||||
|
@@ -453,17 +524,12 @@ def get_kubernetes_nodes(context: Optional[str] = None) -> List[Any]: | |||||
if context is None: | ||||||
context = get_current_kube_config_context_name() | ||||||
|
||||||
try: | ||||||
nodes = kubernetes.core_api(context).list_node( | ||||||
_request_timeout=kubernetes.API_TIMEOUT).items | ||||||
except kubernetes.max_retry_error(): | ||||||
raise exceptions.ResourcesUnavailableError( | ||||||
'Timed out when trying to get node info from Kubernetes cluster. ' | ||||||
'Please check if the cluster is healthy and retry. To debug, run: ' | ||||||
'kubectl get nodes') from None | ||||||
nodes = kubernetes.core_api(context).list_node( | ||||||
_request_timeout=kubernetes.API_TIMEOUT).items | ||||||
Comment on lines
+527
to
+528
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Seems we already have the |
||||||
return nodes | ||||||
|
||||||
|
||||||
@_retry_on_error(resource_type='pod') | ||||||
def get_all_pods_in_kubernetes_cluster( | ||||||
context: Optional[str] = None) -> List[Any]: | ||||||
"""Gets pods in all namespaces in kubernetes cluster indicated by context. | ||||||
|
@@ -473,14 +539,8 @@ def get_all_pods_in_kubernetes_cluster( | |||||
if context is None: | ||||||
context = get_current_kube_config_context_name() | ||||||
|
||||||
try: | ||||||
pods = kubernetes.core_api(context).list_pod_for_all_namespaces( | ||||||
_request_timeout=kubernetes.API_TIMEOUT).items | ||||||
except kubernetes.max_retry_error(): | ||||||
raise exceptions.ResourcesUnavailableError( | ||||||
'Timed out when trying to get pod info from Kubernetes cluster. ' | ||||||
'Please check if the cluster is healthy and retry. To debug, run: ' | ||||||
'kubectl get pods') from None | ||||||
pods = kubernetes.core_api(context).list_pod_for_all_namespaces( | ||||||
_request_timeout=kubernetes.API_TIMEOUT).items | ||||||
return pods | ||||||
|
||||||
|
||||||
|
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We can reduce the interval if need to be less than 1 second : )