Merge pull request #7635 from gvnc/cluster-autoscaler-release-1.31
OCI Cloudprovider fixes and features Backport for 1.31
k8s-ci-robot authored Jan 8, 2025
2 parents 0d423e1 + 9fbe549 commit 3b6170e
Showing 9 changed files with 333 additions and 24 deletions.
10 changes: 9 additions & 1 deletion cluster-autoscaler/cloudprovider/oci/README.md
@@ -166,6 +166,13 @@ use-instance-principals = true
n/a
### Node Group Auto Discovery
`--node-group-auto-discovery` can be given using the pattern below. It discovers the nodepools under the given compartment by matching the nodepool tags (either Freeform or Defined tags). All of the parameters are mandatory.
```
clusterId:<clusterId>,compartmentId:<compartmentId>,nodepoolTags:<tagKey1>=<tagValue1>&<tagKey2>=<tagValue2>,min:<min>,max:<max>
```
Auto discovery cannot be used together with static discovery (the `nodes` parameter), to prevent conflicts.
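For illustration only, a flag value following this pattern might look like the example below (the OCIDs, tag keys, and bounds are placeholders, not values from a real tenancy):
```
--node-group-auto-discovery=clusterId:ocid1.cluster.oc1..example,compartmentId:ocid1.compartment.oc1..example,nodepoolTags:cluster-autoscaler=enabled&env=prod,min:1,max:10
```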
## Deployment
### Create OCI config secret (only if _not_ using Instance Principals)
@@ -271,7 +278,8 @@ kubectl apply -f ./cloudprovider/oci/examples/oci-nodepool-cluster-autoscaler-w-
correctly (`oci-cloud-controller-manager`).
- Avoid manually changing pools that are managed by the Cluster Autoscaler. For example, do not add or remove nodes
using kubectl, or using the Console (or the Oracle Cloud Infrastructure CLI or API).
- `--node-group-auto-discovery` and `--node-autoprovisioning-enabled=true` are not supported.
- `--node-autoprovisioning-enabled=true` is not supported.
- `--node-group-auto-discovery` and the `nodes` parameter cannot be used together, as this can cause conflicts.
- We set an `nvidia.com/gpu:NoSchedule` taint on nodes in GPU-enabled pools.
## Helpful links
2 changes: 2 additions & 0 deletions cluster-autoscaler/cloudprovider/oci/common/oci_ref.go
@@ -21,6 +21,7 @@ type OciRef struct {
PrivateIPAddress string
PublicIPAddress string
Shape string
IsNodeSelfManaged bool
}

// NodeToOciRef converts a node object into an oci reference
@@ -36,6 +37,7 @@ func NodeToOciRef(n *apiv1.Node) (OciRef, error) {
PrivateIPAddress: getNodeInternalAddress(n),
PublicIPAddress: getNodeExternalAddress(n),
Shape: getNodeShape(n),
IsNodeSelfManaged: n.Labels["oci.oraclecloud.com/node.info.byon"] == "true",
}, nil
}

21 changes: 21 additions & 0 deletions cluster-autoscaler/cloudprovider/oci/common/oci_util.go
@@ -205,3 +205,24 @@ func GetAllPoolTypes(groups []string) (string, error) {
}
return ocidType, nil
}

// HasNodeGroupTags reports whether instancepoolTags and/or nodepoolTags are present in the given node-group-auto-discovery specs
func HasNodeGroupTags(nodeGroupAutoDiscoveryList []string) (bool, bool, error) {
instancePoolTagsFound := false
nodePoolTagsFound := false
for _, arg := range nodeGroupAutoDiscoveryList {
if strings.Contains(arg, "nodepoolTags") {
nodePoolTagsFound = true
}
if strings.Contains(arg, "instancepoolTags") {
instancePoolTagsFound = true
}
}
if instancePoolTagsFound && nodePoolTagsFound {
return instancePoolTagsFound, nodePoolTagsFound, fmt.Errorf("cannot use both instancepoolTags and nodepoolTags in node-group-auto-discovery")
}
if len(nodeGroupAutoDiscoveryList) > 0 && !instancePoolTagsFound && !nodePoolTagsFound {
return instancePoolTagsFound, nodePoolTagsFound, fmt.Errorf("either instancepoolTags or nodepoolTags should be provided in node-group-auto-discovery")
}
}
return instancePoolTagsFound, nodePoolTagsFound, nil
}
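As a rough illustration (not part of this change), here is a standalone sketch of how `HasNodeGroupTags` behaves, assuming the common package's import path under the cluster-autoscaler module; the discovery spec strings are placeholders:
```go
package main

import (
	"fmt"

	// Assumed import path and alias for the OCI common package in the cluster-autoscaler module.
	ocicommon "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/common"
)

func main() {
	// One discovery spec containing nodepoolTags: OKE nodepool auto discovery is requested.
	_, nodePoolTagsFound, err := ocicommon.HasNodeGroupTags([]string{
		"clusterId:ocid1.cluster.oc1..example,compartmentId:ocid1.compartment.oc1..example,nodepoolTags:ca-managed=true,min:1,max:5",
	})
	fmt.Println(nodePoolTagsFound, err) // true <nil>

	// Mixing instancepoolTags and nodepoolTags across specs is rejected with an error.
	_, _, err = ocicommon.HasNodeGroupTags([]string{
		"compartmentId:ocid1.compartment.oc1..example,instancepoolTags:a=b,min:1,max:5",
		"clusterId:ocid1.cluster.oc1..example,compartmentId:ocid1.compartment.oc1..example,nodepoolTags:a=b,min:1,max:5",
	})
	fmt.Println(err != nil) // true: both tag kinds were provided
}
```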
@@ -153,8 +153,14 @@ func BuildOCI(opts config.AutoscalingOptions, do cloudprovider.NodeGroupDiscover
if err != nil {
klog.Fatalf("Failed to get pool type: %v", err)
}
if strings.HasPrefix(ocidType, npconsts.OciNodePoolResourceIdent) {
manager, err := nodepools.CreateNodePoolManager(opts.CloudConfig, do, createKubeClient(opts))
_, nodepoolTagsFound, err := ocicommon.HasNodeGroupTags(opts.NodeGroupAutoDiscovery)
if err != nil {
klog.Fatalf("Failed to get auto discovery tags: %v", err)
}
if strings.HasPrefix(ocidType, npconsts.OciNodePoolResourceIdent) && nodepoolTagsFound {
klog.Fatalf("-nodes and -node-group-auto-discovery parameters cannot be used together.")
} else if strings.HasPrefix(ocidType, npconsts.OciNodePoolResourceIdent) || nodepoolTagsFound {
manager, err := nodepools.CreateNodePoolManager(opts.CloudConfig, opts.NodeGroupAutoDiscovery, do, createKubeClient(opts))
if err != nil {
klog.Fatalf("Could not create OCI OKE cloud provider: %v", err)
}
24 changes: 15 additions & 9 deletions cluster-autoscaler/cloudprovider/oci/nodepools/cache.go
@@ -40,9 +40,8 @@ func (c *nodePoolCache) nodePools() map[string]*oke.NodePool {
return result
}

func (c *nodePoolCache) rebuild(staticNodePools map[string]NodePool, maxGetNodepoolRetries int) (httpStatusCode int, err error) {
func (c *nodePoolCache) rebuild(staticNodePools map[string]NodePool, maxGetNodepoolRetries int) (err error) {
klog.Infof("rebuilding cache")
var statusCode int
for id := range staticNodePools {
var resp oke.GetNodePoolResponse
for i := 1; i <= maxGetNodepoolRetries; i++ {
@@ -52,29 +51,36 @@ func (c *nodePoolCache) rebuild(staticNodePools map[string]NodePool, maxGetNodep
NodePoolId: common.String(id),
})
c.mu.Unlock()
httpResp := resp.HTTPResponse()
statusCode = httpResp.StatusCode
if err != nil {
klog.Errorf("Failed to fetch the nodepool : %v. Retries available : %v", id, maxGetNodepoolRetries-i)
} else {
break
}
}
if err != nil {
klog.Errorf("Failed to fetch the nodepool : %v", id)
return statusCode, err
// In order to let the cluster autoscaler keep working even with a wrong nodepool id,
// we avoid returning an error; instead we log it and leave the nodepool out of the cache so it won't be used for scaling.
klog.Errorf("The nodepool will not be considered for scaling until next check : %v", id)
} else {
c.set(&resp.NodePool)
}
c.set(&resp.NodePool)
}
return statusCode, nil
return nil
}

// removeInstance tries to remove the instance from the node pool.
func (c *nodePoolCache) removeInstance(nodePoolID, instanceID string) error {
func (c *nodePoolCache) removeInstance(nodePoolID, instanceID string, nodeName string) error {
c.mu.Lock()
defer c.mu.Unlock()

if instanceID == "" {
klog.Errorf("Node %s doesn't have an instance id so it can't be deleted.", nodeName)
klog.Errorf("This could be due to a Compute Instance issue in OCI such as Out Of Host Capacity error. Check the instance status on OCI Console.")
return errors.Errorf("Node %s doesn't have an instance id so it can't be deleted.", nodeName)
}

klog.Infof("Deleting instance %q from node pool %q", instanceID, nodePoolID)

// always try to remove the instance. This call is idempotent
scaleDown := true
resp, err := c.okeClient.DeleteNode(context.Background(), oke.DeleteNodeRequest{
@@ -54,6 +54,10 @@ func (ocp *OciCloudProvider) NodeGroupForNode(n *apiv1.Node) (cloudprovider.Node
return nil, err
}

// Self-managed nodes aren't expected to be managed by the cluster autoscaler
if ociRef.IsNodeSelfManaged {
return nil, nil
}
ng, err := ocp.manager.GetNodePoolForInstance(ociRef)

// this instance may be part of a node pool that the autoscaler does not handle
@@ -75,6 +79,9 @@ func (ocp *OciCloudProvider) HasInstance(node *apiv1.Node) (bool, error) {
if err != nil {
return true, err
}
if instance.IsNodeSelfManaged {
return false, nil
}
np, err := ocp.manager.GetNodePoolForInstance(instance)
if err != nil {
return true, err
165 changes: 156 additions & 9 deletions cluster-autoscaler/cloudprovider/oci/nodepools/oci_manager.go
@@ -9,6 +9,7 @@ import (
"fmt"
"math"
"os"
"regexp"
"strconv"
"strings"
"time"
@@ -34,6 +35,11 @@ import (
const (
maxAddTaintRetries = 5
maxGetNodepoolRetries = 3
clusterId = "clusterId"
compartmentId = "compartmentId"
nodepoolTags = "nodepoolTags"
min = "min"
max = "max"
)

var (
@@ -75,10 +81,11 @@ type okeClient interface {
GetNodePool(context.Context, oke.GetNodePoolRequest) (oke.GetNodePoolResponse, error)
UpdateNodePool(context.Context, oke.UpdateNodePoolRequest) (oke.UpdateNodePoolResponse, error)
DeleteNode(context.Context, oke.DeleteNodeRequest) (oke.DeleteNodeResponse, error)
ListNodePools(ctx context.Context, request oke.ListNodePoolsRequest) (oke.ListNodePoolsResponse, error)
}

// CreateNodePoolManager creates an NodePoolManager that can manage autoscaling node pools
func CreateNodePoolManager(cloudConfigPath string, discoveryOpts cloudprovider.NodeGroupDiscoveryOptions, kubeClient kubernetes.Interface) (NodePoolManager, error) {
func CreateNodePoolManager(cloudConfigPath string, nodeGroupAutoDiscoveryList []string, discoveryOpts cloudprovider.NodeGroupDiscoveryOptions, kubeClient kubernetes.Interface) (NodePoolManager, error) {

var err error
var configProvider common.ConfigurationProvider
@@ -151,6 +158,20 @@ func CreateNodePoolManager(cloudConfigPath string, discoveryOpts cloudprovider.N
nodePoolCache: newNodePoolCache(&okeClient),
}

// Auto-discover nodepools in the given compartments when node-group-auto-discovery parameters are provided
klog.Infof("checking node groups for autodiscovery ... ")
for _, arg := range nodeGroupAutoDiscoveryList {
nodeGroup, err := nodeGroupFromArg(arg)
if err != nil {
return nil, fmt.Errorf("unable to construct node group auto discovery from argument: %v", err)
}
nodeGroup.manager = manager
nodeGroup.kubeClient = kubeClient

manager.nodeGroups = append(manager.nodeGroups, *nodeGroup)
autoDiscoverNodeGroups(manager, manager.okeClient, *nodeGroup)
}

// Contains all the specs from the args that give us the pools.
for _, arg := range discoveryOpts.NodeGroupSpecs {
np, err := nodePoolFromArg(arg)
@@ -180,6 +201,48 @@ func CreateNodePoolManager(cloudConfigPath string, discoveryOpts cloudprovider.N
return manager, nil
}

func autoDiscoverNodeGroups(m *ociManagerImpl, okeClient okeClient, nodeGroup nodeGroupAutoDiscovery) (bool, error) {
var resp, reqErr = okeClient.ListNodePools(context.Background(), oke.ListNodePoolsRequest{
ClusterId: common.String(nodeGroup.clusterId),
CompartmentId: common.String(nodeGroup.compartmentId),
})
if reqErr != nil {
klog.Errorf("failed to fetch the nodepool list with clusterId: %s, compartmentId: %s. Error: %v", nodeGroup.clusterId, nodeGroup.compartmentId, reqErr)
return false, reqErr
}
for _, nodePoolSummary := range resp.Items {
if validateNodepoolTags(nodeGroup.tags, nodePoolSummary.FreeformTags, nodePoolSummary.DefinedTags) {
nodepool := &nodePool{}
nodepool.id = *nodePoolSummary.Id
nodepool.minSize = nodeGroup.minSize
nodepool.maxSize = nodeGroup.maxSize

nodepool.manager = nodeGroup.manager
nodepool.kubeClient = nodeGroup.kubeClient

m.staticNodePools[nodepool.id] = nodepool
klog.V(5).Infof("auto discovered nodepool in compartment : %s , nodepoolid: %s", nodeGroup.compartmentId, nodepool.id)
} else {
klog.Warningf("nodepool ignored as the tags do not satisfy the requirement : %s , %v, %v", *nodePoolSummary.Id, nodePoolSummary.FreeformTags, nodePoolSummary.DefinedTags)
}
}
return true, nil
}

func validateNodepoolTags(nodeGroupTags map[string]string, freeFormTags map[string]string, definedTags map[string]map[string]interface{}) bool {
if nodeGroupTags != nil {
for tagKey, tagValue := range nodeGroupTags {
namespacedTagKey := strings.Split(tagKey, ".")
if len(namespacedTagKey) == 2 && tagValue != definedTags[namespacedTagKey[0]][namespacedTagKey[1]] {
return false
} else if len(namespacedTagKey) != 2 && tagValue != freeFormTags[tagKey] {
return false
}
}
}
return true
}
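For illustration (not part of this change), a hypothetical in-package test sketch of the matching rules above — plain keys are checked against the freeform tags, while `Namespace.Key` keys are checked against the defined tags:
```go
package nodepools

import "testing"

// TestValidateNodepoolTagsSketch is an illustrative sketch, not part of this commit.
func TestValidateNodepoolTagsSketch(t *testing.T) {
	freeform := map[string]string{"env": "prod"}
	defined := map[string]map[string]interface{}{"CostCenter": {"team": "autoscaler"}}

	// A plain key is looked up in the freeform tags.
	if !validateNodepoolTags(map[string]string{"env": "prod"}, freeform, defined) {
		t.Error("expected freeform tag env=prod to match")
	}
	// A Namespace.Key key is looked up in the defined tags.
	if !validateNodepoolTags(map[string]string{"CostCenter.team": "autoscaler"}, freeform, defined) {
		t.Error("expected defined tag CostCenter.team=autoscaler to match")
	}
	// Any mismatching value causes the nodepool to be skipped.
	if validateNodepoolTags(map[string]string{"env": "dev"}, freeform, defined) {
		t.Error("expected mismatching tag value to fail")
	}
}
```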

// nodePoolFromArg parses a node group spec represented in the form of `<minSize>:<maxSize>:<ocid>` and produces a node group spec object
func nodePoolFromArg(value string) (*nodePool, error) {
tokens := strings.SplitN(value, ":", 3)
@@ -207,6 +270,81 @@ func nodePoolFromArg(value string) (*nodePool, error) {
return spec, nil
}

// nodeGroupFromArg parses a node group spec represented in the form of
// `clusterId:<clusterId>,compartmentId:<compartmentId>,nodepoolTags:<tagKey1>=<tagValue1>&<tagKey2>=<tagValue2>,min:<min>,max:<max>`
// and produces a node group auto discovery object
func nodeGroupFromArg(value string) (*nodeGroupAutoDiscovery, error) {
// This regex finds the key-value pairs in any order; each key is separated from its value by a colon and pairs are separated by commas
regexPattern := `(?:` + compartmentId + `:(?P<` + compartmentId + `>[^,]+)`
regexPattern = regexPattern + `|` + nodepoolTags + `:(?P<` + nodepoolTags + `>[^,]+)`
regexPattern = regexPattern + `|` + max + `:(?P<` + max + `>[^,]+)`
regexPattern = regexPattern + `|` + min + `:(?P<` + min + `>[^,]+)`
regexPattern = regexPattern + `|` + clusterId + `:(?P<` + clusterId + `>[^,]+)`
regexPattern = regexPattern + `)(?:,|$)`

re := regexp.MustCompile(regexPattern)

parametersMap := make(map[string]string)

// push key-value pairs into a map
for _, match := range re.FindAllStringSubmatch(value, -1) {
for i, name := range re.SubexpNames() {
if i != 0 && match[i] != "" {
parametersMap[name] = match[i]
}
}
}

spec := &nodeGroupAutoDiscovery{}

if parametersMap[clusterId] != "" {
spec.clusterId = parametersMap[clusterId]
} else {
return nil, fmt.Errorf("failed to set %s, it is missing in node-group-auto-discovery parameter", clusterId)
}

if parametersMap[compartmentId] != "" {
spec.compartmentId = parametersMap[compartmentId]
} else {
return nil, fmt.Errorf("failed to set %s, it is missing in node-group-auto-discovery parameter", compartmentId)
}

if size, err := strconv.Atoi(parametersMap[min]); err == nil {
spec.minSize = size
} else {
return nil, fmt.Errorf("failed to set %s size: %s, expected integer", min, parametersMap[min])
}

if size, err := strconv.Atoi(parametersMap[max]); err == nil {
spec.maxSize = size
} else {
return nil, fmt.Errorf("failed to set %s size: %s, expected integer", max, parametersMap[max])
}

if parametersMap[nodepoolTags] != "" {
nodepoolTags := parametersMap[nodepoolTags]

spec.tags = make(map[string]string)

pairs := strings.Split(nodepoolTags, "&")

for _, pair := range pairs {
parts := strings.Split(pair, "=")
if len(parts) == 2 {
spec.tags[parts[0]] = parts[1]
} else {
return nil, fmt.Errorf("nodepoolTags should be given in tagKey=tagValue format, this is not valid: %s", pair)
}
}
} else {
return nil, fmt.Errorf("failed to set %s, it is missing in node-group-auto-discovery parameter", nodepoolTags)
}

klog.Infof("node group auto discovery spec constructed: %+v", spec)

return spec, nil
}
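Similarly, a hypothetical in-package sketch (not part of this change) of how a spec string would be parsed by nodeGroupFromArg; the OCIDs below are placeholders:
```go
package nodepools

import "testing"

// TestNodeGroupFromArgSketch is an illustrative sketch, not part of this commit.
func TestNodeGroupFromArgSketch(t *testing.T) {
	arg := "clusterId:ocid1.cluster.oc1..example,compartmentId:ocid1.compartment.oc1..example," +
		"nodepoolTags:ca-managed=true&env=prod,min:1,max:5"

	spec, err := nodeGroupFromArg(arg)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	// The key-value pairs are recognized regardless of their order in the argument.
	if spec.clusterId != "ocid1.cluster.oc1..example" || spec.minSize != 1 || spec.maxSize != 5 {
		t.Errorf("unexpected spec: %+v", spec)
	}
	// nodepoolTags is split on "&" and "=" into a tag map.
	if spec.tags["ca-managed"] != "true" || spec.tags["env"] != "prod" {
		t.Errorf("unexpected tags: %v", spec.tags)
	}
}
```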

type ociManagerImpl struct {
cfg *ocicommon.CloudConfig
okeClient okeClient
Expand All @@ -215,6 +353,7 @@ type ociManagerImpl struct {
ociTagsGetter ocicommon.TagsGetter
registeredTaintsGetter RegisteredTaintsGetter
staticNodePools map[string]NodePool
nodeGroups []nodeGroupAutoDiscovery

lastRefresh time.Time

@@ -253,15 +392,20 @@ func (m *ociManagerImpl) TaintToPreventFurtherSchedulingOnRestart(nodes []*apiv1
}

func (m *ociManagerImpl) forceRefresh() error {
httpStatusCode, err := m.nodePoolCache.rebuild(m.staticNodePools, maxGetNodepoolRetries)
if err != nil {
if httpStatusCode == 404 {
m.lastRefresh = time.Now()
klog.Errorf("Failed to fetch the nodepools. Retrying after %v", m.lastRefresh.Add(m.cfg.Global.RefreshInterval))
return err
// auto discover node groups
if m.nodeGroups != nil {
// empty previous nodepool map to do an auto discovery
m.staticNodePools = make(map[string]NodePool)
for _, nodeGroup := range m.nodeGroups {
autoDiscoverNodeGroups(m, m.okeClient, nodeGroup)
}
}
// rebuild nodepool cache
err := m.nodePoolCache.rebuild(m.staticNodePools, maxGetNodepoolRetries)
if err != nil {
return err
}

m.lastRefresh = time.Now()
klog.Infof("Refreshed NodePool list, next refresh after %v", m.lastRefresh.Add(m.cfg.Global.RefreshInterval))
return nil
Expand All @@ -276,7 +420,10 @@ func (m *ociManagerImpl) Cleanup() error {
func (m *ociManagerImpl) GetNodePools() []NodePool {
var nodePools []NodePool
for _, np := range m.staticNodePools {
nodePools = append(nodePools, np)
nodePoolInCache := m.nodePoolCache.cache[np.Id()]
if nodePoolInCache != nil {
nodePools = append(nodePools, np)
}
}
return nodePools
}
@@ -497,7 +644,7 @@ func (m *ociManagerImpl) SetNodePoolSize(np NodePool, size int) error {
func (m *ociManagerImpl) DeleteInstances(np NodePool, instances []ocicommon.OciRef) error {
klog.Infof("DeleteInstances called")
for _, instance := range instances {
err := m.nodePoolCache.removeInstance(np.Id(), instance.InstanceID)
err := m.nodePoolCache.removeInstance(np.Id(), instance.InstanceID, instance.Name)
if err != nil {
return err
}