estroz/operator-sdk

SDK for building Kubernetes applications. Provides high level APIs, useful abstractions, and project scaffolding.

hasbro17/api

Contains the API definitions used by OLM and Marketplace

hasbro17/awesome-courses

:books: List of awesome university courses for learning Computer Science!

hasbro17/bbolt

An embedded key/value database for Go.

hasbro17/boom

HTTP(S) load generator, ApacheBench (ab) replacement, written in Go

hasbro17/bootkube

bootkube - Launch a self-hosted Kubernetes cluster

hasbro17/cluster-etcd-operator

Operator to manage the lifecycle of the etcd members of an OpenShift cluster

Pull request review comment: openshift/openshift-docs

[BZ1886092]: Resume all workloads after disaster recovery

 etcd-ip-10-0-154-194.ec2.internal                2/2     Running     0
 etcd-ip-10-0-173-171.ec2.internal                2/2     Running     0          9h
 ----
+To ensure that all workloads return to normal operation following a recovery procedure, you should restart each pod that stores Kubernetes API information. This includes {product-title} components such as routers, operators, and third-party components.

SGTM as a general pointer, but deferring to @hexfusion on whether we want to be more specific/prescriptive and include steps to reboot the worker machines. Or, if that's too disruptive, we can leave it up to users to decide how best to reboot individual components.

tmalove

comment created 11 days ago

Pull request review comment: openshift/cluster-etcd-operator

*: Improve observability of etcd client metrics and defrag controller

 func (c *DefragController) checkDefrag(ctx context.Context, recorder events.Reco
 			recorder.Eventf("DefragControllerDefragmentAttempt", "Attempting defrag on member: %s, memberID: %d, dbSize: %d, dbInUse: %d, leader ID: %d", member.Name, member.ID, status.DbSize, status.DbSizeInUse, status.Leader)
 			if _, err := c.etcdClient.Defragment(ctx, member); err != nil {
 				// Defrag can timeout if defragmentation takes longer than etcdcli.DefragDialTimeout.
-				errors = append(errors, fmt.Errorf("failed to defragment etcd member: %q :%v", member.Name, err))
+				errMsg := fmt.Sprintf("failed defrag on member: %s, memberID: %d: %v", member.Name, member.ID, err)

Nit: member IDs are hex, right? Or if all our logs/events are already using base 10, then we can probably keep it the same.

				errMsg := fmt.Sprintf("failed defrag on member: %s, memberID: %x: %v", member.Name, member.ID, err)
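
For illustration, a minimal sketch of the two formats side by side (the member ID value below is just an example, not taken from this cluster):

package main

import "fmt"

func main() {
	// Example etcd member ID for illustration only; real IDs are uint64 values assigned by etcd.
	var memberID uint64 = 0x8e9e05c52164694d

	fmt.Printf("memberID: %d\n", memberID) // base-10: memberID: 10276657743932975437
	fmt.Printf("memberID: %x\n", memberID) // hex:     memberID: 8e9e05c52164694d
}
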
hexfusion

comment created 13 days ago

Pull request review comment: openshift/cluster-etcd-operator

pkg/operator/etcdendpointscontroller: use etcd membership to populate endpoints

 func (c *EtcdEndpointsController) syncConfigMap(ctx context.Context, recorder ev
 		klog.Warningf("required configmap %s/%s will be created because it was missing: %w", operatorclient.TargetNamespace, "etcd-endpoints", err)
 	}
-	// create endpoint addresses for each node
-	nodes, err := c.nodeLister.List(labels.Set{"node-role.kubernetes.io/master": ""}.AsSelector())
+	members, err := c.etcdClient.MemberList(ctx)
 	if err != nil {
-		return fmt.Errorf("unable to list expected etcd member nodes: %v", err)
+		return fmt.Errorf("failed to get member list: %w", err)
 	}
-	endpointAddresses := map[string]string{}
-	for _, node := range nodes {
-		var nodeInternalIP string
-		for _, nodeAddress := range node.Status.Addresses {
-			if nodeAddress.Type == corev1.NodeInternalIP {
-				nodeInternalIP = nodeAddress.Address
-				break
-			}
+
+	endpointAddresses := make(map[string]string, len(members))
+	// Create endpoint addresses for each member of the cluster.
+	for _, member := range members {
+		if member.Name == "etcd-bootstrap" {
+			continue
 		}
-		if len(nodeInternalIP) == 0 {
-			return fmt.Errorf("unable to determine internal ip address for node %s", node.Name)
+		// Use of PeerURL is expected here because it is a mandatory field, and it will mirror ClientURL.
+		ip, err := dnshelpers.GetIPFromAddress(member.PeerURLs[0])
+		if err != nil {
+			return err
 		}
-		endpointAddresses[base64.StdEncoding.WithPadding(base64.NoPadding).EncodeToString([]byte(nodeInternalIP))] = nodeInternalIP
+		endpointAddresses[fmt.Sprintf("%016x", member.ID)] = ip

Nit: Just curious why this format was chosen. This is an etcd member ID convention, right, not some limitation of the configmap key length or format?
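
If it helps as a reference, a small sketch of what that key format produces compared to plain %x (the member ID and IP below are made up): %016x zero-pads to a fixed 16-character width, while %x drops leading zeros.

package main

import "fmt"

func main() {
	// Made-up member ID and peer IP for illustration only.
	var memberID uint64 = 0x01f5e4b2a9c3d7e8
	ip := "10.0.0.7"

	fmt.Printf("%x\n", memberID)    // 1f5e4b2a9c3d7e8   (leading zero dropped, 15 chars)
	fmt.Printf("%016x\n", memberID) // 01f5e4b2a9c3d7e8  (zero-padded to 16 chars)

	// The configmap entry would then map the fixed-width hex ID to the member's IP.
	endpointAddresses := map[string]string{fmt.Sprintf("%016x", memberID): ip}
	fmt.Println(endpointAddresses) // map[01f5e4b2a9c3d7e8:10.0.0.7]
}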

hexfusion

comment created 15 days ago

Pull request review comment: openshift/cluster-etcd-operator

WIP: pkg/operator/quorumguardcontroller: add pod affinity

 func (c *QuorumGuardController) ensureEtcdGuardDeployment(ctx context.Context, r
 	// use image from release payload
 	c.etcdQuorumGuard.Spec.Template.Spec.Containers[0].Image = c.cliImagePullSpec
-	// if restart occurred, we will apply etcd guard deployment but if it is the same, nothing will happened
+	affinity := &corev1.Affinity{
+		// Ensure only a single instance is deployed per node
+		PodAntiAffinity: &corev1.PodAntiAffinity{
+			RequiredDuringSchedulingIgnoredDuringExecution: []corev1.PodAffinityTerm{
+				{
+					LabelSelector: &metav1.LabelSelector{
+						MatchExpressions: []metav1.LabelSelectorRequirement{
+							{
+								Key:      "k8s-app",
+								Operator: metav1.LabelSelectorOpIn,
+								Values:   []string{"etcd-quorum-guard"},
+							},
+						},
+					},
+					TopologyKey: "kubernetes.io/hostname",

You're probably reworking this, but this is already present here: https://github.com/openshift/cluster-etcd-operator/blob/83e4a83079cfffbd3d8048f02f6d0603cd00c15a/pkg/operator/etcd_assets/bindata.go#L966-L973

hexfusion

comment created 15 days ago

pull request comment: openshift/cluster-etcd-operator

pkg/operator/quorumguardcontroller: add pod affinity

/retitle WIP: pkg/operator/quorumguardcontroller: add pod affinity

hexfusion

comment created 15 days ago

pull request comment: openshift/cluster-etcd-operator

Bump go version to 1.17

/retest

hexfusion

comment created 18 days ago

Pull request review comment: openshift/library-go

pkg/operator/staticpod/controller/installer: add support for WithNodeFilter

 func nodeToStartRevisionWith(ctx context.Context, getStaticPodStateFn staticPodS
 	return 0, reason, nil
 }
+// nodeStatusesFilteredForRevision returns a list of nodeStatuses which have been selected for the next revision.
+func nodeStatusesFilteredForRevision(ctx context.Context, nodeFilterFn func(ctx context.Context) (map[string]bool, error), nodeStatuses []operatorv1.NodeStatus) ([]operatorv1.NodeStatus, error) {
+	if nodeFilterFn == nil {
+		return nodeStatuses, nil
+	}
+
+	nodeSelectorHostnameMap, err := nodeFilterFn(ctx)
+	if err != nil {
+		return nil, err
+	}
+	if len(nodeSelectorHostnameMap) == 0 {
+		return nodeStatuses, nil

I can't exactly think of when the etcd-operator's filter would return an empty node map (maybe some node list error not caught by the filter), but if for whatever reason it does, do we want to resort to the default of no-filter, or error out saying the filter is invalid?

	if len(nodeSelectorHostnameMap) == 0 {
		return nil, fmt.Errorf("invalid node filter: the node map cannot be empty")
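
For context, a rough sketch of how that suggestion might fold into the new function from the hunk above. The signature and the early part of the body come from the diff; the hunk cuts off before the filtering loop, so that part (including the use of NodeName as the map key) is an assumption:

package installer // assumed package name, matching the path in the PR title

import (
	"context"
	"fmt"

	operatorv1 "github.com/openshift/api/operator/v1"
)

// nodeStatusesFilteredForRevision returns a list of nodeStatuses which have been selected for the next revision.
func nodeStatusesFilteredForRevision(ctx context.Context, nodeFilterFn func(ctx context.Context) (map[string]bool, error), nodeStatuses []operatorv1.NodeStatus) ([]operatorv1.NodeStatus, error) {
	if nodeFilterFn == nil {
		return nodeStatuses, nil
	}

	nodeSelectorHostnameMap, err := nodeFilterFn(ctx)
	if err != nil {
		return nil, err
	}
	// Suggested change: treat an empty map as an invalid filter instead of falling back to no filtering.
	if len(nodeSelectorHostnameMap) == 0 {
		return nil, fmt.Errorf("invalid node filter: the node map cannot be empty")
	}

	// Assumption: keep only the statuses whose node name the filter selected.
	filtered := make([]operatorv1.NodeStatus, 0, len(nodeStatuses))
	for _, nodeStatus := range nodeStatuses {
		if nodeSelectorHostnameMap[nodeStatus.NodeName] {
			filtered = append(filtered, nodeStatus)
		}
	}
	return filtered, nil
}
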
hexfusion

comment created 18 days ago

Pull request review comment: openshift/library-go

pkg/operator/staticpod/controller/installer: add support for WithNodeFilter

 func nodeToStartRevisionWith(ctx context.Context, getStaticPodStateFn staticPodS
 	return 0, reason, nil
 }
+// nodeStatusesFilteredForRevision returns a list of nodeStatuses which have been selected for the next revision.
+func nodeStatusesFilteredForRevision(ctx context.Context, nodeFilterFn func(ctx context.Context) (map[string]bool, error), nodeStatuses []operatorv1.NodeStatus) ([]operatorv1.NodeStatus, error) {
+	if nodeFilterFn == nil {
+		return nodeStatuses, nil
+	}
+
+	nodeSelectorHostnameMap, err := nodeFilterFn(ctx)
+	if err != nil {
+		return nil, err
+	}
+	if len(nodeSelectorHostnameMap) == 0 {
+		return nodeStatuses, nil

And we are already erroring out below when the node map is not fully populated, so maybe just use the same error.

hexfusion

comment created 18 days ago
