Commit 932d7e2d authored by rohan47's avatar rohan47 Committed by mergify-bot
Browse files

Ceph: Added removeOSDsIfOutAndSafeToRemove to Cluster CR


OSDs can be removed automatically with the current mechanism if a new
setting removeOSDsIfOutAndSafeToRemove is set to true. The default for
all new or upgraded clusters should be false.
Signed-off-by: default avatarrohan47 <rohgupta@redhat.com>
(cherry picked from commit 7f9611d4)
parent da6b8592
Showing with 42 additions and 22 deletions
+42 -22
......@@ -136,6 +136,7 @@ For more details on the mons and when to choose a number other than `3`, see the
- `osdMaintenanceTimeout`: is a duration in minutes that determines how long an entire failureDomain like `region/zone/host` will be held in `noout` (in addition to the default DOWN/OUT interval) when it is draining. This is only relevant when `managePodBudgets` is `true`. The default value is `30` minutes.
- `manageMachineDisruptionBudgets`: if `true`, the operator will create and manage MachineDisruptionBudgets to ensure OSDs are only fenced when the cluster is healthy. Only available on OpenShift.
- `machineDisruptionBudgetNamespace`: the namespace in which to watch the MachineDisruptionBudgets.
- `removeOSDsIfOutAndSafeToRemove`: If `true` the operator will remove the OSDs that are down and whose data has been restored to other OSDs. In Ceph terms, the OSDs are `out` and `safe-to-destroy` when they would be removed.
### Mon Settings
......
......@@ -115,6 +115,8 @@ spec:
# mon:
# osd:
# prepareosd:
# The option to automatically remove OSDs that are out and are safe to destroy.
removeOSDsIfOutAndSafeToRemove: false
storage: # cluster level storage configuration and selection
useAllNodes: true
useAllDevices: true
......
......@@ -95,6 +95,9 @@ type ClusterSpec struct {
// A spec for mgr related options
Mgr MgrSpec `json:"mgr,omitempty"`
// Remove the OSD that is out and safe to remove only if this option is true
RemoveOSDsIfOutAndSafeToRemove bool `json:"removeOSDsIfOutAndSafeToRemove"`
}
// VersionSpec represents the settings for the Ceph version that Rook is orchestrating.
......
......@@ -106,6 +106,7 @@ type ClusterController struct {
addClusterCallbacks []func(*cephv1.ClusterSpec) error
csiConfigMutex *sync.Mutex
nodeStore cache.Store
osdChecker *osd.Monitor
}
// NewClusterController create controller for watching cluster custom resources created
......@@ -439,8 +440,8 @@ func (c *ClusterController) initializeCluster(cluster *cluster, clusterObj *ceph
if !cluster.Spec.External.Enable {
// Start the osd health checker only if running OSDs in the local ceph cluster
osdChecker := osd.NewMonitor(c.context, cluster.Namespace)
go osdChecker.Start(cluster.stopCh)
c.osdChecker = osd.NewMonitor(c.context, cluster.Namespace, cluster.Spec.RemoveOSDsIfOutAndSafeToRemove)
go c.osdChecker.Start(cluster.stopCh)
}
// Start the ceph status checker
......@@ -565,6 +566,11 @@ func (c *ClusterController) onUpdate(oldObj, newObj interface{}) {
logger.Infof("update event for cluster %s is supported, orchestrating update now", newClust.Namespace)
if oldClust.Spec.RemoveOSDsIfOutAndSafeToRemove != newClust.Spec.RemoveOSDsIfOutAndSafeToRemove {
logger.Infof("removeOSDsIfOutAndSafeToRemove is set to %t", newClust.Spec.RemoveOSDsIfOutAndSafeToRemove)
c.osdChecker.Update(newClust.Spec.RemoveOSDsIfOutAndSafeToRemove)
}
// if the image changed, we need to detect the new image version
versionChanged := false
if oldClust.Spec.CephVersion.Image != newClust.Spec.CephVersion.Image {
......
......@@ -38,13 +38,14 @@ var (
// Monitor defines OSD process monitoring
type Monitor struct {
context *clusterd.Context
clusterName string
context *clusterd.Context
clusterName string
removeOSDsIfOUTAndSafeToRemove bool
}
// NewMonitor instantiates OSD monitoring
func NewMonitor(context *clusterd.Context, clusterName string) *Monitor {
return &Monitor{context, clusterName}
func NewMonitor(context *clusterd.Context, clusterName string, removeOSDsIfOUTAndSafeToRemove bool) *Monitor {
return &Monitor{context, clusterName, removeOSDsIfOUTAndSafeToRemove}
}
// Start runs monitoring logic for osds status at set intervals
......@@ -66,6 +67,11 @@ func (m *Monitor) Start(stopCh chan struct{}) {
}
}
// Update reconfigures whether the monitor is permitted to remove OSDs
// that are marked out and reported safe-to-destroy by Ceph. It mirrors
// the removeOSDsIfOutAndSafeToRemove setting from the cluster CR.
func (m *Monitor) Update(remove bool) {
	m.removeOSDsIfOUTAndSafeToRemove = remove
}
// OSDStatus validates osd dump output
func (m *Monitor) osdStatus() error {
osdDump, err := client.GetOSDDump(m.context, m.clusterName)
......@@ -97,8 +103,10 @@ func (m *Monitor) osdStatus() error {
if in != inStatus {
logger.Debugf("osd.%d is marked 'OUT'", id)
if err := m.handleOSDMarkedOut(id); err != nil {
logger.Errorf("Error handling marked out osd osd.%d: %v", id, err)
if m.removeOSDsIfOUTAndSafeToRemove {
if err := m.handleOSDMarkedOut(id); err != nil {
logger.Errorf("Error handling marked out osd osd.%d: %v", id, err)
}
}
}
}
......@@ -107,26 +115,26 @@ func (m *Monitor) osdStatus() error {
}
func (m *Monitor) handleOSDMarkedOut(outOSDid int) error {
safeToDestroyOSD, err := client.OsdSafeToDestroy(m.context, m.clusterName, outOSDid)
label := fmt.Sprintf("ceph-osd-id=%d", outOSDid)
dp, err := k8sutil.GetDeployments(m.context.Clientset, m.clusterName, label)
if err != nil {
return err
if errors.IsNotFound(err) {
return nil
}
return fmt.Errorf("failed to get osd deployment of osd id %d: %+v", outOSDid, err)
}
if safeToDestroyOSD {
logger.Infof("osd.%d is 'safe-to-destroy'", outOSDid)
label := fmt.Sprintf("ceph-osd-id=%d", outOSDid)
dp, err := k8sutil.GetDeployments(m.context.Clientset, m.clusterName, label)
if len(dp.Items) != 0 {
safeToDestroyOSD, err := client.OsdSafeToDestroy(m.context, m.clusterName, outOSDid)
if err != nil {
if errors.IsNotFound(err) {
return nil
}
return fmt.Errorf("failed to get osd deployment of osd id %d: %+v", outOSDid, err)
return err
}
if len(dp.Items) != 0 {
if safeToDestroyOSD {
podCreationTimestamp := dp.Items[0].GetCreationTimestamp()
podDeletionTimeStamp := podCreationTimestamp.Add(graceTime)
currentTime := time.Now().UTC()
if podDeletionTimeStamp.Before(currentTime) {
logger.Infof("osd.%d is 'safe-to-destroy'. removing the osd deployment.", outOSDid)
if err := k8sutil.DeleteDeployment(m.context.Clientset, dp.Items[0].Namespace, dp.Items[0].Name); err != nil {
return fmt.Errorf("failed to delete osd deployment %s: %+v", dp.Items[0].Name, err)
}
......
......@@ -84,7 +84,7 @@ func TestOSDStatus(t *testing.T) {
assert.Equal(t, 1, len(dp.Items))
// Initializing an OSD monitoring
osdMon := NewMonitor(context, cluster)
osdMon := NewMonitor(context, cluster, true)
// Run OSD monitoring routine
err := osdMon.osdStatus()
......@@ -99,7 +99,7 @@ func TestOSDStatus(t *testing.T) {
func TestMonitorStart(t *testing.T) {
stopCh := make(chan struct{})
osdMon := NewMonitor(&clusterd.Context{}, "cluster")
osdMon := NewMonitor(&clusterd.Context{}, "cluster", true)
logger.Infof("starting osd monitor")
go osdMon.Start(stopCh)
close(stopCh)
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment