Commit 932d7e2d authored by rohan47's avatar rohan47 Committed by mergify-bot
Browse files

Ceph: Added removeOSDsIfOutAndSafeToRemove to Cluster CR


OSDs can be removed automatically with the current mechanism if a new
setting removeOSDsIfOutAndSafeToRemove is set to true. The default for
all new or upgraded clusters should be false.
Signed-off-by: default avatarrohan47 <rohgupta@redhat.com>
(cherry picked from commit 7f9611d4)
parent da6b8592
Showing with 42 additions and 22 deletions
+42 -22
......@@ -136,6 +136,7 @@ For more details on the mons and when to choose a number other than `3`, see the
- `osdMaintenanceTimeout`: is a duration in minutes that determines how long an entire failureDomain like `region/zone/host` will be held in `noout` (in addition to the default DOWN/OUT interval) when it is draining. This is only relevant when `managePodBudgets` is `true`. The default value is `30` minutes.
- `manageMachineDisruptionBudgets`: if `true`, the operator will create and manage MachineDisruptionBudgets to ensure OSDs are only fenced when the cluster is healthy. Only available on OpenShift.
- `machineDisruptionBudgetNamespace`: the namespace in which to watch the MachineDisruptionBudgets.
- `removeOSDsIfOutAndSafeToRemove`: If `true` the operator will remove the OSDs that are down and whose data has been restored to other OSDs. In Ceph terms, the OSDs are `out` and `safe-to-destroy` when they would be removed.
### Mon Settings
......
......@@ -115,6 +115,8 @@ spec:
# mon:
# osd:
# prepareosd:
# The option to automatically remove OSDs that are out and are safe to destroy.
removeOSDsIfOutAndSafeToRemove: false
storage: # cluster level storage configuration and selection
useAllNodes: true
useAllDevices: true
......
......@@ -95,6 +95,9 @@ type ClusterSpec struct {
// A spec for mgr related options
Mgr MgrSpec `json:"mgr,omitempty"`
// Remove the OSD that is out and safe to remove only if this option is true
RemoveOSDsIfOutAndSafeToRemove bool `json:"removeOSDsIfOutAndSafeToRemove"`
}
// VersionSpec represents the settings for the Ceph version that Rook is orchestrating.
......
......@@ -106,6 +106,7 @@ type ClusterController struct {
addClusterCallbacks []func(*cephv1.ClusterSpec) error
csiConfigMutex *sync.Mutex
nodeStore cache.Store
osdChecker *osd.Monitor
}
// NewClusterController create controller for watching cluster custom resources created
......@@ -439,8 +440,8 @@ func (c *ClusterController) initializeCluster(cluster *cluster, clusterObj *ceph
if !cluster.Spec.External.Enable {
// Start the osd health checker only if running OSDs in the local ceph cluster
osdChecker := osd.NewMonitor(c.context, cluster.Namespace)
go osdChecker.Start(cluster.stopCh)
c.osdChecker = osd.NewMonitor(c.context, cluster.Namespace, cluster.Spec.RemoveOSDsIfOutAndSafeToRemove)
go c.osdChecker.Start(cluster.stopCh)
}
// Start the ceph status checker
......@@ -565,6 +566,11 @@ func (c *ClusterController) onUpdate(oldObj, newObj interface{}) {
logger.Infof("update event for cluster %s is supported, orchestrating update now", newClust.Namespace)
if oldClust.Spec.RemoveOSDsIfOutAndSafeToRemove != newClust.Spec.RemoveOSDsIfOutAndSafeToRemove {
logger.Infof("removeOSDsIfOutAndSafeToRemove is set to %t", newClust.Spec.RemoveOSDsIfOutAndSafeToRemove)
c.osdChecker.Update(newClust.Spec.RemoveOSDsIfOutAndSafeToRemove)
}
// if the image changed, we need to detect the new image version
versionChanged := false
if oldClust.Spec.CephVersion.Image != newClust.Spec.CephVersion.Image {
......
......@@ -38,13 +38,14 @@ var (
// Monitor defines OSD process monitoring
type Monitor struct {
context *clusterd.Context
clusterName string
context *clusterd.Context
clusterName string
removeOSDsIfOUTAndSafeToRemove bool
}
// NewMonitor instantiates OSD monitoring
func NewMonitor(context *clusterd.Context, clusterName string) *Monitor {
return &Monitor{context, clusterName}
func NewMonitor(context *clusterd.Context, clusterName string, removeOSDsIfOUTAndSafeToRemove bool) *Monitor {
return &Monitor{context, clusterName, removeOSDsIfOUTAndSafeToRemove}
}
// Start runs monitoring logic for osds status at set intervals
......@@ -66,6 +67,11 @@ func (m *Monitor) Start(stopCh chan struct{}) {
}
}
// Update reconfigures whether the monitor is permitted to remove OSDs
// that are marked out and reported safe-to-destroy by Ceph. It mirrors
// the removeOSDsIfOutAndSafeToRemove setting from the cluster CR.
func (m *Monitor) Update(remove bool) {
	m.removeOSDsIfOUTAndSafeToRemove = remove
}
// OSDStatus validates osd dump output
func (m *Monitor) osdStatus() error {
osdDump, err := client.GetOSDDump(m.context, m.clusterName)
......@@ -97,8 +103,10 @@ func (m *Monitor) osdStatus() error {
if in != inStatus {
logger.Debugf("osd.%d is marked 'OUT'", id)
if err := m.handleOSDMarkedOut(id); err != nil {
logger.Errorf("Error handling marked out osd osd.%d: %v", id, err)
if m.removeOSDsIfOUTAndSafeToRemove {
if err := m.handleOSDMarkedOut(id); err != nil {
logger.Errorf("Error handling marked out osd osd.%d: %v", id, err)
}
}
}
}
......@@ -107,26 +115,26 @@ func (m *Monitor) osdStatus() error {
}
func (m *Monitor) handleOSDMarkedOut(outOSDid int) error {
safeToDestroyOSD, err := client.OsdSafeToDestroy(m.context, m.clusterName, outOSDid)
label := fmt.Sprintf("ceph-osd-id=%d", outOSDid)
dp, err := k8sutil.GetDeployments(m.context.Clientset, m.clusterName, label)
if err != nil {
return err
if errors.IsNotFound(err) {
return nil
}
return fmt.Errorf("failed to get osd deployment of osd id %d: %+v", outOSDid, err)
}
if safeToDestroyOSD {
logger.Infof("osd.%d is 'safe-to-destroy'", outOSDid)
label := fmt.Sprintf("ceph-osd-id=%d", outOSDid)
dp, err := k8sutil.GetDeployments(m.context.Clientset, m.clusterName, label)
if len(dp.Items) != 0 {
safeToDestroyOSD, err := client.OsdSafeToDestroy(m.context, m.clusterName, outOSDid)
if err != nil {
if errors.IsNotFound(err) {
return nil
}
return fmt.Errorf("failed to get osd deployment of osd id %d: %+v", outOSDid, err)
return err
}
if len(dp.Items) != 0 {
if safeToDestroyOSD {
podCreationTimestamp := dp.Items[0].GetCreationTimestamp()
podDeletionTimeStamp := podCreationTimestamp.Add(graceTime)
currentTime := time.Now().UTC()
if podDeletionTimeStamp.Before(currentTime) {
logger.Infof("osd.%d is 'safe-to-destroy'. removing the osd deployment.", outOSDid)
if err := k8sutil.DeleteDeployment(m.context.Clientset, dp.Items[0].Namespace, dp.Items[0].Name); err != nil {
return fmt.Errorf("failed to delete osd deployment %s: %+v", dp.Items[0].Name, err)
}
......
......@@ -84,7 +84,7 @@ func TestOSDStatus(t *testing.T) {
assert.Equal(t, 1, len(dp.Items))
// Initializing an OSD monitoring
osdMon := NewMonitor(context, cluster)
osdMon := NewMonitor(context, cluster, true)
// Run OSD monitoring routine
err := osdMon.osdStatus()
......@@ -99,7 +99,7 @@ func TestOSDStatus(t *testing.T) {
func TestMonitorStart(t *testing.T) {
stopCh := make(chan struct{})
osdMon := NewMonitor(&clusterd.Context{}, "cluster")
osdMon := NewMonitor(&clusterd.Context{}, "cluster", true)
logger.Infof("starting osd monitor")
go osdMon.Start(stopCh)
close(stopCh)
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment