Unverified Commit 5adaff81 authored by Rohan CJ, committed by Blaine Gardner

Fix topologyAware on PVC-based OSDs

Signed-off-by: Rohan CJ <rohantmp@gmail.com>
parent 1647fb25
Showing 225 additions and 10 deletions
@@ -247,6 +247,19 @@ rules:
- "*"
{{- if ((.Values.agent) and .Values.agent.mountSecurityMode) and ne .Values.agent.mountSecurityMode "Any" }}
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1beta1
metadata:
name: rook-ceph-osd
rules:
- apiGroups:
- ""
resources:
- nodes
verbs:
- get
- list
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRole
metadata:
@@ -31,6 +31,20 @@ subjects:
name: rook-ceph-mgr
namespace: {{ .Release.Namespace }}
---
# Allow the ceph osd to access cluster-wide resources necessary for determining its topology location
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1beta1
metadata:
name: rook-ceph-osd
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: rook-ceph-osd
subjects:
- kind: ServiceAccount
name: rook-ceph-osd
namespace: {{ .Release.Namespace }}
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1beta1
metadata:
@@ -817,6 +817,20 @@ rules:
resources: ["cephclusters", "cephclusters/finalizers"]
verbs: [ "get", "list", "create", "update", "delete" ]
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1beta1
metadata:
name: rook-ceph-osd
namespace: rook-ceph
rules:
- apiGroups:
- ""
resources:
- nodes
verbs:
- get
- list
---
# Aspects of ceph-mgr that require access to the system namespace
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1beta1
@@ -976,6 +990,22 @@ subjects:
- kind: ServiceAccount
name: rook-ceph-mgr
namespace: rook-ceph
---
# Allow the ceph osd to access cluster-wide resources necessary for determining its topology location
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1beta1
metadata:
name: rook-ceph-osd
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: rook-ceph-osd
subjects:
- kind: ServiceAccount
name: rook-ceph-osd
namespace: rook-ceph
# OLM: END CLUSTER ROLEBINDING
# OLM: BEGIN CMD REPORTER ROLEBINDING
---
@@ -51,6 +51,7 @@ type config struct {
networkInfo clusterd.NetworkInfo
monEndpoints string
nodeName string
topologyAware bool
pvcBacked bool
}
@@ -22,6 +22,8 @@ import (
"strconv"
"strings"
"k8s.io/client-go/kubernetes"
"github.com/rook/rook/cmd/rook/rook"
"github.com/rook/rook/pkg/daemon/ceph/client"
osddaemon "github.com/rook/rook/pkg/daemon/ceph/osd"
@@ -32,6 +34,8 @@ import (
"github.com/rook/rook/pkg/operator/k8sutil"
"github.com/rook/rook/pkg/util/flags"
"github.com/spf13/cobra"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
@@ -82,7 +86,7 @@ func addOSDFlags(command *cobra.Command) {
provisionCmd.Flags().BoolVar(&cfg.forceFormat, "force-format", false,
"true to force the format of any specified devices, even if they already have a filesystem. BE CAREFUL!")
provisionCmd.Flags().BoolVar(&cfg.pvcBacked, "pvc-backed-osd", false, "true to specify a block mode pvc is backing the OSD")
provisionCmd.Flags().BoolVar(&cfg.topologyAware, "topology-aware", false, "true to specify that the crush location should be set based on the node's zone/region labels")
// flags for generating the osd config
osdConfigCmd.Flags().IntVar(&osdID, "osd-id", -1, "osd id for which to generate config")
osdConfigCmd.Flags().BoolVar(&osdIsDevice, "is-device", false, "whether the osd is a device")
@@ -195,11 +199,10 @@ func writeOSDConfig(cmd *cobra.Command, args []string) error {
context := createContext()
commonOSDInit(osdConfigCmd)
locArgs, err := client.FormatLocation(cfg.location, cfg.nodeName)
crushLocation, err := getLocation(context.Clientset, cfg.location, cfg.topologyAware)
if err != nil {
rook.TerminateFatal(fmt.Errorf("invalid location %s. %+v\n", cfg.location, err))
rook.TerminateFatal(err)
}
crushLocation := strings.Join(locArgs, " ")
kv := k8sutil.NewConfigMapKVStore(clusterInfo.Name, context.Clientset, metav1.OwnerReference{})
if err := osddaemon.WriteConfigFile(context, &clusterInfo, kv, osdID, osdIsDevice, cfg.storeConfig, cfg.nodeName, crushLocation); err != nil {
@@ -238,12 +241,11 @@ func prepareOSD(cmd *cobra.Command, args []string) error {
context := createContext()
commonOSDInit(provisionCmd)
locArgs, err := client.FormatLocation(cfg.location, cfg.nodeName)
crushLocation, err := getLocation(context.Clientset, cfg.location, cfg.topologyAware)
if err != nil {
rook.TerminateFatal(fmt.Errorf("invalid location. %+v", err))
rook.TerminateFatal(err)
}
crushLocation := strings.Join(locArgs, " ")
logger.Infof("crush location of osd: %s", crushLocation)
forceFormat := false
ownerRef := cluster.ClusterOwnerRef(clusterInfo.Name, ownerRefID)
@@ -274,6 +276,56 @@ func commonOSDInit(cmd *cobra.Command) {
clusterInfo.Monitors = mon.ParseMonEndpoints(cfg.monEndpoints)
}
func getLocation(clientset kubernetes.Interface, location string, topologyAware bool) (string, error) {
locArgs, err := client.FormatLocation(location, cfg.nodeName)
if err != nil {
return "", fmt.Errorf("invalid location. %+v", err)
}
// use zone/region/hostname labels in the crushmap
if topologyAware {
nodeName := os.Getenv(k8sutil.NodeNameEnvVar)
node, err := getNode(clientset, nodeName)
if err != nil {
return "", fmt.Errorf("topologyAware is set, but could not get the node: %+v", err)
}
nodeLabels := node.GetLabels()
// get zone
zone, ok := nodeLabels[corev1.LabelZoneFailureDomain]
if ok {
client.UpdateCrushMapValue(&locArgs, "zone", client.NormalizeCrushName(zone))
}
// get region
region, ok := nodeLabels[corev1.LabelZoneRegion]
if ok {
client.UpdateCrushMapValue(&locArgs, "region", client.NormalizeCrushName(region))
}
}
return strings.Join(locArgs, " "), nil
}
// getNode will try to get the node object for the provided nodeName.
// It first looks the node up by name, then falls back to its hostname label.
func getNode(clientset kubernetes.Interface, nodeName string) (*corev1.Node, error) {
var node *corev1.Node
var err error
// try to find the node by matching the provided nodeName
node, err = clientset.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{})
if errors.IsNotFound(err) {
listOpts := metav1.ListOptions{LabelSelector: fmt.Sprintf("%s=%s", corev1.LabelHostname, nodeName)}
nodeList, err := clientset.CoreV1().Nodes().List(listOpts)
if err != nil || len(nodeList.Items) < 1 {
return nil, fmt.Errorf("could not find node '%s'hostname label: %+v", nodeName, err)
}
return &nodeList.Items[0], nil
} else if err != nil {
return nil, fmt.Errorf("could not find node '%s' by name: %+v", nodeName, err)
}
return node, nil
}
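The lookup above (node name first, then the hostname label) can be exercised in isolation. The sketch below is not part of this commit: it mirrors getNode with a local helper against a fake clientset, the node name and label values are made up, and it assumes the same pre-context client-go API that this file uses.

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/kubernetes/fake"
)

// lookupNode mirrors getNode above: try the node name first, then fall back
// to the kubernetes.io/hostname label.
func lookupNode(clientset kubernetes.Interface, nodeName string) (*corev1.Node, error) {
	node, err := clientset.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{})
	if errors.IsNotFound(err) {
		opts := metav1.ListOptions{LabelSelector: fmt.Sprintf("%s=%s", corev1.LabelHostname, nodeName)}
		nodes, listErr := clientset.CoreV1().Nodes().List(opts)
		if listErr != nil || len(nodes.Items) < 1 {
			return nil, fmt.Errorf("could not find node '%s' by hostname label: %+v", nodeName, listErr)
		}
		return &nodes.Items[0], nil
	}
	return node, err
}

func main() {
	// A hypothetical node whose object name differs from its hostname label.
	clientset := fake.NewSimpleClientset(&corev1.Node{
		ObjectMeta: metav1.ObjectMeta{
			Name: "ip-10-0-132-84.us-east-2.compute.internal",
			Labels: map[string]string{
				corev1.LabelHostname:          "worker-1",
				corev1.LabelZoneFailureDomain: "us-east-2b",
				corev1.LabelZoneRegion:        "us-east-2",
			},
		},
	})

	// Only the hostname label matches, so the fallback path is taken.
	node, err := lookupNode(clientset, "worker-1")
	if err != nil {
		panic(err)
	}
	labels := node.GetLabels()
	fmt.Println("zone:", labels[corev1.LabelZoneFailureDomain])
	fmt.Println("region:", labels[corev1.LabelZoneRegion])
}

With labels like these, getLocation above would fold zone=us-east-2b and region=us-east-2 into the crush location.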
// Parse the devices, which are comma separated. A colon indicates a non-default number of osds per device
// or a non collocated metadata device.
// For example, one osd will be created on each of sda and sdb, with 5 osds on the nvme01 device.
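The devices format described in this comment can be illustrated with a small standalone sketch. It is not Rook's parser (which is elided here); the example string and the reading of the colon below are assumptions based only on the comment's wording.

package main

import (
	"fmt"
	"strconv"
	"strings"
)

func main() {
	// Hypothetical devices argument: one OSD each on sda and sdb, five OSDs on nvme01.
	devices := "sda,sdb,nvme01:5"

	for _, entry := range strings.Split(devices, ",") {
		parts := strings.SplitN(entry, ":", 2)
		device, osdsPerDevice := parts[0], 1
		if len(parts) == 2 {
			// Assumption: a numeric value after the colon is an OSD count;
			// a non-numeric value would name a metadata device instead.
			if n, err := strconv.Atoi(parts[1]); err == nil {
				osdsPerDevice = n
			}
		}
		fmt.Printf("device=%s osds=%d\n", device, osdsPerDevice)
	}
}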
@@ -154,13 +154,44 @@ func FormatLocation(location, hostName string) ([]string, error) {
// set the host name
if !isCrushFieldSet("host", pairs) {
// keep the fully qualified host name in the crush map, but replace the dots with dashes to satisfy ceph
hostName = strings.Replace(hostName, ".", "-", -1)
hostName = NormalizeCrushName(hostName)
pairs = append(pairs, formatProperty("host", hostName))
}
return pairs, nil
}
// NormalizeCrushName replaces . with -
func NormalizeCrushName(name string) string {
return strings.Replace(name, ".", "-", -1)
}
// IsNormalizedCrushNameEqual returns true if normalized is equal to notNormalized or to its normalized form.
// A crush name is normalized if it comes from the crushmap or has passed through NormalizeCrushName.
func IsNormalizedCrushNameEqual(notNormalized, normalized string) bool {
if notNormalized == normalized || NormalizeCrushName(notNormalized) == normalized {
return true
}
return false
}
// UpdateCrushMapValue updates or appends a key=value pair in the output of FormatLocation(location, hostName).
// It is not safe for incorrectly formatted strings.
func UpdateCrushMapValue(pairs *[]string, key, value string) {
found := false
property := formatProperty(key, value)
for i, pair := range *pairs {
entry := strings.Split(pair, "=")
if key == entry[0] {
(*pairs)[i] = property
found = true
}
}
if !found {
*pairs = append(*pairs, property)
}
}
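As a quick illustration of the update-or-append behavior of UpdateCrushMapValue, the sketch below (not part of this commit) feeds it a pairs slice of the kind FormatLocation returns; the starting values are hypothetical, and it assumes the rook module is on the import path.

package main

import (
	"fmt"
	"strings"

	"github.com/rook/rook/pkg/daemon/ceph/client"
)

func main() {
	// Hypothetical output of FormatLocation: host and zone already set, no region yet.
	pairs := []string{"root=default", "host=worker-1", "zone=old-zone"}

	// An existing key is replaced in place...
	client.UpdateCrushMapValue(&pairs, "zone", client.NormalizeCrushName("us-east-2b"))
	// ...and a missing key is appended.
	client.UpdateCrushMapValue(&pairs, "region", client.NormalizeCrushName("us-east-2"))

	// Prints: root=default host=worker-1 zone=us-east-2b region=us-east-2
	fmt.Println(strings.Join(pairs, " "))
}

This mirrors how getLocation above folds the node's zone/region labels into the crush location string.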
func isValidCrushFieldFormat(pair string) bool {
matched, err := regexp.MatchString("^.+=.+$", pair)
return matched && err == nil
@@ -273,3 +273,39 @@ func TestCrushLocation(t *testing.T) {
assert.NotNil(t, err)
assert.Contains(t, err.Error(), "is not in a valid format")
}
func TestCrushName(t *testing.T) {
// each is slightly different from the last
crushNames := []string{
"www.zxyz.com",
"www.abcd.com",
"ip-10-0-132-84.us-east-2.compute.internal",
"ip-10-0-132-85.us-east-2.compute.internal",
"worker1",
"worker2",
"master1",
"master2",
"us-east-2b",
"us-east-2c",
"us-east-1",
"us-east-2",
"ip-10-0-175-140",
"ip-10-0-175-141",
}
for i, crushName := range crushNames {
normalizedCrushName := NormalizeCrushName(crushName)
fmt.Printf("crushName: %s, normalizedCrushName: %s\n", crushName, normalizedCrushName)
assert.True(t, IsNormalizedCrushNameEqual(crushName, normalizedCrushName))
assert.True(t, IsNormalizedCrushNameEqual(crushName, crushName))
assert.True(t, IsNormalizedCrushNameEqual(normalizedCrushName, normalizedCrushName))
if i > 0 {
// slightly different crush name
differentCrushName := crushNames[i-1]
differentNormalizedCrushName := NormalizeCrushName(differentCrushName)
assert.False(t, IsNormalizedCrushNameEqual(crushName, differentNormalizedCrushName))
assert.False(t, IsNormalizedCrushNameEqual(crushName, differentCrushName))
assert.False(t, IsNormalizedCrushNameEqual(normalizedCrushName, differentNormalizedCrushName))
}
}
}
@@ -50,6 +50,7 @@ const (
osdMetadataDeviceEnvVarName = "ROOK_METADATA_DEVICE"
pvcBackedOSDVarName = "ROOK_PVC_BACKED_OSD"
lvPathVarName = "ROOK_LV_PATH"
topologyAwareEnvVarName = "ROOK_TOPOLOGY_AWARE"
rookBinariesMountPath = "/rook"
rookBinariesVolumeName = "rook-binaries"
blockPVCMapperInitContainer = "blkdevmapper"
@@ -507,6 +508,11 @@ func (c *Cluster) getConfigEnvVars(storeConfig config.StoreConfig, dataDir, node
Key: "fsid",
},
}},
k8sutil.NodeEnvVar(),
}
// pass on the topologyAware flag to the provision pod so that portable OSDs can reconcile zone/region
if c.DesiredStorage.TopologyAware {
envVars = append(envVars, topologyAwareEnvVar("true"))
}
if storeConfig.StoreType != "" {
@@ -543,6 +549,7 @@
func (c *Cluster) provisionOSDContainer(osdProps osdProperties, copyBinariesMount v1.VolumeMount) v1.Container {
envVars := c.getConfigEnvVars(osdProps.storeConfig, k8sutil.DataDir, osdProps.crushHostname, osdProps.location)
devMountNeeded := false
privileged := false
@@ -714,6 +721,10 @@ func lvPathEnvVariable(lvPath string) v1.EnvVar {
return v1.EnvVar{Name: lvPathVarName, Value: lvPath}
}
func topologyAwareEnvVar(topologyAware string) v1.EnvVar {
return v1.EnvVar{Name: topologyAwareEnvVarName, Value: topologyAware}
}
func getDirectoriesFromContainer(osdContainer v1.Container) []rookalpha.Directory {
var dirsArg string
for _, envVar := range osdContainer.Env {
@@ -171,7 +171,7 @@ func getOSDsForNodes(osdDataList []OsdData, nodeList []*corev1.Node, failureDoma
if !ok {
return nil, fmt.Errorf("could not find the %s label on node %s", failureDomainLabel, node.ObjectMeta.Name)
}
if crushFailureDomain == nodeFailureDomain || secondaryCrushHostname == nodeFailureDomain {
if cephClient.IsNormalizedCrushNameEqual(nodeFailureDomain, crushFailureDomain) || cephClient.IsNormalizedCrushNameEqual(secondaryCrushHostname, crushFailureDomain) {
nodeOsdDataList = append(nodeOsdDataList, osdData)
}
}
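The switch to IsNormalizedCrushNameEqual matters because crushmap names are normalized (dots replaced with dashes, per NormalizeCrushName) while node labels keep their original form, so plain string equality fails for dotted hostnames. A minimal sketch with a made-up hostname, assuming the rook module is importable:

package main

import (
	"fmt"

	"github.com/rook/rook/pkg/daemon/ceph/client"
)

func main() {
	// Hypothetical node label value and its normalized crushmap counterpart.
	nodeLabel := "ip-10-0-132-84.us-east-2.compute.internal"
	crushName := client.NormalizeCrushName(nodeLabel)

	fmt.Println(nodeLabel == crushName)                                  // false: dots vs dashes
	fmt.Println(client.IsNormalizedCrushNameEqual(nodeLabel, crushName)) // true
}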
@@ -696,6 +696,33 @@ rules:
- get
- watch
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1beta1
metadata:
name: rook-ceph-osd
rules:
- apiGroups:
- ""
resources:
- nodes
verbs:
- get
- list
---
# Allow the ceph osd to access cluster-wide resources necessary for determining its topology location
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1beta1
metadata:
name: rook-ceph-osd
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: rook-ceph-osd
subjects:
- kind: ServiceAccount
name: rook-ceph-osd
namespace: ` + namespace + `
---
# Aspects of Rook Ceph Agent that require access to secrets
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRole