Unverified Commit d8344393 authored by Alex Dadgar's avatar Alex Dadgar Committed by GitHub
Browse files

Merge pull request #4277 from hashicorp/f-retry-join-clients

Add go-discover support to Nomad clients
parents 8c881465 84b4e2c0
Showing with 1109 additions and 257 deletions
+1109 -257
......@@ -283,7 +283,7 @@ func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulServic
// Set the preconfigured list of static servers
c.configLock.RLock()
if len(c.configCopy.Servers) > 0 {
if err := c.setServersImpl(c.configCopy.Servers, true); err != nil {
if _, err := c.setServersImpl(c.configCopy.Servers, true); err != nil {
logger.Printf("[WARN] client: None of the configured servers are valid: %v", err)
}
}
......@@ -623,7 +623,7 @@ func (c *Client) GetServers() []string {
// SetServers sets a new list of nomad servers to connect to. As long as one
// server is resolvable no error is returned.
func (c *Client) SetServers(in []string) error {
func (c *Client) SetServers(in []string) (int, error) {
return c.setServersImpl(in, false)
}
......@@ -633,7 +633,7 @@ func (c *Client) SetServers(in []string) error {
//
// Force should be used when setting the servers from the initial configuration
// since the server may be starting up in parallel and initial pings may fail.
func (c *Client) setServersImpl(in []string, force bool) error {
func (c *Client) setServersImpl(in []string, force bool) (int, error) {
var mu sync.Mutex
var wg sync.WaitGroup
var merr multierror.Error
......@@ -673,13 +673,13 @@ func (c *Client) setServersImpl(in []string, force bool) error {
// Only return errors if no servers are valid
if len(endpoints) == 0 {
if len(merr.Errors) > 0 {
return merr.ErrorOrNil()
return 0, merr.ErrorOrNil()
}
return noServersErr
return 0, noServersErr
}
c.servers.SetServers(endpoints)
return nil
return len(endpoints), nil
}
// restoreState is used to restore our state from the data dir
......
......@@ -975,13 +975,13 @@ func TestClient_ServerList(t *testing.T) {
if s := client.GetServers(); len(s) != 0 {
t.Fatalf("expected server lit to be empty but found: %+q", s)
}
if err := client.SetServers(nil); err != noServersErr {
if _, err := client.SetServers(nil); err != noServersErr {
t.Fatalf("expected setting an empty list to return a 'no servers' error but received %v", err)
}
if err := client.SetServers([]string{"123.456.13123.123.13:80"}); err == nil {
if _, err := client.SetServers([]string{"123.456.13123.123.13:80"}); err == nil {
t.Fatalf("expected setting a bad server to return an error")
}
if err := client.SetServers([]string{"123.456.13123.123.13:80", "127.0.0.1:1234", "127.0.0.1"}); err == nil {
if _, err := client.SetServers([]string{"123.456.13123.123.13:80", "127.0.0.1:1234", "127.0.0.1"}); err == nil {
t.Fatalf("expected setting at least one good server to succeed but received: %v", err)
}
s := client.GetServers()
......
......@@ -222,7 +222,7 @@ func (s *HTTPServer) updateServers(resp http.ResponseWriter, req *http.Request)
// Set the servers list into the client
s.agent.logger.Printf("[TRACE] Adding servers %+q to the client's primary server list", servers)
if err := client.SetServers(servers); err != nil {
if _, err := client.SetServers(servers); err != nil {
s.agent.logger.Printf("[ERR] Attempt to add servers %q to client failed: %v", servers, err)
//TODO is this the right error to return?
return nil, CodedError(400, err.Error())
......
......@@ -63,9 +63,11 @@ func (c *Command) readConfig() *Config {
Client: &ClientConfig{},
Consul: &config.ConsulConfig{},
Ports: &Ports{},
Server: &ServerConfig{},
Vault: &config.VaultConfig{},
ACL: &ACLConfig{},
Server: &ServerConfig{
ServerJoin: &ServerJoin{},
},
Vault: &config.VaultConfig{},
ACL: &ACLConfig{},
}
flags := flag.NewFlagSet("agent", flag.ContinueOnError)
......@@ -78,13 +80,16 @@ func (c *Command) readConfig() *Config {
// Server-only options
flags.IntVar(&cmdConfig.Server.BootstrapExpect, "bootstrap-expect", 0, "")
flags.BoolVar(&cmdConfig.Server.RejoinAfterLeave, "rejoin", false, "")
flags.Var((*flaghelper.StringFlag)(&cmdConfig.Server.StartJoin), "join", "")
flags.Var((*flaghelper.StringFlag)(&cmdConfig.Server.RetryJoin), "retry-join", "")
flags.IntVar(&cmdConfig.Server.RetryMaxAttempts, "retry-max", 0, "")
flags.StringVar(&cmdConfig.Server.RetryInterval, "retry-interval", "", "")
flags.StringVar(&cmdConfig.Server.EncryptKey, "encrypt", "", "gossip encryption key")
flags.IntVar(&cmdConfig.Server.RaftProtocol, "raft-protocol", 0, "")
flags.BoolVar(&cmdConfig.Server.RejoinAfterLeave, "rejoin", false, "")
flags.Var((*flaghelper.StringFlag)(&cmdConfig.Server.ServerJoin.StartJoin), "join", "")
flags.Var((*flaghelper.StringFlag)(&cmdConfig.Server.ServerJoin.RetryJoin), "retry-join", "")
flags.IntVar(&cmdConfig.Server.ServerJoin.RetryMaxAttempts, "retry-max", 0, "")
flags.Var((flaghelper.FuncDurationVar)(func(d time.Duration) error {
cmdConfig.Server.ServerJoin.RetryInterval = d
return nil
}), "retry-interval", "")
// Client-only options
flags.StringVar(&cmdConfig.Client.StateDir, "state-dir", "", "")
......@@ -267,14 +272,6 @@ func (c *Command) readConfig() *Config {
}
}
// Parse the RetryInterval.
dur, err := time.ParseDuration(config.Server.RetryInterval)
if err != nil {
c.Ui.Error(fmt.Sprintf("Error parsing retry interval: %s", err))
return nil
}
config.Server.retryInterval = dur
// Check that the server is running in at least one mode.
if !(config.Server.Enabled || config.Client.Enabled) {
c.Ui.Error("Must specify either server, client or dev mode for the agent.")
......@@ -547,20 +544,89 @@ func (c *Command) Run(args []string) int {
logGate.Flush()
// Start retry join process
c.retryJoinErrCh = make(chan struct{})
joiner := retryJoiner{
join: c.agent.server.Join,
discover: &discover.Discover{},
errCh: c.retryJoinErrCh,
logger: c.agent.logger,
if err := c.handleRetryJoin(config); err != nil {
c.Ui.Error(err.Error())
return 1
}
go joiner.RetryJoin(config)
// Wait for exit
return c.handleSignals()
}
// handleRetryJoin starts the background retry-join goroutines for the server
// and/or the client when retry_join addresses are configured. It returns an
// error only when the join configuration fails validation; the join attempts
// themselves run asynchronously and report exhaustion through
// c.retryJoinErrCh.
//
// NOTE(review): the server and client joiners share the same errCh. If both
// are enabled and each closes the channel on exhaustion, the second close
// would panic - confirm against retryJoiner.RetryJoin.
func (c *Command) handleRetryJoin(config *Config) error {
	c.retryJoinErrCh = make(chan struct{})

	// Deprecated server-level fields (retry_join, retry_max, retry_interval):
	// validate them, then fold them into the server_join stanza so the server
	// branch below handles both the old and the new form.
	if config.Server.Enabled && len(config.Server.RetryJoin) != 0 {
		joiner := retryJoiner{
			discover:      &discover.Discover{},
			errCh:         c.retryJoinErrCh,
			logger:        c.agent.logger,
			serverJoin:    c.agent.server.Join,
			serverEnabled: true,
		}

		if err := joiner.Validate(config); err != nil {
			return err
		}

		// The deprecated values are copied into the stanza below, so make sure
		// there is a stanza to copy into instead of dereferencing nil.
		if config.Server.ServerJoin == nil {
			config.Server.ServerJoin = &ServerJoin{}
		}

		// Remove the duplicate fields
		config.Server.ServerJoin.RetryJoin = config.Server.RetryJoin
		config.Server.RetryJoin = nil
		if config.Server.RetryMaxAttempts != 0 {
			config.Server.ServerJoin.RetryMaxAttempts = config.Server.RetryMaxAttempts
			config.Server.RetryMaxAttempts = 0
		}
		if config.Server.RetryInterval != 0 {
			config.Server.ServerJoin.RetryInterval = config.Server.RetryInterval
			config.Server.RetryInterval = 0
		}

		c.agent.logger.Printf("[WARN] agent: Using deprecated retry_join fields. Upgrade configuration to use server_join")
	}

	// Server: join the addresses from the server_join stanza.
	if config.Server.Enabled &&
		config.Server.ServerJoin != nil &&
		len(config.Server.ServerJoin.RetryJoin) != 0 {

		joiner := retryJoiner{
			discover:      &discover.Discover{},
			errCh:         c.retryJoinErrCh,
			logger:        c.agent.logger,
			serverJoin:    c.agent.server.Join,
			serverEnabled: true,
		}

		if err := joiner.Validate(config); err != nil {
			return err
		}

		go joiner.RetryJoin(config.Server.ServerJoin)
	}

	// Client: hand the discovered addresses to the client's server list.
	if config.Client.Enabled &&
		config.Client.ServerJoin != nil &&
		len(config.Client.ServerJoin.RetryJoin) != 0 {

		joiner := retryJoiner{
			discover:      &discover.Discover{},
			errCh:         c.retryJoinErrCh,
			logger:        c.agent.logger,
			clientJoin:    c.agent.client.SetServers,
			clientEnabled: true,
		}

		if err := joiner.Validate(config); err != nil {
			return err
		}

		go joiner.RetryJoin(config.Client.ServerJoin)
	}

	return nil
}
// handleSignals blocks until we get an exit-causing signal
func (c *Command) handleSignals() int {
signalCh := make(chan os.Signal, 4)
......@@ -831,12 +897,34 @@ func (c *Command) setupTelemetry(config *Config) (*metrics.InmemSink, error) {
}
func (c *Command) startupJoin(config *Config) error {
if len(config.Server.StartJoin) == 0 || !config.Server.Enabled {
// Nothing to do
if !config.Server.Enabled {
return nil
}
// Validate both old and new aren't being set
old := len(config.Server.StartJoin)
var new int
if config.Server.ServerJoin != nil {
new = len(config.Server.ServerJoin.StartJoin)
}
if old != 0 && new != 0 {
return fmt.Errorf("server_join and start_join cannot both be defined; prefer setting the server_join stanza")
}
// Nothing to do
if old+new == 0 {
return nil
}
// Combine the lists and join
joining := config.Server.StartJoin
if new != 0 {
joining = append(joining, config.Server.ServerJoin.StartJoin...)
}
c.Ui.Output("Joining cluster...")
n, err := c.agent.server.Join(config.Server.StartJoin)
n, err := c.agent.server.Join(joining)
if err != nil {
return err
}
......
......@@ -19,6 +19,7 @@ advertise {
rpc = "127.0.0.3"
serf = "127.0.0.4"
}
client {
enabled = true
state_dir = "/tmp/client-state"
......@@ -29,6 +30,11 @@ client {
foo = "bar"
baz = "zip"
}
server_join {
retry_join = [ "1.1.1.1", "2.2.2.2" ]
retry_max = 3
retry_interval = "15s"
}
options {
foo = "bar"
baz = "zip"
......@@ -49,17 +55,17 @@ client {
}
client_min_port = 1000
client_max_port = 2000
max_kill_timeout = "10s"
stats {
data_points = 35
collection_interval = "5s"
}
gc_interval = "6s"
gc_parallel_destroys = 6
gc_disk_usage_threshold = 82
gc_inode_usage_threshold = 91
gc_max_allocs = 50
no_host_uuid = false
max_kill_timeout = "10s"
stats {
data_points = 35
collection_interval = "5s"
}
gc_interval = "6s"
gc_parallel_destroys = 6
gc_disk_usage_threshold = 82
gc_inode_usage_threshold = 91
gc_max_allocs = 50
no_host_uuid = false
}
server {
enabled = true
......@@ -86,23 +92,28 @@ server {
redundancy_zone = "foo"
upgrade_version = "0.8.0"
encrypt = "abc"
server_join {
retry_join = [ "1.1.1.1", "2.2.2.2" ]
retry_max = 3
retry_interval = "15s"
}
}
acl {
enabled = true
token_ttl = "60s"
policy_ttl = "60s"
replication_token = "foobar"
enabled = true
token_ttl = "60s"
policy_ttl = "60s"
replication_token = "foobar"
}
telemetry {
statsite_address = "127.0.0.1:1234"
statsd_address = "127.0.0.1:2345"
prometheus_metrics = true
disable_hostname = true
collection_interval = "3s"
publish_allocation_metrics = true
publish_node_metrics = true
disable_tagged_metrics = true
backwards_compatible_metrics = true
collection_interval = "3s"
publish_allocation_metrics = true
publish_node_metrics = true
disable_tagged_metrics = true
backwards_compatible_metrics = true
}
leave_on_interrupt = true
leave_on_terminate = true
......@@ -114,68 +125,68 @@ http_api_response_headers {
Access-Control-Allow-Origin = "*"
}
consul {
server_service_name = "nomad"
server_http_check_name = "nomad-server-http-health-check"
server_serf_check_name = "nomad-server-serf-health-check"
server_rpc_check_name = "nomad-server-rpc-health-check"
client_service_name = "nomad-client"
client_http_check_name = "nomad-client-http-health-check"
address = "127.0.0.1:9500"
token = "token1"
auth = "username:pass"
ssl = true
verify_ssl = true
ca_file = "/path/to/ca/file"
cert_file = "/path/to/cert/file"
key_file = "/path/to/key/file"
server_auto_join = true
client_auto_join = true
auto_advertise = true
checks_use_advertise = true
server_service_name = "nomad"
server_http_check_name = "nomad-server-http-health-check"
server_serf_check_name = "nomad-server-serf-health-check"
server_rpc_check_name = "nomad-server-rpc-health-check"
client_service_name = "nomad-client"
client_http_check_name = "nomad-client-http-health-check"
address = "127.0.0.1:9500"
token = "token1"
auth = "username:pass"
ssl = true
verify_ssl = true
ca_file = "/path/to/ca/file"
cert_file = "/path/to/cert/file"
key_file = "/path/to/key/file"
server_auto_join = true
client_auto_join = true
auto_advertise = true
checks_use_advertise = true
}
vault {
address = "127.0.0.1:9500"
allow_unauthenticated = true
task_token_ttl = "1s"
enabled = false
token = "12345"
ca_file = "/path/to/ca/file"
ca_path = "/path/to/ca"
cert_file = "/path/to/cert/file"
key_file = "/path/to/key/file"
tls_server_name = "foobar"
tls_skip_verify = true
create_from_role = "test_role"
address = "127.0.0.1:9500"
allow_unauthenticated = true
task_token_ttl = "1s"
enabled = false
token = "12345"
ca_file = "/path/to/ca/file"
ca_path = "/path/to/ca"
cert_file = "/path/to/cert/file"
key_file = "/path/to/key/file"
tls_server_name = "foobar"
tls_skip_verify = true
create_from_role = "test_role"
}
tls {
http = true
rpc = true
verify_server_hostname = true
ca_file = "foo"
cert_file = "bar"
key_file = "pipe"
rpc_upgrade_mode = true
verify_https_client = true
tls_prefer_server_cipher_suites = true
tls_cipher_suites = "TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256"
tls_min_version = "tls12"
http = true
rpc = true
verify_server_hostname = true
ca_file = "foo"
cert_file = "bar"
key_file = "pipe"
rpc_upgrade_mode = true
verify_https_client = true
tls_prefer_server_cipher_suites = true
tls_cipher_suites = "TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256"
tls_min_version = "tls12"
}
sentinel {
import "foo" {
path = "foo"
args = ["a", "b", "c"]
}
import "bar" {
path = "bar"
args = ["x", "y", "z"]
}
import "foo" {
path = "foo"
args = ["a", "b", "c"]
}
import "bar" {
path = "bar"
args = ["x", "y", "z"]
}
}
autopilot {
cleanup_dead_servers = true
disable_upgrade_migration = true
last_contact_threshold = "12705s"
max_trailing_logs = 17849
enable_redundancy_zones = true
server_stabilization_time = "23057s"
enable_custom_upgrades = true
cleanup_dead_servers = true
disable_upgrade_migration = true
last_contact_threshold = "12705s"
max_trailing_logs = 17849
enable_redundancy_zones = true
server_stabilization_time = "23057s"
enable_custom_upgrades = true
}
......@@ -217,6 +217,9 @@ type ClientConfig struct {
// NoHostUUID disables using the host's UUID and will force generation of a
// random UUID.
NoHostUUID *bool `mapstructure:"no_host_uuid"`
// ServerJoin contains information that is used to attempt to join servers
ServerJoin *ServerJoin `mapstructure:"server_join"`
}
// ACLConfig is configuration specific to the ACL system
......@@ -311,21 +314,24 @@ type ServerConfig struct {
// StartJoin is a list of addresses to attempt to join when the
// agent starts. If Serf is unable to communicate with any of these
// addresses, then the agent will error and exit.
// Deprecated in Nomad 0.10
StartJoin []string `mapstructure:"start_join"`
// RetryJoin is a list of addresses to join with retry enabled.
// Deprecated in Nomad 0.10
RetryJoin []string `mapstructure:"retry_join"`
// RetryMaxAttempts specifies the maximum number of times to retry joining a
// host on startup. This is useful for cases where we know the node will be
// online eventually.
// Deprecated in Nomad 0.10
RetryMaxAttempts int `mapstructure:"retry_max"`
// RetryInterval specifies the amount of time to wait in between join
// attempts on agent start. The minimum allowed value is 1 second and
// the default is 30s.
RetryInterval string `mapstructure:"retry_interval"`
retryInterval time.Duration `mapstructure:"-"`
// Deprecated in Nomad 0.10
RetryInterval time.Duration `mapstructure:"retry_interval"`
// RejoinAfterLeave controls our interaction with the cluster after leave.
// When set to false (default), a leave causes Consul to not rejoin
......@@ -346,6 +352,59 @@ type ServerConfig struct {
// Encryption key to use for the Serf communication
EncryptKey string `mapstructure:"encrypt" json:"-"`
// ServerJoin contains information that is used to attempt to join servers
ServerJoin *ServerJoin `mapstructure:"server_join"`
}
// ServerJoin is used in both clients and servers to bootstrap connections to
// servers
type ServerJoin struct {
	// StartJoin is a list of addresses to attempt to join when the
	// agent starts. If Serf is unable to communicate with any of these
	// addresses, then the agent will error and exit.
	StartJoin []string `mapstructure:"start_join"`

	// RetryJoin is a list of addresses to join with retry enabled, or a single
	// value to find multiple servers using go-discover syntax.
	RetryJoin []string `mapstructure:"retry_join"`

	// RetryMaxAttempts specifies the maximum number of times to retry joining a
	// host on startup. This is useful for cases where we know the node will be
	// online eventually.
	RetryMaxAttempts int `mapstructure:"retry_max"`

	// RetryInterval specifies the amount of time to wait in between join
	// attempts on agent start. The minimum allowed value is 1 second and
	// the default is 30s.
	RetryInterval time.Duration `mapstructure:"retry_interval"`
}

// Merge combines s with b and returns the result as a newly allocated
// ServerJoin; neither input struct is modified. Every non-zero field of b
// (non-empty list, non-zero count or interval) overrides the corresponding
// field of s. A nil input is treated as "nothing set"; the result is nil only
// when both s and b are nil.
//
// The returned struct is always a copy, but its slices still share backing
// storage with whichever input they came from.
func (s *ServerJoin) Merge(b *ServerJoin) *ServerJoin {
	if s == nil {
		if b == nil {
			return nil
		}
		// Copy b instead of returning it so that later mutation of the merged
		// config cannot leak back into the source config.
		result := *b
		return &result
	}

	result := *s
	if b == nil {
		return &result
	}

	if len(b.StartJoin) != 0 {
		result.StartJoin = b.StartJoin
	}
	if len(b.RetryJoin) != 0 {
		result.RetryJoin = b.RetryJoin
	}
	if b.RetryMaxAttempts != 0 {
		result.RetryMaxAttempts = b.RetryMaxAttempts
	}
	if b.RetryInterval != 0 {
		result.RetryInterval = b.RetryInterval
	}

	return &result
}
// EncryptBytes returns the encryption key configured.
......@@ -601,13 +660,20 @@ func DefaultConfig() *Config {
GCInodeUsageThreshold: 70,
GCMaxAllocs: 50,
NoHostUUID: helper.BoolToPtr(true),
ServerJoin: &ServerJoin{
RetryJoin: []string{},
RetryInterval: 30 * time.Second,
RetryMaxAttempts: 0,
},
},
Server: &ServerConfig{
Enabled: false,
StartJoin: []string{},
RetryJoin: []string{},
RetryInterval: "30s",
RetryMaxAttempts: 0,
Enabled: false,
StartJoin: []string{},
ServerJoin: &ServerJoin{
RetryJoin: []string{},
RetryInterval: 30 * time.Second,
RetryMaxAttempts: 0,
},
},
ACL: &ACLConfig{
Enabled: false,
......@@ -1036,9 +1102,8 @@ func (a *ServerConfig) Merge(b *ServerConfig) *ServerConfig {
if b.RetryMaxAttempts != 0 {
result.RetryMaxAttempts = b.RetryMaxAttempts
}
if b.RetryInterval != "" {
if b.RetryInterval != 0 {
result.RetryInterval = b.RetryInterval
result.retryInterval = b.retryInterval
}
if b.RejoinAfterLeave {
result.RejoinAfterLeave = true
......@@ -1055,6 +1120,9 @@ func (a *ServerConfig) Merge(b *ServerConfig) *ServerConfig {
if b.EncryptKey != "" {
result.EncryptKey = b.EncryptKey
}
if b.ServerJoin != nil {
result.ServerJoin = result.ServerJoin.Merge(b.ServerJoin)
}
// Add the schedulers
result.EnabledSchedulers = append(result.EnabledSchedulers, b.EnabledSchedulers...)
......@@ -1162,6 +1230,10 @@ func (a *ClientConfig) Merge(b *ClientConfig) *ClientConfig {
result.ChrootEnv[k] = v
}
if b.ServerJoin != nil {
result.ServerJoin = result.ServerJoin.Merge(b.ServerJoin)
}
return &result
}
......
......@@ -370,6 +370,7 @@ func parseClient(result **ClientConfig, list *ast.ObjectList) error {
"gc_parallel_destroys",
"gc_max_allocs",
"no_host_uuid",
"server_join",
}
if err := helper.CheckHCLKeys(listVal, valid); err != nil {
return err
......@@ -385,6 +386,7 @@ func parseClient(result **ClientConfig, list *ast.ObjectList) error {
delete(m, "chroot_env")
delete(m, "reserved")
delete(m, "stats")
delete(m, "server_join")
var config ClientConfig
dec, err := mapstructure.NewDecoder(&mapstructure.DecoderConfig{
......@@ -448,6 +450,13 @@ func parseClient(result **ClientConfig, list *ast.ObjectList) error {
}
}
// Parse ServerJoin config
if o := listVal.Filter("server_join"); len(o.Items) > 0 {
if err := parseServerJoin(&config.ServerJoin, o); err != nil {
return multierror.Prefix(err, "server_join->")
}
}
*result = &config
return nil
}
......@@ -531,16 +540,20 @@ func parseServer(result **ServerConfig, list *ast.ObjectList) error {
"heartbeat_grace",
"min_heartbeat_ttl",
"max_heartbeats_per_second",
"start_join",
"retry_join",
"retry_max",
"retry_interval",
"rejoin_after_leave",
"encrypt",
"authoritative_region",
"non_voting_server",
"redundancy_zone",
"upgrade_version",
"server_join",
// For backwards compatibility
"start_join",
"retry_join",
"retry_max",
"retry_interval",
}
if err := helper.CheckHCLKeys(listVal, valid); err != nil {
return err
......@@ -551,6 +564,8 @@ func parseServer(result **ServerConfig, list *ast.ObjectList) error {
return err
}
delete(m, "server_join")
var config ServerConfig
dec, err := mapstructure.NewDecoder(&mapstructure.DecoderConfig{
DecodeHook: mapstructure.StringToTimeDurationHookFunc(),
......@@ -570,10 +585,59 @@ func parseServer(result **ServerConfig, list *ast.ObjectList) error {
}
}
// Parse ServerJoin config
if o := listVal.Filter("server_join"); len(o.Items) > 0 {
if err := parseServerJoin(&config.ServerJoin, o); err != nil {
return multierror.Prefix(err, "server_join->")
}
}
*result = &config
return nil
}
// parseServerJoin decodes a single server_join block from list and stores the
// decoded value in *result. It returns an error when the block is labeled,
// appears more than once, contains an unknown key, or fails to decode.
// Duration strings such as "15s" are converted to time.Duration by the decode
// hook.
func parseServerJoin(result **ServerJoin, list *ast.ObjectList) error {
	list = list.Elem()
	switch {
	case len(list.Items) == 0:
		// Elem only keeps items that have no remaining keys, so a labeled
		// block (server_join "x" { ... }) leaves nothing to index below.
		return fmt.Errorf("'server_join' must be an unlabeled block")
	case len(list.Items) > 1:
		return fmt.Errorf("only one 'server_join' block allowed")
	}

	// Get our object
	listVal := list.Items[0].Val

	// Check for invalid keys
	valid := []string{
		"start_join",
		"retry_join",
		"retry_max",
		"retry_interval",
	}
	if err := helper.CheckHCLKeys(listVal, valid); err != nil {
		return err
	}

	var m map[string]interface{}
	if err := hcl.DecodeObject(&m, listVal); err != nil {
		return err
	}

	var serverJoinInfo ServerJoin
	dec, err := mapstructure.NewDecoder(&mapstructure.DecoderConfig{
		DecodeHook:       mapstructure.StringToTimeDurationHookFunc(),
		WeaklyTypedInput: true,
		Result:           &serverJoinInfo,
	})
	if err != nil {
		return err
	}
	if err := dec.Decode(m); err != nil {
		return err
	}

	*result = &serverJoinInfo
	return nil
}
func parseACL(result **ACLConfig, list *ast.ObjectList) error {
list = list.Elem()
if len(list.Items) > 1 {
......
......@@ -47,6 +47,11 @@ func TestConfig_Parse(t *testing.T) {
AllocDir: "/tmp/alloc",
Servers: []string{"a.b.c:80", "127.0.0.1:1234"},
NodeClass: "linux-medium-64bit",
ServerJoin: &ServerJoin{
RetryJoin: []string{"1.1.1.1", "2.2.2.2"},
RetryInterval: time.Duration(15) * time.Second,
RetryMaxAttempts: 3,
},
Meta: map[string]string{
"foo": "bar",
"baz": "zip",
......@@ -99,13 +104,18 @@ func TestConfig_Parse(t *testing.T) {
MaxHeartbeatsPerSecond: 11.0,
RetryJoin: []string{"1.1.1.1", "2.2.2.2"},
StartJoin: []string{"1.1.1.1", "2.2.2.2"},
RetryInterval: "15s",
RetryInterval: 15 * time.Second,
RejoinAfterLeave: true,
RetryMaxAttempts: 3,
NonVotingServer: true,
RedundancyZone: "foo",
UpgradeVersion: "0.8.0",
EncryptKey: "abc",
ServerJoin: &ServerJoin{
RetryJoin: []string{"1.1.1.1", "2.2.2.2"},
RetryInterval: time.Duration(15) * time.Second,
RetryMaxAttempts: 3,
},
},
ACL: &ACLConfig{
Enabled: true,
......
......@@ -14,6 +14,7 @@ import (
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/nomad/structs/config"
"github.com/stretchr/testify/require"
)
var (
......@@ -264,8 +265,7 @@ func TestConfig_Merge(t *testing.T) {
RejoinAfterLeave: true,
StartJoin: []string{"1.1.1.1"},
RetryJoin: []string{"1.1.1.1"},
RetryInterval: "10s",
retryInterval: time.Second * 10,
RetryInterval: time.Second * 10,
NonVotingServer: true,
RedundancyZone: "bar",
UpgradeVersion: "bar",
......@@ -907,3 +907,109 @@ func TestIsMissingPort(t *testing.T) {
t.Errorf("expected no error, but got %v", err)
}
}
// TestMergeServerJoin checks ServerJoin.Merge for every combination of set,
// empty and nil operands. All values are non-zero so that each merge branch,
// including RetryInterval, is actually exercised.
func TestMergeServerJoin(t *testing.T) {
	retryJoin := []string{"127.0.0.1", "127.0.0.2"}
	startJoin := []string{"127.0.0.1", "127.0.0.2"}
	retryMaxAttempts := 1
	retryInterval := 15 * time.Second

	// full builds a fresh, fully populated ServerJoin for each case so that
	// cases cannot influence each other through a shared pointer.
	full := func() *ServerJoin {
		return &ServerJoin{
			RetryJoin:        retryJoin,
			StartJoin:        startJoin,
			RetryMaxAttempts: retryMaxAttempts,
			RetryInterval:    retryInterval,
		}
	}

	cases := []struct {
		name string
		a, b *ServerJoin
	}{
		{
			name: "empty b keeps a",
			a:    full(),
			b:    &ServerJoin{},
		},
		{
			name: "empty a takes b",
			a:    &ServerJoin{},
			b:    full(),
		},
		{
			name: "nil a takes b",
			a:    nil,
			b:    full(),
		},
		{
			name: "nil b keeps a",
			a:    full(),
			b:    nil,
		},
		{
			name: "fields are combined",
			a: &ServerJoin{
				RetryJoin: retryJoin,
				StartJoin: startJoin,
			},
			b: &ServerJoin{
				RetryMaxAttempts: retryMaxAttempts,
				RetryInterval:    retryInterval,
			},
		},
		{
			name: "b overrides a",
			a: &ServerJoin{
				RetryJoin:        []string{"10.0.0.1"},
				StartJoin:        []string{"10.0.0.2"},
				RetryMaxAttempts: 7,
				RetryInterval:    time.Minute,
			},
			b: full(),
		},
	}

	for _, tc := range cases {
		tc := tc
		t.Run(tc.name, func(t *testing.T) {
			result := tc.a.Merge(tc.b)
			require.NotNil(t, result)
			require.Equal(t, retryJoin, result.RetryJoin)
			require.Equal(t, startJoin, result.StartJoin)
			require.Equal(t, retryMaxAttempts, result.RetryMaxAttempts)
			require.Equal(t, retryInterval, result.RetryInterval)
		})
	}
}
package agent
import (
"fmt"
"log"
"strings"
"time"
......@@ -27,8 +28,17 @@ type DiscoverInterface interface {
// retryJoiner is used to handle retrying a join until it succeeds or all of
// its tries are exhausted.
type retryJoiner struct {
// join adds the specified servers to the serf cluster
join func([]string) (int, error)
// serverJoin adds the specified servers to the serf cluster
serverJoin func([]string) (int, error)
// serverEnabled indicates whether the nomad agent will run in server mode
serverEnabled bool
// clientJoin sets the specified servers as the client's list of Nomad servers
clientJoin func([]string) (int, error)
// clientEnabled indicates whether the nomad agent will run in client mode
clientEnabled bool
// discover is of type Discover, where this is either the go-discover
// implementation or a mock used for testing
......@@ -42,23 +52,62 @@ type retryJoiner struct {
logger *log.Logger
}
// Validate ensures that the configuration passes validity checks for the
// retry_join stanza. If the configuration is not valid, returns an error that
// will be displayed to the operator, otherwise nil.
//
// The checks are:
//   - when the server's server_join stanza sets retry_join, none of the
//     deprecated server-level fields (retry_join, start_join, retry_max,
//     retry_interval) may be set, and the stanza may not also set start_join;
//   - a client's server_join stanza may not set start_join.
func (r *retryJoiner) Validate(config *Config) error {
	// If retry_join is defined for the server, ensure that deprecated
	// fields and the server_join stanza are not both set
	if config.Server != nil && config.Server.ServerJoin != nil && len(config.Server.ServerJoin.RetryJoin) != 0 {
		switch {
		case len(config.Server.RetryJoin) != 0:
			return fmt.Errorf("server_join and retry_join cannot both be defined; prefer setting the server_join stanza")
		case len(config.Server.StartJoin) != 0:
			return fmt.Errorf("server_join and start_join cannot both be defined; prefer setting the server_join stanza")
		case config.Server.RetryMaxAttempts != 0:
			return fmt.Errorf("server_join and retry_max cannot both be defined; prefer setting the server_join stanza")
		case config.Server.RetryInterval != 0:
			return fmt.Errorf("server_join and retry_interval cannot both be defined; prefer setting the server_join stanza")
		case len(config.Server.ServerJoin.StartJoin) != 0:
			return fmt.Errorf("retry_join and start_join cannot both be defined")
		}
	}

	// if retry_join is defined for the client, ensure that start_join is not
	// set as this configuration is only defined for servers.
	if config.Client != nil && config.Client.ServerJoin != nil {
		// Test the length rather than nil-ness so that an explicitly empty
		// list is treated as "not set", matching the server checks above.
		if len(config.Client.ServerJoin.StartJoin) != 0 {
			return fmt.Errorf("start_join is not supported for Nomad clients")
		}
	}

	return nil
}
// RetryJoin is used to handle retrying a join until it succeeds or all retries
// are exhausted.
func (r *retryJoiner) RetryJoin(config *Config) {
if len(config.Server.RetryJoin) == 0 || !config.Server.Enabled {
func (r *retryJoiner) RetryJoin(serverJoin *ServerJoin) {
if len(serverJoin.RetryJoin) == 0 {
return
}
attempt := 0
addrsToJoin := strings.Join(config.Server.RetryJoin, " ")
addrsToJoin := strings.Join(serverJoin.RetryJoin, " ")
r.logger.Printf("[INFO] agent: Joining cluster... %s", addrsToJoin)
for {
var addrs []string
var n int
var err error
for _, addr := range config.Server.RetryJoin {
for _, addr := range serverJoin.RetryJoin {
switch {
case strings.HasPrefix(addr, "provider="):
servers, err := r.discover.Addrs(addr, r.logger)
......@@ -73,23 +122,33 @@ func (r *retryJoiner) RetryJoin(config *Config) {
}
if len(addrs) > 0 {
n, err := r.join(addrs)
if err == nil {
r.logger.Printf("[INFO] agent: Join completed. Synced with %d initial agents", n)
if r.serverEnabled && r.serverJoin != nil {
n, err = r.serverJoin(addrs)
if err == nil {
r.logger.Printf("[INFO] agent: Join completed. Server synced with %d initial servers", n)
return
}
}
if r.clientEnabled && r.clientJoin != nil {
n, err = r.clientJoin(addrs)
if err == nil {
r.logger.Printf("[INFO] agent: Join completed. Client synced with %d initial servers", n)
return
}
}
}
attempt++
if config.Server.RetryMaxAttempts > 0 && attempt > config.Server.RetryMaxAttempts {
if serverJoin.RetryMaxAttempts > 0 && attempt > serverJoin.RetryMaxAttempts {
r.logger.Printf("[ERR] agent: max join retry exhausted, exiting")
close(r.errCh)
return
}
if err != nil {
r.logger.Printf("[WARN] agent: Join failed: %v, retrying in %v", err,
config.Server.RetryInterval)
r.logger.Printf("[WARN] agent: Join failed: %q, retrying in %v", err,
serverJoin.RetryInterval)
}
time.Sleep(config.Server.retryInterval)
time.Sleep(serverJoin.RetryInterval)
}
}
......@@ -6,9 +6,9 @@ import (
"log"
"os"
"testing"
"time"
"github.com/hashicorp/nomad/testutil"
"github.com/hashicorp/nomad/version"
"github.com/mitchellh/cli"
"github.com/stretchr/testify/require"
)
......@@ -30,43 +30,37 @@ func (m *MockDiscover) Names() []string {
func TestRetryJoin_Integration(t *testing.T) {
t.Parallel()
// Create two agents and have one retry join the other
agent := NewTestAgent(t, t.Name(), nil)
defer agent.Shutdown()
doneCh := make(chan struct{})
shutdownCh := make(chan struct{})
defer func() {
close(shutdownCh)
<-doneCh
}()
agent2 := NewTestAgent(t, t.Name(), func(c *Config) {
c.NodeName = "foo"
if c.Server.ServerJoin == nil {
c.Server.ServerJoin = &ServerJoin{}
}
c.Server.ServerJoin.RetryJoin = []string{agent.Config.normalizedAddrs.Serf}
c.Server.ServerJoin.RetryInterval = 1 * time.Second
})
defer agent2.Shutdown()
// Create a fake command and have it wrap the second agent and run the retry
// join handler
cmd := &Command{
Version: version.GetVersion(),
ShutdownCh: shutdownCh,
Ui: &cli.BasicUi{
Reader: os.Stdin,
Writer: os.Stdout,
ErrorWriter: os.Stderr,
},
agent: agent2.Agent,
}
serfAddr := agent.Config.normalizedAddrs.Serf
args := []string{
"-dev",
"-node", "foo",
"-retry-join", serfAddr,
"-retry-interval", "1s",
if err := cmd.handleRetryJoin(agent2.Config); err != nil {
t.Fatalf("handleRetryJoin failed: %v", err)
}
go func() {
if code := cmd.Run(args); code != 0 {
t.Logf("bad: %d", code)
}
close(doneCh)
}()
// Ensure the retry join occurred.
testutil.WaitForResult(func() (bool, error) {
mem := agent.server.Members()
if len(mem) != 2 {
......@@ -78,16 +72,13 @@ func TestRetryJoin_Integration(t *testing.T) {
})
}
func TestRetryJoin_NonCloud(t *testing.T) {
func TestRetryJoin_Server_NonCloud(t *testing.T) {
t.Parallel()
require := require.New(t)
newConfig := &Config{
Server: &ServerConfig{
RetryMaxAttempts: 1,
RetryJoin: []string{"127.0.0.1"},
Enabled: true,
},
serverJoin := &ServerJoin{
RetryMaxAttempts: 1,
RetryJoin: []string{"127.0.0.1"},
}
var output []string
......@@ -98,28 +89,26 @@ func TestRetryJoin_NonCloud(t *testing.T) {
}
joiner := retryJoiner{
discover: &MockDiscover{},
join: mockJoin,
logger: log.New(ioutil.Discard, "", 0),
errCh: make(chan struct{}),
discover: &MockDiscover{},
serverJoin: mockJoin,
serverEnabled: true,
logger: log.New(ioutil.Discard, "", 0),
errCh: make(chan struct{}),
}
joiner.RetryJoin(newConfig)
joiner.RetryJoin(serverJoin)
require.Equal(1, len(output))
require.Equal(stubAddress, output[0])
}
func TestRetryJoin_Cloud(t *testing.T) {
func TestRetryJoin_Server_Cloud(t *testing.T) {
t.Parallel()
require := require.New(t)
newConfig := &Config{
Server: &ServerConfig{
RetryMaxAttempts: 1,
RetryJoin: []string{"provider=aws, tag_value=foo"},
Enabled: true,
},
serverJoin := &ServerJoin{
RetryMaxAttempts: 1,
RetryJoin: []string{"provider=aws, tag_value=foo"},
}
var output []string
......@@ -131,29 +120,27 @@ func TestRetryJoin_Cloud(t *testing.T) {
mockDiscover := &MockDiscover{}
joiner := retryJoiner{
discover: mockDiscover,
join: mockJoin,
logger: log.New(ioutil.Discard, "", 0),
errCh: make(chan struct{}),
discover: mockDiscover,
serverJoin: mockJoin,
serverEnabled: true,
logger: log.New(ioutil.Discard, "", 0),
errCh: make(chan struct{}),
}
joiner.RetryJoin(newConfig)
joiner.RetryJoin(serverJoin)
require.Equal(1, len(output))
require.Equal("provider=aws, tag_value=foo", mockDiscover.ReceivedAddrs)
require.Equal(stubAddress, output[0])
}
func TestRetryJoin_MixedProvider(t *testing.T) {
func TestRetryJoin_Server_MixedProvider(t *testing.T) {
t.Parallel()
require := require.New(t)
newConfig := &Config{
Server: &ServerConfig{
RetryMaxAttempts: 1,
RetryJoin: []string{"provider=aws, tag_value=foo", "127.0.0.1"},
Enabled: true,
},
serverJoin := &ServerJoin{
RetryMaxAttempts: 1,
RetryJoin: []string{"provider=aws, tag_value=foo", "127.0.0.1"},
}
var output []string
......@@ -165,15 +152,197 @@ func TestRetryJoin_MixedProvider(t *testing.T) {
mockDiscover := &MockDiscover{}
joiner := retryJoiner{
discover: mockDiscover,
join: mockJoin,
logger: log.New(ioutil.Discard, "", 0),
errCh: make(chan struct{}),
discover: mockDiscover,
serverJoin: mockJoin,
serverEnabled: true,
logger: log.New(ioutil.Discard, "", 0),
errCh: make(chan struct{}),
}
joiner.RetryJoin(newConfig)
joiner.RetryJoin(serverJoin)
require.Equal(2, len(output))
require.Equal("provider=aws, tag_value=foo", mockDiscover.ReceivedAddrs)
require.Equal(stubAddress, output[0])
}
// TestRetryJoin_Client exercises retryJoiner.RetryJoin with only the client
// join path configured (clientJoin/clientEnabled set, no server join) and
// checks which addresses are handed to the client join callback.
func TestRetryJoin_Client(t *testing.T) {
	t.Parallel()
	req := require.New(t)

	// received captures the address list passed to the client join callback.
	var received []string
	recordJoin := func(addrs []string) (int, error) {
		received = addrs
		return 0, nil
	}

	rj := retryJoiner{
		discover:      &MockDiscover{},
		clientJoin:    recordJoin,
		clientEnabled: true,
		logger:        log.New(ioutil.Discard, "", 0),
		errCh:         make(chan struct{}),
	}

	// A single attempt against one static address.
	rj.RetryJoin(&ServerJoin{
		RetryMaxAttempts: 1,
		RetryJoin:        []string{"127.0.0.1"},
	})

	// Exactly one address must have reached the callback, equal to stubAddress.
	req.Equal(1, len(received))
	req.Equal(stubAddress, received[0])
}
// TestRetryJoin_Validate is a table-driven test for retryJoiner.Validate. Each
// scenario supplies a Config and states whether Validate is expected to return
// an error for it.
func TestRetryJoin_Validate(t *testing.T) {
t.Parallel()
// validateExpect describes one scenario: the config under test, whether
// Validate should accept it, and a description that doubles as the subtest
// name.
type validateExpect struct {
config *Config
isValid bool
reason string
}
scenarios := []*validateExpect{
// server_join stanza combined with retry_join set directly on the server.
{
config: &Config{
Server: &ServerConfig{
ServerJoin: &ServerJoin{
RetryJoin: []string{"127.0.0.1"},
RetryMaxAttempts: 0,
RetryInterval: 0,
StartJoin: []string{},
},
RetryJoin: []string{"127.0.0.1"},
RetryMaxAttempts: 0,
RetryInterval: 0,
StartJoin: []string{},
},
},
isValid: false,
reason: "server_join cannot be defined if retry_join is defined on the server stanza",
},
// server_join stanza combined with start_join set directly on the server.
{
config: &Config{
Server: &ServerConfig{
ServerJoin: &ServerJoin{
RetryJoin: []string{"127.0.0.1"},
RetryMaxAttempts: 0,
RetryInterval: 0,
StartJoin: []string{},
},
StartJoin: []string{"127.0.0.1"},
RetryMaxAttempts: 0,
RetryInterval: 0,
RetryJoin: []string{},
},
},
isValid: false,
reason: "server_join cannot be defined if start_join is defined on the server stanza",
},
// server_join stanza combined with a non-zero server-level RetryMaxAttempts.
{
config: &Config{
Server: &ServerConfig{
ServerJoin: &ServerJoin{
RetryJoin: []string{"127.0.0.1"},
RetryMaxAttempts: 0,
RetryInterval: 0,
StartJoin: []string{},
},
StartJoin: []string{},
RetryMaxAttempts: 1,
RetryInterval: 0,
RetryJoin: []string{},
},
},
isValid: false,
reason: "server_join cannot be defined if retry_max_attempts is defined on the server stanza",
},
// server_join stanza combined with a non-zero server-level RetryInterval.
{
config: &Config{
Server: &ServerConfig{
ServerJoin: &ServerJoin{
RetryJoin: []string{"127.0.0.1"},
RetryMaxAttempts: 0,
RetryInterval: time.Duration(1),
StartJoin: []string{},
},
StartJoin: []string{},
RetryMaxAttempts: 0,
RetryInterval: 3 * time.Second,
RetryJoin: []string{},
},
},
isValid: false,
reason: "server_join cannot be defined if retry_interval is defined on the server stanza",
},
// Both start_join and retry_join populated inside the same server_join.
{
config: &Config{
Server: &ServerConfig{
ServerJoin: &ServerJoin{
RetryJoin: []string{"127.0.0.1"},
RetryMaxAttempts: 0,
RetryInterval: 0,
StartJoin: []string{"127.0.0.1"},
},
},
},
isValid: false,
reason: "start_join and retry_join should not both be defined",
},
// start_join populated in a client's server_join.
{
config: &Config{
Client: &ClientConfig{
ServerJoin: &ServerJoin{
RetryJoin: []string{},
RetryMaxAttempts: 0,
RetryInterval: 0,
StartJoin: []string{"127.0.0.1"},
},
},
},
isValid: false,
reason: "start_join should not be defined on the client",
},
// Client server_join with only retry_join set.
{
config: &Config{
Client: &ClientConfig{
ServerJoin: &ServerJoin{
RetryJoin: []string{"127.0.0.1"},
RetryMaxAttempts: 0,
RetryInterval: 0,
},
},
},
isValid: true,
reason: "client server_join should be valid",
},
// Server server_join with retry settings and no server-level join fields.
{
config: &Config{
Server: &ServerConfig{
ServerJoin: &ServerJoin{
RetryJoin: []string{"127.0.0.1"},
RetryMaxAttempts: 1,
RetryInterval: 1,
StartJoin: []string{},
},
},
},
isValid: true,
reason: "server server_join should be valid",
},
}
// Validate is invoked on a zero-value retryJoiner; no join functions, logger
// or discoverer are configured for these checks.
joiner := retryJoiner{}
for _, scenario := range scenarios {
// Each scenario runs as its own subtest, named after its reason string.
t.Run(scenario.reason, func(t *testing.T) {
err := joiner.Validate(scenario.config)
if scenario.isValid {
require.NoError(t, err)
} else {
require.Error(t, err)
}
})
}
}
---
layout: "docs"
page_title: "Cloud Auto-join"
sidebar_current: "docs-agent-cloud-auto-join"
description: |-
Nomad supports automatic cluster joining using cloud metadata from various cloud providers
---
# Cloud Auto-joining
As of Nomad 0.8.4,
[`retry_join`](/docs/agent/configuration/server_join.html#retry_join) accepts a
unified interface using the
[go-discover](https://github.com/hashicorp/go-discover) library for doing
automatic cluster joining using cloud metadata. To use retry-join with a
supported cloud provider, specify the configuration on the command line or
configuration file as a `key=value key=value ...` string.
Values are taken literally and must not be URL
encoded. If the values contain spaces, backslashes or double quotes then
they need to be double quoted and the usual escaping rules apply.
```json
{
"retry_join": ["provider=my-cloud config=val config2=\"some other val\" ..."]
}
```
The cloud provider-specific configurations are detailed below. This can be
combined with static IP or DNS addresses or even multiple configurations
for different providers.
In order to use discovery behind a proxy, you will need to set
`HTTP_PROXY`, `HTTPS_PROXY` and `NO_PROXY` environment variables per
[Golang `net/http` library](https://golang.org/pkg/net/http/#ProxyFromEnvironment).
The following sections give the options specific to a subset of supported cloud
providers. For information on all providers, see further documentation in
[go-discover](https://github.com/hashicorp/go-discover).
### Amazon EC2
This returns the first private IP address of all servers in the given
region which have the given `tag_key` and `tag_value`.
```json
{
"retry_join": ["provider=aws tag_key=... tag_value=..."]
}
```
- `provider` (required) - the name of the provider ("aws" in this case).
- `tag_key` (required) - the key of the tag to auto-join on.
- `tag_value` (required) - the value of the tag to auto-join on.
- `region` (optional) - the AWS region to authenticate in.
- `addr_type` (optional) - the type of address to discover: `private_v4`, `public_v4`, `public_v6`. Default is `private_v4`. (>= 1.0)
- `access_key_id` (optional) - the AWS access key for authentication (see below for more information about authenticating).
- `secret_access_key` (optional) - the AWS secret access key for authentication (see below for more information about authenticating).
#### Authentication &amp; Precedence
- Static credentials `access_key_id=... secret_access_key=...`
- Environment variables (`AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`)
- Shared credentials file (`~/.aws/credentials` or the path specified by `AWS_SHARED_CREDENTIALS_FILE`)
- ECS task role metadata (container-specific).
- EC2 instance role metadata.
The only required IAM permission is `ec2:DescribeInstances`, and it is
recommended that you make a dedicated key used only for auto-joining. If the
region is omitted it will be discovered through the local instance's [EC2
metadata
endpoint](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-identity-documents.html).
### Microsoft Azure
This returns the first private IP address of all servers in the given region
which have the given `tag_name` and `tag_value` in the tenant and subscription, or in
the given `resource_group` of a `vm_scale_set` for Virtual Machine Scale Sets.
```json
{
"retry_join": ["provider=azure tag_name=... tag_value=... tenant_id=... client_id=... subscription_id=... secret_access_key=..."]
}
```
- `provider` (required) - the name of the provider ("azure" in this case).
- `tenant_id` (required) - the tenant to join machines in.
- `client_id` (required) - the client to authenticate with.
- `secret_access_key` (required) - the secret client key.
Use these configuration parameters when using tags:
- `tag_name` - the name of the tag to auto-join on.
- `tag_value` - the value of the tag to auto-join on.
Use these configuration parameters when using Virtual Machine Scale Sets (Consul 1.0.3 and later):
- `resource_group` - the name of the resource group to filter on.
- `vm_scale_set` - the name of the virtual machine scale set to filter on.
When using tags the only permission needed is the `ListAll` method for `NetworkInterfaces`. When using
Virtual Machine Scale Sets the only role action needed is `Microsoft.Compute/virtualMachineScaleSets/*/read`.
### Google Compute Engine
This returns the first private IP address of all servers in the given
project which have the given `tag_value`.
```json
{
"retry_join": ["provider=gce project_name=... tag_value=..."]
}
```
- `provider` (required) - the name of the provider ("gce" in this case).
- `tag_value` (required) - the value of the tag to auto-join on.
- `project_name` (optional) - the name of the project to auto-join on. Discovered if not set.
- `zone_pattern` (optional) - the list of zones can be restricted through an RE2 compatible regular expression. If omitted, servers in all zones are returned.
- `credentials_file` (optional) - the credentials file for authentication. See below for more information.
#### Authentication &amp; Precedence
- Use credentials from `credentials_file`, if provided.
- Use JSON file from `GOOGLE_APPLICATION_CREDENTIALS` environment variable.
- Use JSON file in a location known to the gcloud command-line tool.
- On Windows, this is `%APPDATA%/gcloud/application_default_credentials.json`.
- On other systems, `$HOME/.config/gcloud/application_default_credentials.json`.
- On Google Compute Engine, use credentials from the metadata
server. In this final case any provided scopes are ignored.
Discovery requires a [GCE Service
Account](https://cloud.google.com/compute/docs/access/service-accounts).
Credentials are searched using the paths listed above, in order of precedence.
......@@ -90,6 +90,12 @@ client {
receive work. This may be specified as an IP address or DNS, with or without
the port. If the port is omitted, the default port of `4647` is used.
- `server_join` <code>([server_join][server-join]: nil)</code> - Specifies
how the Nomad client will connect to Nomad servers. The `start_join` field
is not supported on the client. The `retry_join` fields may directly specify
the server address or use go-discover syntax for auto-discovery. See the
[server_join documentation][server-join] for more detail.
- `state_dir` `(string: "[data_dir]/client")` - Specifies the directory to use
to store client state. By default, this is the top-level
[data_dir](/docs/agent/configuration/index.html#data_dir) suffixed with
......@@ -307,7 +313,11 @@ cluster.
```hcl
client {
enabled = true
servers = ["1.2.3.4:4647", "5.6.7.8:4647"]
server_join {
retry_join = [ "1.1.1.1", "2.2.2.2" ]
retry_max = 3
retry_interval = "15s"
}
}
```
......@@ -346,3 +356,4 @@ client {
}
}
```
[server-join]: /docs/agent/configuration/server_join.html "Server Join"
......@@ -28,7 +28,11 @@ join failures, and more.
server {
enabled = true
bootstrap_expect = 3
retry_join = ["1.2.3.4", "5.6.7.8"]
server_join {
retry_join = [ "1.1.1.1", "2.2.2.2" ]
retry_max = 3
retry_interval = "15s"
}
}
```
......@@ -102,9 +106,9 @@ server {
second is a tradeoff as it lowers failure detection time of nodes at the
tradeoff of false positives and increased load on the leader.
- `non_voting_server` `(bool: false)` - (Enterprise-only) Specifies whether
this server will act as a non-voting member of the cluster to help provide
read scalability.
- `non_voting_server` `(bool: false)` - (Enterprise-only) Specifies whether
this server will act as a non-voting member of the cluster to help provide
read scalability.
- `num_schedulers` `(int: [num-cores])` - Specifies the number of parallel
scheduler threads to run. This can be as many as one per core, or `0` to
......@@ -131,6 +135,17 @@ server {
cluster again when starting. This flag allows the previous state to be used to
rejoin the cluster.
- `server_join` <code>([server_join][server-join]: nil)</code> - Specifies
how the Nomad server will connect to other Nomad servers. The `retry_join`
fields may directly specify the server address or use go-discover syntax for
auto-discovery. See the [server_join documentation][server-join] for more detail.
- `upgrade_version` `(string: "")` - A custom version of the format X.Y.Z to use
in place of the Nomad version when custom upgrades are enabled in Autopilot.
For more information, see the [Autopilot Guide](/guides/cluster/autopilot.html).
### Deprecated Parameters
- `retry_join` `(array<string>: [])` - Specifies a list of server addresses to
retry joining if the first attempt fails. This is similar to
[`start_join`](#start_join), but only invokes if the initial join attempt
......@@ -138,63 +153,25 @@ server {
succeeds. After one succeeds, no further addresses will be contacted. This is
useful for cases where we know the address will become available eventually.
Use `retry_join` with an array as a replacement for `start_join`, **do not use
both options**. See the [server address format](#server-address-format)
section for more information on the format of the string.
both options**. See the [server_join][server-join]
section for more information on the format of the string. This field is
deprecated in favor of the [server_join stanza][server-join].
- `retry_interval` `(string: "30s")` - Specifies the time to wait between retry
join attempts.
join attempts. This field is deprecated in favor of the [server_join
stanza][server-join].
- `retry_max` `(int: 0)` - Specifies the maximum number of join attempts to be
made before exiting with a return code of 1. By default, this is set to 0
which is interpreted as infinite retries.
which is interpreted as infinite retries. This field is deprecated in favor of
the [server_join stanza][server-join].
- `start_join` `(array<string>: [])` - Specifies a list of server addresses to
join on startup. If Nomad is unable to join with any of the specified
addresses, agent startup will fail. See the
[server address format](#server-address-format) section for more information
on the format of the string.
- `upgrade_version` `(string: "")` - A custom version of the format X.Y.Z to use
in place of the Nomad version when custom upgrades are enabled in Autopilot.
For more information, see the [Autopilot Guide](/guides/cluster/autopilot.html).
### Server Address Format
This section describes the acceptable syntax and format for describing the
location of a Nomad server. There are many ways to reference a Nomad server,
including directly by IP address and resolving through DNS.
#### Directly via IP Address
It is possible to address another Nomad server using its IP address. This is
done in the `ip:port` format, such as:
```
1.2.3.4:5678
```
If the port option is omitted, it defaults to the Serf port, which is 4648
unless configured otherwise:
```
1.2.3.4 => 1.2.3.4:4648
```
#### Via Domains or DNS
It is possible to address another Nomad server using its DNS address. This is
done in the `address:port` format, such as:
```
nomad-01.company.local:5678
```
If the port option is omitted, it defaults to the Serf port, which is 4648
unless configured otherwise:
```
nomad-01.company.local => nomad-01.company.local:4648
```
addresses, agent startup will fail. See the [server address
format](/docs/agent/configuration/server_join.html#server-address-format)
section for more information on the format of the string. This field is
deprecated in favor of the [server_join stanza][server-join].
## `server` Examples
......@@ -242,3 +219,4 @@ server {
```
[encryption]: /docs/agent/encryption.html "Nomad Agent Encryption"
[server-join]: /docs/agent/configuration/server_join.html "Server Join"
---
layout: "docs"
page_title: "server_join Stanza - Agent Configuration"
sidebar_current: "docs-agent-configuration--server-join"
description: |-
The "server_join" stanza specifies how the Nomad agent will discover and connect to Nomad servers.
---
# `server_join` Stanza
<table class="table table-bordered table-striped">
<tr>
<th width="120">Placement</th>
<td>
<code>server -> **server_join**</code>
<br>
<code>client -> **server_join**</code>
</td>
</tr>
</table>
The `server_join` stanza specifies how the Nomad agent will discover and connect
to Nomad servers.
```hcl
server_join {
retry_join = [ "1.1.1.1", "2.2.2.2" ]
retry_max = 3
retry_interval = "15s"
}
```
## `server_join` Parameters
- `retry_join` `(array<string>: [])` - Specifies a list of server addresses to
join. This is similar to [`start_join`](#start_join), but will continue to
be attempted even if the initial join attempt fails, up to
[retry_max](#retry_max). Further, `retry_join` is available to
both Nomad servers and clients, while `start_join` is only defined for Nomad
servers. This is useful for cases where we know the address will become
available eventually. Use `retry_join` with an array as a replacement for
`start_join`, **do not use both options**.
Address format includes both using IP addresses as well as an interface to the
[go-discover](https://github.com/hashicorp/go-discover) library for doing
automated cluster joining using cloud metadata. See [Cloud
Auto-join][cloud_auto_join] for more information.
```
server_join {
retry_join = [ "1.1.1.1", "2.2.2.2" ]
}
```
Using the `go-discover` interface, this can be defined in either a client or a
server configuration, as well as provided as a command-line argument.
```
server_join {
retry_join = [ "provider=aws tag_key=..." ]
}
```
See the [server address format](#server-address-format) for more information
about expected server address formats.
- `retry_interval` `(string: "30s")` - Specifies the time to wait between retry
join attempts.
- `retry_max` `(int: 0)` - Specifies the maximum number of join attempts to be
made before exiting with a return code of 1. By default, this is set to 0
which is interpreted as infinite retries.
- `start_join` `(array<string>: [])` - Specifies a list of server addresses to
join on startup. If Nomad is unable to join with any of the specified
addresses, agent startup will fail. See the
[server address format](#server-address-format) section for more information
on the format of the string. This field is defined only for Nomad servers and
will result in a configuration parse error if included in a client
configuration.
## Server Address Format
This section describes the acceptable syntax and format for describing the
location of a Nomad server. There are many ways to reference a Nomad server,
including directly by IP address and resolving through DNS.
### Directly via IP Address
It is possible to address another Nomad server using its IP address. This is
done in the `ip:port` format, such as:
```
1.2.3.4:5678
```
If the port option is omitted, it defaults to the Serf port, which is 4648
unless configured otherwise:
```
1.2.3.4 => 1.2.3.4:4648
```
### Via Domains or DNS
It is possible to address another Nomad server using its DNS address. This is
done in the `address:port` format, such as:
```
nomad-01.company.local:5678
```
If the port option is omitted, it defaults to the Serf port, which is 4648
unless configured otherwise:
```
nomad-01.company.local => nomad-01.company.local:4648
```
### Via the go-discover interface
As of Nomad 0.8.4, `retry_join` accepts a unified interface using the
[go-discover](https://github.com/hashicorp/go-discover) library for doing
automated cluster joining using cloud metadata. See [Cloud
Auto-join][cloud_auto_join] for more information.
```
"provider=aws tag_key=..." => 1.2.3.4:4648
```
[cloud_auto_join]: /docs/agent/cloud_auto_join.html "Nomad Cloud Auto-join"
......@@ -71,7 +71,15 @@ via CLI arguments. The `agent` command accepts the following arguments:
* `-region=<region>`: Equivalent to the [region](#region) config option.
* `-rejoin`: Equivalent to the [rejoin_after_leave](#rejoin_after_leave) config option.
* `-retry-interval`: Equivalent to the [retry_interval](#retry_interval) config option.
* `-retry-join`: Similar to `-join` but allows retrying a join if the first attempt fails.
* `-retry-join`: Similar to `-join` but allows retrying a join if the first attempt fails.
```sh
$ nomad agent -retry-join "127.0.0.1:4648"
```
`retry-join` can be defined as a command line flag only for servers. Clients
can configure `retry-join` only in configuration files.
* `-retry-max`: Similar to the [retry_max](#retry_max) config option.
* `-server`: Enable server mode on the local agent.
* `-servers=<host:port>`: Equivalent to the Client [servers](#servers) config
......
......@@ -31,7 +31,9 @@ server {
bootstrap_expect = 3
# This is the IP address of the first server we provisioned
retry_join = ["<known-address>:4648"]
server_join {
retry_join = ["<known-address>:4648"]
}
}
```
......
......@@ -396,6 +396,9 @@
<a href="/docs/agent/index.html">Nomad Agent</a>
<ul class="nav">
<li <%= sidebar_current("docs-agent-cloud-auto-join") %>>
<a href="/docs/agent/cloud_auto_join.html">Cloud Auto-join</a>
</li>
<li<%= sidebar_current("docs-agent-configuration") %>>
<a href="/docs/agent/configuration/index.html">Configuration</a>
<ul class="nav">
......@@ -417,6 +420,9 @@
<li <%= sidebar_current("docs-agent-configuration-server") %>>
<a href="/docs/agent/configuration/server.html">server</a>
</li>
<li <%= sidebar_current("docs-agent-configuration--server-join") %>>
<a href="/docs/agent/configuration/server_join.html">server_join</a>
</li>
<li <%= sidebar_current("docs-agent-configuration-telemetry") %>>
<a href="/docs/agent/configuration/telemetry.html">telemetry</a>
</li>
......@@ -428,6 +434,7 @@
</li>
</ul>
</li>
<li<%= sidebar_current("docs-agent-encryption") %>>
<a href="/docs/agent/encryption.html">Encryption</a>
</li>
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment