Unverified Commit 820309ed authored by Tim Gross's avatar Tim Gross Committed by GitHub
Browse files

e2e: run client/allocs metrics nightly tests vs Windows (#6850)

Adds Windows targets to the client/allocs metrics tests. Removes the
`allocstats` test, which covers less than these tests and is now
redundant.

Adds a firewall rule to our Windows instances so that the prometheus
server can scrape the Nomad HTTP API for metrics.
parent 7700d384
Showing with 68 additions and 141 deletions
+68 -141
package allocstats
import (
"fmt"
"time"
"github.com/hashicorp/nomad/api"
"github.com/hashicorp/nomad/e2e/e2eutil"
"github.com/hashicorp/nomad/e2e/framework"
"github.com/hashicorp/nomad/helper/uuid"
"github.com/hashicorp/nomad/testutil"
"github.com/stretchr/testify/require"
)
// BasicAllocStatsTest is the e2e test case for allocation resource stats.
// It embeds framework.TC and keeps track of the IDs of the jobs registered
// by its tests so that AfterEach can deregister them.
type BasicAllocStatsTest struct {
framework.TC
// jobIds holds the IDs of jobs registered by the tests in this case.
jobIds []string
}
// init registers the "AllocationStats" suite, containing the single
// BasicAllocStatsTest case, with the e2e framework.
func init() {
	suite := &framework.TestSuite{
		Component:   "AllocationStats",
		CanRunLocal: true,
		Cases:       []framework.TestCase{&BasicAllocStatsTest{}},
	}
	framework.AddSuites(suite)
}
// BeforeAll waits for the cluster to have a leader and for at least one
// client node to be in the ready state before any test in this case runs.
func (tc *BasicAllocStatsTest) BeforeAll(f *framework.F) {
	t := f.T()

	// The cluster needs a leader before the tests start.
	e2eutil.WaitForLeader(t, tc.Nomad())

	// At least one client node has to be ready.
	e2eutil.WaitForNodesReady(t, tc.Nomad(), 1)
}
// TestResourceStats is an end-to-end test of allocation resource utilization
// stats on Linux clients. It runs a raw exec job and is skipped when the
// cluster has no Linux client nodes.
// TODO(preetha) - add more test cases with more realistic resource utilization
func (tc *BasicAllocStatsTest) TestResourceStats(f *framework.F) {
	t := f.T()
	client := tc.Nomad()

	linuxNodes, err := e2eutil.ListLinuxClientNodes(client)
	switch {
	case err != nil:
		t.Fatalf("could not list client nodes: %v", err)
	case len(linuxNodes) == 0:
		t.Skip("no Linux clients")
	}

	// Job IDs get a random 8-character suffix.
	jobID := "allocstats" + uuid.Generate()[0:8]
	tc.jobIds = append(tc.jobIds, jobID)

	runResourceStatsTest(f, client, jobID, "allocstats/input/raw_exec.nomad")
}
// TestResourceStatsWindows is an end-to-end test of allocation resource
// utilization stats on Windows clients. It runs a raw exec job and is
// skipped when the cluster has no Windows client nodes.
func (tc *BasicAllocStatsTest) TestResourceStatsWindows(f *framework.F) {
	t := f.T()
	client := tc.Nomad()

	windowsNodes, err := e2eutil.ListWindowsClientNodes(client)
	switch {
	case err != nil:
		t.Fatalf("could not list client nodes: %v", err)
	case len(windowsNodes) == 0:
		t.Skip("no Windows clients")
	}

	// Job IDs get a random 8-character suffix.
	jobID := "allocstats_windows" + uuid.Generate()[0:8]
	tc.jobIds = append(tc.jobIds, jobID)

	runResourceStatsTest(f, client, jobID, "allocstats/input/raw_exec_windows.nomad")
}
// runResourceStatsTest registers the job in jobSpec under jobID, requires
// that exactly one allocation was created, waits for that allocation to be
// running, and then polls the allocation stats API until it reports non-zero
// CPU (total ticks and percent) and memory (RSS) usage.
//
// The poll is retried up to 500 times with a 100ms pause before each attempt;
// if the stats never become valid the test is failed with the last error.
func runResourceStatsTest(f *framework.F, nomadClient *api.Client, jobID, jobSpec string) {
	t := f.T()

	allocs := e2eutil.RegisterAndWaitForAllocs(t, nomadClient, jobSpec, jobID)
	require.Len(t, allocs, 1)

	// Wait till alloc is running
	allocID := allocs[0].ID
	e2eutil.WaitForAllocRunning(t, nomadClient, allocID)

	allocsClient := nomadClient.Allocations()

	// Verify allocation resource stats
	// This job file should result in non zero CPU and Memory stats
	testutil.WaitForResultRetries(500, func() (bool, error) {
		time.Sleep(100 * time.Millisecond)

		allocStatsResp, err := allocsClient.Stats(&api.Allocation{ID: allocID}, nil)
		if err != nil {
			return false, fmt.Errorf("unexpected error getting alloc stats: %v", err)
		}

		// The usage structs are pointers; treat a missing one as "not ready
		// yet" and retry instead of panicking on a nil dereference.
		if allocStatsResp == nil || allocStatsResp.ResourceUsage == nil {
			return false, fmt.Errorf("expected non zero resource usage, but response had no resource usage")
		}
		cpuStats := allocStatsResp.ResourceUsage.CpuStats
		memStats := allocStatsResp.ResourceUsage.MemoryStats
		if cpuStats == nil || memStats == nil {
			return false, fmt.Errorf("expected non zero resource usage, but cpu or memory stats were missing")
		}

		if cpuStats.TotalTicks > 0 && cpuStats.Percent > 0 && memStats.RSS > 0 {
			return true, nil
		}

		// Print the dereferenced structs so the message shows the actual
		// values rather than pointer addresses.
		return false, fmt.Errorf("expected non zero resource usage, but was: cpu=%+v memory=%+v",
			*cpuStats, *memStats)
	}, func(err error) {
		t.Fatalf("invalid resource usage : %v", err)
	})
}
// AfterEach cleans up after a test: it deregisters every job ID recorded in
// tc.jobIds, clears that list, and asks the cluster to garbage collect.
//
// Cleanup is best-effort. Failures are logged on the test rather than failing
// it, so a cleanup problem does not mask the result of the test itself.
func (tc *BasicAllocStatsTest) AfterEach(f *framework.F) {
	nomadClient := tc.Nomad()
	jobs := nomadClient.Jobs()

	// Stop all jobs in test
	for _, id := range tc.jobIds {
		if _, _, err := jobs.Deregister(id, true, nil); err != nil {
			f.T().Logf("could not deregister job %q: %v", id, err)
		}
	}
	// Forget the jobs we just removed so the cleanup after the next test
	// does not try to deregister them a second time.
	tc.jobIds = nil

	// Garbage collect
	if err := nomadClient.System().GarbageCollect(); err != nil {
		f.T().Logf("could not garbage collect: %v", err)
	}
}
......@@ -4,7 +4,6 @@ import (
"testing"
_ "github.com/hashicorp/nomad/e2e/affinities"
_ "github.com/hashicorp/nomad/e2e/allocstats"
_ "github.com/hashicorp/nomad/e2e/clientstate"
_ "github.com/hashicorp/nomad/e2e/connect"
_ "github.com/hashicorp/nomad/e2e/consul"
......
job "test_raw_windows" {
job "factorial_windows" {
datacenters = ["dc1"]
type = "service"
......
job "test_raw" {
job "mem_windows" {
datacenters = ["dc1"]
type = "service"
constraint {
attribute = "${attr.kernel.name}"
value = "linux"
value = "windows"
}
group "test" {
......@@ -13,9 +13,18 @@ job "test_raw" {
task "test1" {
driver = "raw_exec"
template {
data = <<EOH
$mem_stress = @()
for ($i = 0; $i -lt ###; $i++) { $mem_stress += ("a" * 200MB) }
EOH
destination = "local/memtest.ps1"
}
config {
command = "bash"
args = ["-c", "var=10000;while true; do a=$(awk -v x=$var 'BEGIN{print sqrt(x)}'); ((var++)); done"]
command = "powershell"
args = ["local/memtest.ps1"]
}
}
}
......
......@@ -89,7 +89,28 @@ func (tc *MetricsTest) TestMetricsLinux(f *framework.F) {
tc.queryAllocMetrics(t, workloads)
}
// Run workloads from and wait for allocations
// TestMetricsWindows runs a collection of jobs that exercise alloc metrics.
// Then we query prometheus to verify we're collecting client and alloc metrics
// and correctly presenting them to the prometheus scraper.
//
// The test is skipped when the cluster has no Windows client nodes.
func (tc *MetricsTest) TestMetricsWindows(f *framework.F) {
	t := f.T()

	clientNodes, err := e2eutil.ListWindowsClientNodes(tc.Nomad())
	// NoError reports the error's message on failure, unlike a bare Nil check.
	require.NoError(t, err)
	if len(clientNodes) == 0 {
		t.Skip("no Windows clients")
	}

	// Maps each workload's job name to the alloc metric it is paired with.
	workloads := map[string]string{
		"factorial_windows": "nomad_client_allocs_cpu_user",
		"mem_windows":       "nomad_client_allocs_memory_rss",
	}

	tc.runWorkloads(t, workloads)
	tc.queryClientMetrics(t, clientNodes)
	tc.queryAllocMetrics(t, workloads)
}
// run workloads and wait for allocations
func (tc *MetricsTest) runWorkloads(t *testing.T, workloads map[string]string) {
for jobName := range workloads {
uuid := uuid.Generate()
......@@ -125,14 +146,19 @@ func (tc *MetricsTest) queryClientMetrics(t *testing.T, clientNodes []string) {
if err != nil {
return false
}
instances := make(map[model.LabelValue]struct{})
instances := make(map[string]struct{})
for _, result := range results {
instances[result.Metric["instance"]] = struct{}{}
instances[string(result.Metric["node_id"])] = struct{}{}
}
if len(instances) != len(clientNodes) {
err = fmt.Errorf("expected metric '%s' for all clients. got:\n%v",
metric, results)
return false
// we're testing only clients for a specific OS, so we
// want to make sure we're checking for specific node_ids
// and not just equal lengths
for _, clientNode := range clientNodes {
if _, ok := instances[clientNode]; !ok {
err = fmt.Errorf("expected metric '%s' for all clients. got:\n%v",
metric, results)
return false
}
}
return true
}, timeout, 1*time.Second)
......@@ -176,17 +202,3 @@ func (tc *MetricsTest) queryAllocMetrics(t *testing.T, workloads map[string]stri
timeout = 10 * time.Second
}
}
// TestMetricsWindows runs a collection of jobs that exercise alloc metrics.
// Then we query prometheus to verify we're collecting client and alloc metrics
// and correctly presenting them to the prometheus scraper.
//
// In this version the test only lists the Windows client nodes and skips
// when there are none; no workloads are run yet (see the TODO below).
func (tc *MetricsTest) TestMetricsWindows(f *framework.F) {
	t := f.T()

	clientNodes, err := e2eutil.ListWindowsClientNodes(tc.Nomad())
	// NoError reports the error's message on failure, unlike a bare Nil check.
	require.NoError(t, err)
	if len(clientNodes) == 0 {
		t.Skip("no Windows clients")
	}
	// TODO(tgross): run metrics on Windows, too
}
......@@ -44,6 +44,9 @@ md C:\tmp\data
# Invoke-WebRequest -Uri "$cni_url" -Outfile cni.tgz
# Expand-7Zip -ArchiveFileName .\cni.tgz -TargetPath C:\opt\cni\bin\
# Allow inbound TCP traffic to the Nomad HTTP API (port 4646); needed so the
# prometheus server can scrape metrics from the client
New-NetFirewallRule -DisplayName 'Nomad HTTP Inbound' -Profile @('Public', 'Domain', 'Private') -Direction Inbound -Action Allow -Protocol TCP -LocalPort @('4646')
# enable as a service
sc.exe create "Nomad" binPath= "C:\opt\nomad.exe agent -config C:\opt\nomad.d" start= auto
sc.exe start "Nomad"
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment