Unverified commit d56ad039 authored by Alex Dadgar, committed by GitHub

Merge pull request #5231 from hashicorp/d-devices

Document devices
parents f059e9a9 6f22fc02
Showing with 731 additions and 48 deletions
@@ -18,4 +18,4 @@ config {
The valid configuration options are:
* `ignored_gpu_ids` (`list(string)`: `[]`): list of GPU UUID strings that should not be exposed to Nomad
* `fingerprint_period` (`string`: `"5s"`): The interval to repeat fingerprint process to identify possible changes.
* `fingerprint_period` (`string`: `"1m"`): interval to repeat the fingerprint process to identify possible changes.
@@ -9,6 +9,7 @@ import (
log "github.com/hashicorp/go-hclog"
"github.com/hashicorp/nomad/devices/gpu/nvidia/nvml"
"github.com/hashicorp/nomad/helper/pluginutils/loader"
"github.com/hashicorp/nomad/plugins/base"
"github.com/hashicorp/nomad/plugins/device"
"github.com/hashicorp/nomad/plugins/shared/hclspec"
@@ -35,6 +36,19 @@ const (
)
var (
// PluginID is the nvidia plugin metadata registered in the plugin
// catalog.
PluginID = loader.PluginID{
Name: pluginName,
PluginType: base.PluginTypeDevice,
}
// PluginConfig is the nvidia factory function registered in the
// plugin catalog.
PluginConfig = &loader.InternalPluginConfig{
Factory: func(l log.Logger) interface{} { return NewNvidiaDevice(l) },
}
// pluginInfo describes the plugin
pluginInfo = &base.PluginInfoResponse{
Type: base.PluginTypeDevice,
......
@@ -135,7 +135,7 @@ func (d *Driver) buildFingerprint() *drivers.Fingerprint {
}
sort.Strings(runtimeNames)
fp.Attributes["runtimes"] = pstructs.NewStringAttribute(
fp.Attributes["driver.docker.runtimes"] = pstructs.NewStringAttribute(
strings.Join(runtimeNames, ","))
}
......
package catalog
import "github.com/hashicorp/nomad/drivers/rkt"
import (
"github.com/hashicorp/nomad/devices/gpu/nvidia"
"github.com/hashicorp/nomad/drivers/rkt"
)
// This file is where all builtin plugins should be registered in the catalog.
// Plugins with build restrictions should be placed in the appropriate
// register_XXX.go file.
func init() {
RegisterDeferredConfig(rkt.PluginID, rkt.PluginConfig, rkt.PluginLoader)
Register(nvidia.PluginID, nvidia.PluginConfig)
}
@@ -235,14 +235,14 @@ func TestParse(t *testing.T) {
Count: helper.Uint64ToPtr(10),
Constraints: []*api.Constraint{
{
LTarget: "${driver.attr.memory}",
LTarget: "${device.attr.memory}",
RTarget: "2GB",
Operand: ">",
},
},
Affinities: []*api.Affinity{
{
LTarget: "${driver.model}",
LTarget: "${device.model}",
RTarget: "1080ti",
Operand: "=",
Weight: 50,
......
@@ -200,13 +200,13 @@ job "binstore-storagelocker" {
device "nvidia/gpu" {
count = 10
constraint {
attribute = "${driver.attr.memory}"
attribute = "${device.attr.memory}"
value = "2GB"
operator = ">"
}
affinity {
attribute = "${driver.model}"
attribute = "${device.model}"
value = "1080ti"
weight = 50
}
......
@@ -161,7 +161,7 @@ func TestDeviceAllocator_Allocate_Constraints(t *testing.T) {
Name: "gpu",
Constraints: []*structs.Constraint{
{
LTarget: "${driver.attr.cuda_cores}",
LTarget: "${device.attr.cuda_cores}",
Operand: ">",
RTarget: "4000",
},
@@ -172,7 +172,7 @@ func TestDeviceAllocator_Allocate_Constraints(t *testing.T) {
Name: "gpu",
Constraints: []*structs.Constraint{
{
LTarget: "${driver.attr.cuda_cores}",
LTarget: "${device.attr.cuda_cores}",
Operand: "<",
RTarget: "4000",
},
@@ -184,17 +184,17 @@ func TestDeviceAllocator_Allocate_Constraints(t *testing.T) {
Constraints: []*structs.Constraint{
// First two are shared across both devices
{
LTarget: "${driver.attr.memory_bandwidth}",
LTarget: "${device.attr.memory_bandwidth}",
Operand: ">",
RTarget: "10 GB/s",
},
{
LTarget: "${driver.attr.memory}",
LTarget: "${device.attr.memory}",
Operand: "is",
RTarget: "11264 MiB",
},
{
LTarget: "${driver.attr.graphics_clock}",
LTarget: "${device.attr.graphics_clock}",
Operand: ">",
RTarget: "1.4 GHz",
},
@@ -209,18 +209,18 @@ func TestDeviceAllocator_Allocate_Constraints(t *testing.T) {
Name: "nvidia/gpu",
Constraints: []*structs.Constraint{
{
LTarget: "${driver.attr.memory_bandwidth}",
LTarget: "${device.attr.memory_bandwidth}",
Operand: ">",
RTarget: "10 GB/s",
},
{
LTarget: "${driver.attr.memory}",
LTarget: "${device.attr.memory}",
Operand: "is",
RTarget: "11264 MiB",
},
// Rules both out
{
LTarget: "${driver.attr.graphics_clock}",
LTarget: "${device.attr.graphics_clock}",
Operand: ">",
RTarget: "2.4 GHz",
},
@@ -271,7 +271,7 @@ func TestDeviceAllocator_Allocate_Affinities(t *testing.T) {
Name: "gpu",
Affinities: []*structs.Affinity{
{
LTarget: "${driver.attr.cuda_cores}",
LTarget: "${device.attr.cuda_cores}",
Operand: ">",
RTarget: "4000",
Weight: 0.6,
@@ -283,7 +283,7 @@ func TestDeviceAllocator_Allocate_Affinities(t *testing.T) {
Name: "gpu",
Affinities: []*structs.Affinity{
{
LTarget: "${driver.attr.cuda_cores}",
LTarget: "${device.attr.cuda_cores}",
Operand: "<",
RTarget: "4000",
Weight: 0.1,
@@ -295,7 +295,7 @@ func TestDeviceAllocator_Allocate_Affinities(t *testing.T) {
Name: "gpu",
Affinities: []*structs.Affinity{
{
LTarget: "${driver.attr.cuda_cores}",
LTarget: "${device.attr.cuda_cores}",
Operand: ">",
RTarget: "4000",
Weight: -0.2,
@@ -309,19 +309,19 @@ func TestDeviceAllocator_Allocate_Affinities(t *testing.T) {
Affinities: []*structs.Affinity{
// First two are shared across both devices
{
LTarget: "${driver.attr.memory_bandwidth}",
LTarget: "${device.attr.memory_bandwidth}",
Operand: ">",
RTarget: "10 GB/s",
Weight: 0.2,
},
{
LTarget: "${driver.attr.memory}",
LTarget: "${device.attr.memory}",
Operand: "is",
RTarget: "11264 MiB",
Weight: 0.2,
},
{
LTarget: "${driver.attr.graphics_clock}",
LTarget: "${device.attr.graphics_clock}",
Operand: ">",
RTarget: "1.4 GHz",
Weight: 0.9,
......
@@ -953,17 +953,17 @@ func resolveDeviceTarget(target string, d *structs.NodeDeviceResource) (*psstruc
// Handle the interpolations
switch {
case "${driver.model}" == target:
case "${device.model}" == target:
return psstructs.NewStringAttribute(d.Name), true
case "${driver.vendor}" == target:
case "${device.vendor}" == target:
return psstructs.NewStringAttribute(d.Vendor), true
case "${driver.type}" == target:
case "${device.type}" == target:
return psstructs.NewStringAttribute(d.Type), true
case strings.HasPrefix(target, "${driver.attr."):
attr := strings.TrimPrefix(target, "${driver.attr.")
case strings.HasPrefix(target, "${device.attr."):
attr := strings.TrimPrefix(target, "${device.attr.")
attr = strings.TrimSuffix(attr, "}")
val, ok := d.Attributes[attr]
return val, ok
......
@@ -1862,22 +1862,22 @@ func TestDeviceChecker(t *testing.T) {
Constraints: []*structs.Constraint{
{
Operand: "=",
LTarget: "${driver.model}",
LTarget: "${device.model}",
RTarget: "1080ti",
},
{
Operand: ">",
LTarget: "${driver.attr.memory}",
LTarget: "${device.attr.memory}",
RTarget: "1320.5 MB",
},
{
Operand: "<=",
LTarget: "${driver.attr.pci_bandwidth}",
LTarget: "${device.attr.pci_bandwidth}",
RTarget: ".98 GiB/s",
},
{
Operand: "=",
LTarget: "${driver.attr.cores_clock}",
LTarget: "${device.attr.cores_clock}",
RTarget: "800MHz",
},
},
@@ -1895,22 +1895,22 @@ func TestDeviceChecker(t *testing.T) {
Constraints: []*structs.Constraint{
{
Operand: "=",
LTarget: "${driver.model}",
LTarget: "${device.model}",
RTarget: "1080ti",
},
{
Operand: ">",
LTarget: "${driver.attr.memory}",
LTarget: "${device.attr.memory}",
RTarget: "1320.5 MB",
},
{
Operand: "<=",
LTarget: "${driver.attr.pci_bandwidth}",
LTarget: "${device.attr.pci_bandwidth}",
RTarget: ".98 GiB/s",
},
{
Operand: "=",
LTarget: "${driver.attr.cores_clock}",
LTarget: "${device.attr.cores_clock}",
RTarget: "800MHz",
},
},
@@ -1928,22 +1928,22 @@ func TestDeviceChecker(t *testing.T) {
Constraints: []*structs.Constraint{
{
Operand: "=",
LTarget: "${driver.model}",
LTarget: "${device.model}",
RTarget: "1080ti",
},
{
Operand: ">",
LTarget: "${driver.attr.memory}",
LTarget: "${device.attr.memory}",
RTarget: "1320.5 MB",
},
{
Operand: "<=",
LTarget: "${driver.attr.pci_bandwidth}",
LTarget: "${device.attr.pci_bandwidth}",
RTarget: ".98 GiB/s",
},
{
Operand: "=",
LTarget: "${driver.attr.cores_clock}",
LTarget: "${device.attr.cores_clock}",
RTarget: "800MHz",
},
},
@@ -1961,22 +1961,22 @@ func TestDeviceChecker(t *testing.T) {
Constraints: []*structs.Constraint{
{
Operand: "=",
LTarget: "${driver.model}",
LTarget: "${device.model}",
RTarget: "2080ti",
},
{
Operand: ">",
LTarget: "${driver.attr.memory}",
LTarget: "${device.attr.memory}",
RTarget: "1320.5 MB",
},
{
Operand: "<=",
LTarget: "${driver.attr.pci_bandwidth}",
LTarget: "${device.attr.pci_bandwidth}",
RTarget: ".98 GiB/s",
},
{
Operand: "=",
LTarget: "${driver.attr.cores_clock}",
LTarget: "${device.attr.cores_clock}",
RTarget: "800MHz",
},
},
@@ -1994,22 +1994,22 @@ func TestDeviceChecker(t *testing.T) {
Constraints: []*structs.Constraint{
{
Operand: "=",
LTarget: "${driver.model}",
LTarget: "${device.model}",
RTarget: "1080ti",
},
{
Operand: "<",
LTarget: "${driver.attr.memory}",
LTarget: "${device.attr.memory}",
RTarget: "1320.5 MB",
},
{
Operand: "<=",
LTarget: "${driver.attr.pci_bandwidth}",
LTarget: "${device.attr.pci_bandwidth}",
RTarget: ".98 GiB/s",
},
{
Operand: "=",
LTarget: "${driver.attr.cores_clock}",
LTarget: "${device.attr.cores_clock}",
RTarget: "800MHz",
},
},
......
@@ -584,7 +584,7 @@ func TestBinPackIterator_Devices(t *testing.T) {
Count: 1,
Affinities: []*structs.Affinity{
{
LTarget: "${driver.attr.graphics_clock}",
LTarget: "${device.attr.graphics_clock}",
Operand: ">",
RTarget: "1.4 GHz",
Weight: 0.9,
......
@@ -525,6 +525,8 @@ The `Resources` object supports the following keys:
- `Networks` - A list of network objects.
- `Devices` - A list of device objects.
The Network object supports the following keys:
- `MBits` - The number of MBits in bandwidth required.
@@ -538,6 +540,30 @@ ports. A network object allows the user to specify a list of `DynamicPorts` and
- `Label` - The label to annotate a port so that it can be referred in the
service discovery block or environment variables.
The Device object supports the following keys:
- `Name` - Specifies the device required. The following inputs are valid:
* `<device_type>`: If a single value is given, it is assumed to be the device
type, such as "gpu" or "fpga".
* `<vendor>/<device_type>`: If two values are given separated by a `/`, the
given device type will be selected, constraining on the provided vendor.
Examples include "nvidia/gpu" or "amd/gpu".
* `<vendor>/<device_type>/<model>`: If three values are given separated by a `/`, the
given device type will be selected, constraining on the provided vendor, and
model name. Examples include "nvidia/gpu/1080ti" or "nvidia/gpu/2080ti".
- `Count` - The count of devices being requested per task. Defaults to 1.
- `Constraints` - A list to define constraints on which device can satisfy the
request. See the constraint reference for more details.
- `Affinities` - A list to define preferences for which device should be
chosen. See the affinity reference for more details.
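As a rough sketch, the jobspec fragment below maps onto these keys: `Name` is `nvidia/gpu`, `Count` is 2, and the `constraint` block becomes one entry in `Constraints` (the threshold value here is hypothetical):

```hcl
resources {
  device "nvidia/gpu" {
    count = 2

    # Becomes one entry in the Constraints list
    constraint {
      attribute = "${device.attr.memory}"
      operator  = ">="
      value     = "4 GiB"
    }
  }
}
```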
<a id="ephemeral_disk"></a>
### Ephemeral Disk
......
---
layout: "docs"
page_title: "Device Plugins: Community Supported"
sidebar_current: "docs-devices-community"
description: |-
A list of community supported Device Plugins.
---
# Community Supported
If you have authored a device plugin that you believe will be useful to the
broader Nomad community and you are committed to maintaining the plugin, please
file a PR to add your plugin to this page.
## Authoring Device Plugins
Nomad has a plugin system for defining device drivers. External device plugins
have the same user experience as built-in drivers. For details on authoring a
device plugin, please refer to the plugin authoring guide.
---
layout: "docs"
page_title: "Device Plugins"
sidebar_current: "docs-devices"
description: |-
Device Plugins are used to expose devices to tasks in Nomad.
---
# Device Plugins
Device plugins are used to detect and make devices available to tasks in Nomad.
Devices are physical hardware that exists on a node, such as a GPU or an FPGA. By
having extensible device plugins, Nomad has the flexibility to support a broad
set of devices and allows the community to build additional device plugins as
needed.
The list of supported device plugins is provided on the left of this page.
Each device plugin documents its configuration and installation requirements,
the attributes it fingerprints, and the environment variables it exposes to
tasks.
---
layout: "docs"
page_title: "Device Plugins: Nvidia"
sidebar_current: "docs-devices-nvidia"
description: |-
The Nvidia Device Plugin detects and makes Nvidia devices available to tasks.
---
# Nvidia GPU Device Plugin
Name: `nvidia-gpu`
The Nvidia device plugin is used to expose Nvidia GPUs to Nomad. The Nvidia
plugin is built into Nomad and does not need to be downloaded separately.
## Fingerprinted Attributes
<table class="table table-bordered table-striped">
<tr>
<th>Attribute</th>
<th>Unit</th>
</tr>
<tr>
<td><tt>memory</tt></td>
<td>MiB</td>
</tr>
<tr>
<td><tt>power</tt></td>
<td>W (Watt)</td>
</tr>
<tr>
<td><tt>bar1</tt></td>
<td>MiB</td>
</tr>
<tr>
<td><tt>driver_version</tt></td>
<td>string</td>
</tr>
<tr>
<td><tt>cores_clock</tt></td>
<td>MHz</td>
</tr>
<tr>
<td><tt>memory_clock</tt></td>
<td>MHz</td>
</tr>
<tr>
<td><tt>pci_bandwidth</tt></td>
<td>MB/s</td>
</tr>
<tr>
<td><tt>display_state</tt></td>
<td>string</td>
</tr>
<tr>
<td><tt>persistence_mode</tt></td>
<td>string</td>
</tr>
</table>
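These attributes can be referenced from `constraint` and `affinity` stanzas in a
job's device request. As a sketch (the threshold is hypothetical), a job could
restrict placement to GPUs with a sufficiently fast core clock:

```hcl
device "nvidia/gpu" {
  # cores_clock is fingerprinted in MHz; Nomad converts between MHz and GHz
  constraint {
    attribute = "${device.attr.cores_clock}"
    operator  = ">="
    value     = "1 GHz"
  }
}
```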
## Runtime Environment
The `nvidia-gpu` device plugin exposes the following environment variables:
* `NVIDIA_VISIBLE_DEVICES` - List of Nvidia GPU IDs available to the task.
### Additional Task Configurations
Additional environment variables can be set by the task to influence the runtime
environment. See [Nvidia's
documentation](https://github.com/NVIDIA/nvidia-container-runtime#environment-variables-oci-spec).
## Installation Requirements
In order to use the `nvidia-gpu` device plugin, the following prerequisites must be met:
1. GNU/Linux x86_64 with kernel version > 3.10
2. NVIDIA GPU with Architecture > Fermi (2.1)
3. NVIDIA drivers >= 340.29 with binary `nvidia-smi`
### Docker Driver Requirements
In order to use the Nvidia device plugin with the Docker driver, please follow
the installation instructions for
[`nvidia-docker`](https://github.com/NVIDIA/nvidia-docker/wiki/Installation-\(version-1.0\)).
## Plugin Configuration
```hcl
plugin "nvidia-gpu" {
ignored_gpu_ids = ["GPU-fef8089b", "GPU-ac81e44d"]
fingerprint_period = "1m"
}
```
The `nvidia-gpu` device plugin supports the following configuration in the agent
config:
* `ignored_gpu_ids` `(array<string>: [])` - Specifies the set of GPU UUIDs that
should be ignored when fingerprinting.
* `fingerprint_period` `(string: "1m")` - The period in which to fingerprint for
device changes.
## Restrictions
The Nvidia integration only works with drivers that natively integrate with
Nvidia's [container runtime
library](https://github.com/NVIDIA/libnvidia-container).
Nomad has tested support with the [`docker` driver][docker-driver] and plans to
bring support to the built-in [`exec`][exec-driver] and [`java`][java-driver]
drivers. Support for [`lxc`][lxc-driver] should be possible by installing the
[Nvidia hook](https://github.com/lxc/lxc/blob/master/hooks/nvidia) but is not
tested or documented by Nomad.
## Examples
Inspect a node with a GPU:
```sh
$ nomad node status 4d46e59f
ID = 4d46e59f
Name = nomad
Class = <none>
DC = dc1
Drain = false
Eligibility = eligible
Status = ready
Uptime = 19m43s
Driver Status = docker,mock_driver,raw_exec
Node Events
Time Subsystem Message
2019-01-23T18:25:18Z Cluster Node registered
Allocated Resources
CPU Memory Disk
0/15576 MHz 0 B/55 GiB 0 B/28 GiB
Allocation Resource Utilization
CPU Memory
0/15576 MHz 0 B/55 GiB
Host Resource Utilization
CPU Memory Disk
2674/15576 MHz 1.5 GiB/55 GiB 3.0 GiB/31 GiB
Device Resource Utilization
nvidia/gpu/Tesla K80[GPU-e1f6f4f1-1ea5-7b9d-5f03-338a9dc32416] 0 / 11441 MiB
Allocations
No allocations placed
```
Display detailed statistics on a node with a GPU:
```sh
$ nomad node status -stats 4d46e59f
ID = 4d46e59f
Name = nomad
Class = <none>
DC = dc1
Drain = false
Eligibility = eligible
Status = ready
Uptime = 19m59s
Driver Status = docker,mock_driver,raw_exec
Node Events
Time Subsystem Message
2019-01-23T18:25:18Z Cluster Node registered
Allocated Resources
CPU Memory Disk
0/15576 MHz 0 B/55 GiB 0 B/28 GiB
Allocation Resource Utilization
CPU Memory
0/15576 MHz 0 B/55 GiB
Host Resource Utilization
CPU Memory Disk
2673/15576 MHz 1.5 GiB/55 GiB 3.0 GiB/31 GiB
Device Resource Utilization
nvidia/gpu/Tesla K80[GPU-e1f6f4f1-1ea5-7b9d-5f03-338a9dc32416] 0 / 11441 MiB
// ...TRUNCATED...
Device Stats
Device = nvidia/gpu/Tesla K80[GPU-e1f6f4f1-1ea5-7b9d-5f03-338a9dc32416]
BAR1 buffer state = 2 / 16384 MiB
Decoder utilization = 0 %
ECC L1 errors = 0
ECC L2 errors = 0
ECC memory errors = 0
Encoder utilization = 0 %
GPU utilization = 0 %
Memory state = 0 / 11441 MiB
Memory utilization = 0 %
Power usage = 37 / 149 W
Temperature = 34 C
Allocations
No allocations placed
```
Run the following example job to see that the GPU was mounted in the
container:
```hcl
job "gpu-test" {
datacenters = ["dc1"]
type = "batch"
group "smi" {
task "smi" {
driver = "docker"
config {
image = "nvidia/cuda:9.0-base"
command = "nvidia-smi"
}
resources {
device "nvidia/gpu" {
count = 1
# Add an affinity for a particular model
affinity {
attribute = "${device.model}"
value = "Tesla K80"
}
}
}
}
}
}
```
```sh
$ nomad run example.nomad
==> Monitoring evaluation "21bd7584"
Evaluation triggered by job "gpu-test"
Allocation "d250baed" created: node "4d46e59f", group "smi"
Evaluation status changed: "pending" -> "complete"
==> Evaluation "21bd7584" finished with status "complete"
$ nomad alloc status d250baed
ID = d250baed
Eval ID = 21bd7584
Name = gpu-test.smi[0]
Node ID = 4d46e59f
Job ID = example
Job Version = 0
Client Status = complete
Client Description = All tasks have completed
Desired Status = run
Desired Description = <none>
Created = 7s ago
Modified = 2s ago
Task "smi" is "dead"
Task Resources
CPU Memory Disk Addresses
0/100 MHz 0 B/300 MiB 300 MiB
Device Stats
nvidia/gpu/Tesla K80[GPU-e1f6f4f1-1ea5-7b9d-5f03-338a9dc32416] 0 / 11441 MiB
Task Events:
Started At = 2019-01-23T18:25:32Z
Finished At = 2019-01-23T18:25:34Z
Total Restarts = 0
Last Restart = N/A
Recent Events:
Time Type Description
2019-01-23T18:25:34Z Terminated Exit Code: 0
2019-01-23T18:25:32Z Started Task started by client
2019-01-23T18:25:29Z Task Setup Building Task Directory
2019-01-23T18:25:29Z Received Task received by client
$ nomad alloc logs d250baed
Wed Jan 23 18:25:32 2019
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 410.48 Driver Version: 410.48 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 Tesla K80 On | 00004477:00:00.0 Off | 0 |
| N/A 33C P8 37W / 149W | 0MiB / 11441MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
```
[docker-driver]: /docs/drivers/docker.html "Nomad docker Driver"
[exec-driver]: /docs/drivers/exec.html "Nomad exec Driver"
[java-driver]: /docs/drivers/java.html "Nomad java Driver"
[lxc-driver]: /docs/drivers/lxc.html "Nomad lxc Driver"
---
layout: "docs"
page_title: "device Stanza - Job Specification"
sidebar_current: "docs-job-specification-device"
description: |-
The "device" stanza is used to require a certain device be made available
to the task.
---
# `device` Stanza
<table class="table table-bordered table-striped">
<tr>
<th width="120">Placement</th>
<td>
<code>job -> group -> task -> resources -> **device**</code>
</td>
</tr>
</table>
The `device` stanza is used to create both a scheduling and runtime requirement
that the given task has access to the specified devices. A device is a hardware
device that is attached to the node and may be made available to the task.
Examples are GPUs, FPGAs, and TPUs.
When a `device` stanza is added, Nomad will schedule the task onto a node that
contains the set of device(s) that meet the specified requirements. The `device` stanza
allows the operator to specify as little as the type of device required, such as
`gpu`, all the way up to arbitrary constraints and affinities.
Once the scheduler has placed the allocation on a suitable node, the Nomad
Client will invoke the device plugin to retrieve information on how to mount the
device and what environment variables to expose. For more information on the
runtime environment, please consult the individual device plugin's documentation.
See the [device plugin's documentation][devices] for a list of supported devices.
```hcl
job "docs" {
group "example" {
task "server" {
resources {
device "nvidia/gpu" {
count = 2
constraint {
attribute = "${driver.attr.memory}"
operator = ">="
value = "2 GiB"
}
affinity {
attribute = "${driver.attr.memory}"
operator = ">="
value = "4 GiB"
weight = 75
}
}
}
}
}
}
```
In the above example, the task is requesting two GPUs from the Nvidia vendor
without specifying a particular model. Instead, it places a hard constraint
that the device has at least 2 GiB of memory and a preference for GPUs that
have at least 4 GiB. This example shows how expressive the `device` stanza can
be.
~> Device support is currently limited to Linux and container-based drivers,
due to their ability to isolate devices to specific tasks.
## `device` Parameters
- `name` `(string: "")` - Specifies the device required. The following inputs
are valid:
* `<device_type>`: If a single value is given, it is assumed to be the device
type, such as "gpu" or "fpga".
* `<vendor>/<device_type>`: If two values are given separated by a `/`, the
given device type will be selected, constraining on the provided vendor.
Examples include "nvidia/gpu" or "amd/gpu".
* `<vendor>/<device_type>/<model>`: If three values are given separated by a `/`, the
given device type will be selected, constraining on the provided vendor, and
model name. Examples include "nvidia/gpu/1080ti" or "nvidia/gpu/2080ti".
- `count` `(int: 1)` - Specifies the number of instances of the given device
that are required.
- `constraint` <code>([Constraint][]: nil)</code> - Constraints to restrict
which devices are eligible. This can be provided multiple times to define
additional constraints. See below for available attributes.
- `affinity` <code>([Affinity][]: nil)</code> - Affinity to specify a preference
for which devices get selected. This can be provided multiple times to define
additional affinities. See below for available attributes.
## `device` Constraint and Affinity Attributes
The set of attributes available for use in a `constraint` or `affinity` is as
follows:
<table class="table table-bordered table-striped">
<tr>
<th>Variable</th>
<th>Description</th>
<th>Example Value</th>
</tr>
<tr>
<td><tt>${device.type}</tt></td>
<td>The type of device</td>
<td><tt>"gpu", "tpu", "fpga"</tt></td>
</tr>
<tr>
<td><tt>${device.vendor}</tt></td>
<td>The device's vendor</td>
<td><tt>"amd", "nvidia", "intel"</tt></td>
</tr>
<tr>
<td><tt>${device.model}</tt></td>
<td>The device's model</td>
<td><tt>"1080ti"</tt></td>
</tr>
<tr>
<td><tt>${device.attr.&lt;property&gt;}</tt></td>
<td>Property of the device</td>
<td><tt>${device.attr.memory} => 8 GiB</tt></td>
</tr>
</table>
For the set of attributes available, please see the individual [device plugin's
documentation][devices].
### Attribute Units and Conversions
Devices report their attributes with strict types and can also provide unit
information. For example, when a GPU is reporting its memory, it can report that
it is "4096 MiB". Since Nomad has the associated unit information, a constraint
that requires greater than "3.5 GiB" can match since Nomad can convert between
these units.
The units Nomad supports are as follows:
<table class="table table-bordered table-striped">
<tr>
<th>Base Unit</th>
<th>Values</th>
</tr>
<tr>
<td><tt>Byte</tt></td>
<td><tt>**Base 2**: KiB, MiB, GiB, TiB, PiB, EiB<br>**Base 10**: kB, KB (equivalent to kB), MB, GB, TB, PB, EB</tt></td>
</tr>
<tr>
<td><tt>Byte Rates</tt></td>
<td><tt>**Base 2**: KiB/s, MiB/s, GiB/s, TiB/s, PiB/s, EiB/s<br>**Base 10**: kB/s, KB/s (equivalent to kB/s), MB/s, GB/s, TB/s, PB/s, EB/s</tt></td>
</tr>
<tr>
<td><tt>Hertz</tt></td>
<td><tt>MHz, GHz</tt></td>
</tr>
<tr>
<td><tt>Watts</tt></td>
<td><tt>mW, W, kW, MW, GW</tt></td>
</tr>
</table>
Conversion is only possible within the same base unit.
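As a minimal sketch of the memory example above, a GPU fingerprinted with
"4096 MiB" of memory would satisfy the following constraint, since 3.5 GiB
converts to 3584 MiB:

```hcl
device "gpu" {
  # A device reporting memory = "4096 MiB" matches, as 4096 MiB > 3584 MiB
  constraint {
    attribute = "${device.attr.memory}"
    operator  = ">"
    value     = "3.5 GiB"
  }
}
```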
## `device` Examples
The following examples only show the `device` stanzas. Remember that the
`device` stanza is only valid in the placements listed above.
### Single Nvidia GPU
This example schedules a task with a single Nvidia GPU made available.
```hcl
device "nvidia/gpu" {}
```
### Multiple Nvidia GPUs
This example schedules a task with two Nvidia GPUs made available.
```hcl
device "nvidia/gpu" {
count = 2
}
```
### Single Nvidia GPU with Specific Model
This example schedules a task with a single Nvidia GPU made available and uses
the name to specify the exact model to be used.
```hcl
device "nvidia/gpu/1080ti" {}
```
This is a simplification of the following:
```hcl
device "gpu" {
count = 1
constraint {
attribute = "${device.vendor}"
value = "nvidia"
}
constraint {
attribute = "${device.model}"
value = "1080ti"
}
}
```
### Affinity with Unit Conversion
This example uses an affinity to tell the scheduler it would prefer if the GPU
had at least 1.5 GiB of memory. The following are both equivalent as Nomad can
do unit conversions.
Specified in `GiB`:
```hcl
device "nvidia/gpu" {
affinity {
attribute = "${device.attr.memory}"
operator = ">="
value = "1.5 GiB"
weight = 75
}
}
```
Specified in `MiB`:
```hcl
device "nvidia/gpu" {
affinity {
attribute = "${device.attr.memory}"
operator = ">="
value = "1500 MiB"
weight = 75
}
}
```
[affinity]: /docs/job-specification/affinity.html "Nomad affinity Job Specification"
[constraint]: /docs/job-specification/constraint.html "Nomad constraint Job Specification"
[devices]: /docs/devices/index.html "Nomad Device Plugins"
@@ -36,6 +36,10 @@ job "docs" {
static = 22
}
}
device "nvidia/gpu" {
count = 2
}
}
}
}
@@ -48,9 +52,12 @@ job "docs" {
- `memory` `(int: 300)` - Specifies the memory required in MB
- `network` <code>([Network][]: <required>)</code> - Specifies the network
- `network` <code>([Network][]: &lt;optional&gt;)</code> - Specifies the network
requirements, including static and dynamic port allocations.
- `device` <code>([Device][]: &lt;optional&gt;)</code> - Specifies the device
requirements. This may be repeated to request multiple device types.
## `resources` Examples
The following examples only show the `resources` stanzas. Remember that the
@@ -86,4 +93,18 @@ resources {
}
```
### Devices
This example shows a device constraint as specified in the [device][] stanza,
requiring two Nvidia GPUs to be made available:
```hcl
resources {
device "nvidia/gpu" {
count = 2
}
}
```
[network]: /docs/job-specification/network.html "Nomad network Job Specification"
[device]: /docs/job-specification/device.html "Nomad device Job Specification"
@@ -333,6 +333,9 @@
<li<%= sidebar_current("docs-job-specification-constraint")%>>
<a href="/docs/job-specification/constraint.html">constraint</a>
</li>
<li<%= sidebar_current("docs-job-specification-device")%>>
<a href="/docs/job-specification/device.html">device</a>
</li>
<li<%= sidebar_current("docs-job-specification-dispatch-payload")%>>
<a href="/docs/job-specification/dispatch_payload.html">dispatch_payload</a>
</li>
@@ -434,6 +437,19 @@
</ul>
</li>
<li<%= sidebar_current("docs-devices") %>>
<a href="/docs/devices/index.html">Device Plugins</a>
<ul class="nav">
<li<%= sidebar_current("docs-devices-nvidia") %>>
<a href="/docs/devices/nvidia.html">Nvidia</a>
</li>
<li<%= sidebar_current("docs-devices-community") %>>
<a href="/docs/devices/community.html">Community Supported</a>
</li>
</ul>
</li>
<li<%= sidebar_current("docs-schedulers") %>>
<a href="/docs/schedulers.html">Schedulers</a>
</li>
......