Unverified Commit 25a1f735 authored by Chris Baker, committed by GitHub

Merge pull request #5548 from hashicorp/cgbaker/update-terraform

Update terraform, add nvidia example
Showing with 261 additions and 33 deletions
variable "name" {
description = "Used to name various infrastructure components"
default = "hashistack"
}
variable "whitelist_ip" {
description = "IP to whitelist for the security groups (set 0.0.0.0/0 for world)"
}
variable "region" {
@@ -10,17 +13,24 @@ variable "region" {
variable "ami" {}
variable "instance_type" {
description = "The AWS instance type to use for both clients and servers."
variable "server_instance_type" {
description = "The AWS instance type to use for servers."
default = "t2.medium"
}
variable "client_instance_type" {
description = "The AWS instance type to use for clients."
default = "t2.medium"
}
variable "root_block_device_size" {
description = "The volume size of the root block device."
-default = 8
+default = 16
}
variable "key_name" {}
variable "key_name" {
description = "Name of the SSH key used to provision EC2 instances."
}
variable "server_count" {
description = "The number of servers to provision."
@@ -32,6 +42,7 @@ variable "client_count" {
default = "4"
}
variable "retry_join" {
description = "Used by Consul to automatically form a cluster."
type = "map"
@@ -58,13 +69,15 @@ module "hashistack" {
name = "${var.name}"
region = "${var.region}"
ami = "${var.ami}"
instance_type = "${var.instance_type}"
server_instance_type = "${var.server_instance_type}"
client_instance_type = "${var.client_instance_type}"
key_name = "${var.key_name}"
server_count = "${var.server_count}"
client_count = "${var.client_count}"
retry_join = "${var.retry_join}"
nomad_binary = "${var.nomad_binary}"
root_block_device_size = "${var.root_block_device_size}"
whitelist_ip = "${var.whitelist_ip}"
}
output "IP_Addresses" {
@@ -89,8 +102,12 @@ executing:
Simply wait a few seconds and rerun the command if this occurs.
-The Nomad UI can be accessed at http://PUBLIC_IP:4646/ui.
-The Consul UI can be accessed at http://PUBLIC_IP:8500/ui.
+The Nomad UI can be accessed at http://${module.hashistack.server_lb_ip}:4646/ui.
+The Consul UI can be accessed at http://${module.hashistack.server_lb_ip}:8500/ui.
+Set the following for access from the Nomad CLI:
+export NOMAD_ADDR=http://${module.hashistack.server_lb_ip}:4646
CONFIGURATION
}
region = "us-east-1"
ami = "ami-066a7f2ffac02f833"
instance_type = "t2.medium"
key_name = "KEY_NAME"
server_count = "3"
client_count = "4"
region = "us-east-1"
ami = "ami-090a41df9e193a506"
server_instance_type = "t2.medium"
client_instance_type = "p3.2xlarge"
server_count = "1"
client_count = "1"
nomad_binary = "https://releases.hashicorp.com/nomad/0.9.0/nomad_0.9.0_linux_amd64.zip"
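With the values above saved to a tfvars file, the cluster is provisioned with the standard Terraform workflow. A minimal sketch, assuming the values are stored as terraform.tfvars in the example's Terraform directory (the filename and path are assumptions, not taken from this diff):

# Provision the hashistack cluster; run from the directory containing main.tf
terraform init
terraform plan -out=hashistack.plan
terraform apply hashistack.plan
# Print the DNS name of the new server load balancer
terraform output server_lb_ip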
variable "name" {}
variable "region" {}
variable "ami" {}
variable "instance_type" {}
variable "server_instance_type" {}
variable "client_instance_type" {}
variable "key_name" {}
variable "server_count" {}
variable "client_count" {}
variable "nomad_binary" {}
variable "root_block_device_size" {}
variable "whitelist_ip" {}
variable "retry_join" {
type = "map"
@@ -22,6 +24,35 @@ data "aws_vpc" "default" {
default = true
}
resource "aws_security_group" "server_lb" {
name = "${var.name}-server-lb"
vpc_id = "${data.aws_vpc.default.id}"
# Nomad
ingress {
from_port = 4646
to_port = 4646
protocol = "tcp"
cidr_blocks = ["${var.whitelist_ip}"]
}
# Consul
ingress {
from_port = 8500
to_port = 8500
protocol = "tcp"
cidr_blocks = ["${var.whitelist_ip}"]
}
egress {
from_port = 0
to_port = 0
protocol = "-1"
cidr_blocks = ["0.0.0.0/0"]
}
}
resource "aws_security_group" "primary" {
name = "${var.name}"
vpc_id = "${data.aws_vpc.default.id}"
@@ -30,7 +61,7 @@ resource "aws_security_group" "primary" {
from_port = 22
to_port = 22
protocol = "tcp"
cidr_blocks = ["0.0.0.0/0"]
cidr_blocks = ["${var.whitelist_ip}"]
}
# Nomad
@@ -38,7 +69,8 @@ resource "aws_security_group" "primary" {
from_port = 4646
to_port = 4646
protocol = "tcp"
cidr_blocks = ["0.0.0.0/0"]
cidr_blocks = ["${var.whitelist_ip}"]
security_groups = ["${aws_security_group.server_lb.id}"]
}
# Fabio
@@ -46,7 +78,7 @@ resource "aws_security_group" "primary" {
from_port = 9998
to_port = 9999
protocol = "tcp"
cidr_blocks = ["0.0.0.0/0"]
cidr_blocks = ["${var.whitelist_ip}"]
}
# Consul
@@ -54,7 +86,8 @@ resource "aws_security_group" "primary" {
from_port = 8500
to_port = 8500
protocol = "tcp"
cidr_blocks = ["0.0.0.0/0"]
cidr_blocks = ["${var.whitelist_ip}"]
security_groups = ["${aws_security_group.server_lb.id}"]
}
# HDFS NameNode UI
@@ -62,7 +95,7 @@ resource "aws_security_group" "primary" {
from_port = 50070
to_port = 50070
protocol = "tcp"
cidr_blocks = ["0.0.0.0/0"]
cidr_blocks = ["${var.whitelist_ip}"]
}
# HDFS DataNode UI
@@ -70,7 +103,7 @@ resource "aws_security_group" "primary" {
from_port = 50075
to_port = 50075
protocol = "tcp"
cidr_blocks = ["0.0.0.0/0"]
cidr_blocks = ["${var.whitelist_ip}"]
}
# Spark history server UI
@@ -78,9 +111,18 @@ resource "aws_security_group" "primary" {
from_port = 18080
to_port = 18080
protocol = "tcp"
cidr_blocks = ["0.0.0.0/0"]
cidr_blocks = ["${var.whitelist_ip}"]
}
# Jupyter
ingress {
from_port = 8888
to_port = 8888
protocol = "tcp"
cidr_blocks = ["${var.whitelist_ip}"]
}
ingress {
from_port = 0
to_port = 0
@@ -119,7 +161,7 @@ data "template_file" "user_data_client" {
resource "aws_instance" "server" {
ami = "${var.ami}"
instance_type = "${var.instance_type}"
instance_type = "${var.server_instance_type}"
key_name = "${var.key_name}"
vpc_security_group_ids = ["${aws_security_group.primary.id}"]
count = "${var.server_count}"
@@ -142,7 +184,7 @@ resource "aws_instance" "server" {
resource "aws_instance" "client" {
ami = "${var.ami}"
instance_type = "${var.instance_type}"
instance_type = "${var.client_instance_type}"
key_name = "${var.key_name}"
vpc_security_group_ids = ["${aws_security_group.primary.id}"]
count = "${var.client_count}"
@@ -213,6 +255,26 @@ data "aws_iam_policy_document" "auto_discover_cluster" {
}
}
resource "aws_elb" "server_lb" {
name = "${var.name}-server-lb"
availability_zones = ["${distinct(aws_instance.server.*.availability_zone)}"]
internal = false
instances = ["${aws_instance.server.*.id}"]
listener {
instance_port = 4646
instance_protocol = "http"
lb_port = 4646
lb_protocol = "http"
}
listener {
instance_port = 8500
instance_protocol = "http"
lb_port = 8500
lb_protocol = "http"
}
security_groups = ["${aws_security_group.server_lb.id}"]
}
output "server_public_ips" {
value = ["${aws_instance.server.*.public_ip}"]
}
@@ -220,3 +282,7 @@ output "server_public_ips" {
output "client_public_ips" {
value = ["${aws_instance.client.*.public_ip}"]
}
output "server_lb_ip" {
value = "${aws_elb.server_lb.dns_name}"
}
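Once applied, the new server_lb_ip output can be used to confirm that the load balancer is forwarding to the Nomad and Consul APIs on the server nodes. A hedged sketch using the standard status endpoints (the verification commands are an assumption, not part of this change):

# Both endpoints should return the address of the current cluster leader
curl -s "http://$(terraform output server_lb_ip):4646/v1/status/leader"
curl -s "http://$(terraform output server_lb_ip):8500/v1/status/leader"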
@@ -5,7 +5,7 @@
"source_ami": "ami-80861296",
"instance_type": "t2.medium",
"ssh_username": "ubuntu",
"ami_name": "hashistack {{timestamp}}",
"ami_name": "cgbaker hashistack-nvidia {{timestamp}}",
"ami_groups": ["all"]
}],
"provisioners": [
@@ -28,6 +28,9 @@
},
{
"type": "shell",
"script": "../shared/scripts/setup.sh"
"script": "../shared/scripts/setup.sh",
"environment_vars": [
"INSTALL_NVIDIA_DOCKER=true"
]
}]
}
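The INSTALL_NVIDIA_DOCKER flag is consumed by setup.sh (see the driver installation block further down). A minimal sketch of building the GPU-enabled AMI from this template, assuming it is saved as packer.json (the filename is an assumption) and AWS credentials are in the environment:

# Validate the template, then build and register the AMI
packer validate packer.json
packer build packer.json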
job "tensorrt" {
datacenters = ["dc1"]
group "back" {
task "rtserver" {
driver = "docker"
config {
image = "nvcr.io/nvidia/tensorrtserver:19.02-py3"
command = "trtserver"
args = [
"--model-store=${NOMAD_TASK_DIR}/models"
]
shm_size = 1024
port_map {
http = 8000
grpc = 8001
metrics = 8002
}
ulimit {
memlock = "-1"
stack = "67108864"
}
}
service {
port = "http"
tags = ["http"]
check {
type = "http"
port = "http"
path = "/api/health/ready"
interval = "5s"
timeout = "1s"
}
check_restart {
grace = "30s"
}
}
# load the example model into ${NOMAD_TASK_DIR}/models
artifact {
source = "http://download.caffe2.ai.s3.amazonaws.com/models/resnet50/predict_net.pb"
destination = "local/models/resnet50_netdef/1/model.netdef"
mode = "file"
}
artifact {
source = "http://download.caffe2.ai.s3.amazonaws.com/models/resnet50/init_net.pb"
destination = "local/models/resnet50_netdef/1/init_model.netdef"
mode = "file"
}
artifact {
source = "https://raw.githubusercontent.com/NVIDIA/tensorrt-inference-server/v1.0.0/docs/examples/model_repository/resnet50_netdef/config.pbtxt"
destination = "local/models/resnet50_netdef/config.pbtxt"
mode = "file"
}
artifact {
source = "https://raw.githubusercontent.com/NVIDIA/tensorrt-inference-server/v1.0.0/docs/examples/model_repository/resnet50_netdef/resnet50_labels.txt"
destination = "local/models/resnet50_netdef/resnet50_labels.txt"
mode = "file"
}
resources {
cpu = 8192
memory = 8192
network {
mbits = 10
port "http" {}
}
# an NVIDIA GPU with >= 4 GiB of memory, preferably a Tesla
device "nvidia/gpu" {
count = 1
constraint {
attribute = "${device.attr.memory}"
operator = ">="
value = "4 GiB"
}
affinity {
attribute = "${device.model}"
operator = "regexp"
value = "Tesla"
}
}
}
}
}
group "front" {
task "web" {
driver = "docker"
config {
image = "renaudwastaken/tensorrt-frontend:latest"
args = [
"main.py", "${RTSERVER}"
]
port_map {
http = 5000
}
}
resources {
cpu = 1024
memory = 1024
network {
mbits = 10
port "http" { static = "8888" }
}
}
template {
data = <<EOH
RTSERVER = {{ with service "tensorrt-back-rtserver" }}{{ with index . 0 }} http://{{.Address }}:{{.Port }} {{ end }}{{ end }}
EOH
destination = "local/rtserver.env"
env = true
}
}
}
}
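A minimal sketch of submitting and checking the example job, assuming the job file is saved as tensorrt.nomad and NOMAD_ADDR points at the cluster (both names are assumptions):

# Submit the job and watch its placement; the client node needs a GPU
# that satisfies the nvidia/gpu device constraint above
nomad job run tensorrt.nomad
nomad job status tensorrt
# The web front end is published on static port 8888 on the client node
curl http://CLIENT_PUBLIC_IP:8888/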
@@ -14,8 +14,8 @@ HOME_DIR=ubuntu
# Wait for network
sleep 15
-# IP_ADDRESS=$(curl http://instance-data/latest/meta-data/local-ipv4)
-IP_ADDRESS="$(/sbin/ifconfig eth0 | grep 'inet addr:' | cut -d: -f2 | awk '{ print $1}')"
+IP_ADDRESS=$(curl http://instance-data/latest/meta-data/local-ipv4)
+# IP_ADDRESS="$(/sbin/ifconfig eth0 | grep 'inet addr:' | cut -d: -f2 | awk '{ print $1}')"
DOCKER_BRIDGE_IP_ADDRESS=(`ifconfig docker0 2>/dev/null|awk '/inet addr:/ {print $2}'|sed 's/addr://'`)
CLOUD=$1
RETRY_JOIN=$2
@@ -15,8 +15,8 @@ HOME_DIR=ubuntu
# Wait for network
sleep 15
-# IP_ADDRESS=$(curl http://instance-data/latest/meta-data/local-ipv4)
-IP_ADDRESS="$(/sbin/ifconfig eth0 | grep 'inet addr:' | cut -d: -f2 | awk '{ print $1}')"
+IP_ADDRESS=$(curl http://instance-data/latest/meta-data/local-ipv4)
+# IP_ADDRESS="$(/sbin/ifconfig eth0 | grep 'inet addr:' | cut -d: -f2 | awk '{ print $1}')"
DOCKER_BRIDGE_IP_ADDRESS=(`ifconfig docker0 2>/dev/null|awk '/inet addr:/ {print $2}'|sed 's/addr://'`)
CLOUD=$1
SERVER_COUNT=$2
@@ -19,7 +19,7 @@ VAULTDOWNLOAD=https://releases.hashicorp.com/vault/${VAULTVERSION}/vault_${VAULT
VAULTCONFIGDIR=/etc/vault.d
VAULTDIR=/opt/vault
-NOMADVERSION=0.8.7
+NOMADVERSION=0.9.0
NOMADDOWNLOAD=https://releases.hashicorp.com/nomad/${NOMADVERSION}/nomad_${NOMADVERSION}_linux_amd64.zip
NOMADCONFIGDIR=/etc/nomad.d
NOMADDIR=/opt/nomad
@@ -113,6 +113,25 @@ sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/${di
sudo apt-get update
sudo apt-get install -y docker-ce
if [[ ! -z ${INSTALL_NVIDIA_DOCKER+x} ]]; then
# Install official NVIDIA driver package
sudo apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub
sudo sh -c 'echo "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list'
sudo apt-get update && sudo apt-get install -y --no-install-recommends linux-headers-generic dkms cuda-drivers
# Install nvidia-docker and nvidia-docker-plugin
# from: https://github.com/NVIDIA/nvidia-docker#ubuntu-140416041804-debian-jessiestretch
wget -P /tmp https://github.com/NVIDIA/nvidia-docker/releases/download/v1.0.1/nvidia-docker_1.0.1-1_amd64.deb
sudo dpkg -i /tmp/nvidia-docker*.deb && rm /tmp/nvidia-docker*.deb
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | \
sudo tee /etc/apt/sources.list.d/nvidia-docker.list
sudo apt-get update
sudo apt-get install -y nvidia-docker2
fi
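# A hedged smoke test (not part of the original script): once nvidia-docker2
# is installed and the instance has a GPU, the NVIDIA container runtime can
# be exercised with the check documented by the nvidia-docker project:
#   sudo docker run --runtime=nvidia --rm nvidia/cuda:9.0-base nvidia-smi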
# rkt
VERSION=1.29.0
DOWNLOAD=https://github.com/rkt/rkt/releases/download/v${VERSION}/rkt-v${VERSION}.tar.gz
@@ -155,7 +174,7 @@ JAVA_HOME=$(readlink -f /usr/bin/java | sed "s:bin/java::")
# Spark
sudo wget -P /ops/examples/spark https://s3.amazonaws.com/nomad-spark/spark-2.2.0-bin-nomad-0.7.0.tgz
-sudo tar -xvf /ops/examples/spark/spark-2.2.0-bin-nomad-0.7.0.tgz --directory /ops/examples/spark
+sudo tar -xf /ops/examples/spark/spark-2.2.0-bin-nomad-0.7.0.tgz --directory /ops/examples/spark
sudo mv /ops/examples/spark/spark-2.2.0-bin-nomad-0.7.0 /usr/local/bin/spark
sudo chown -R root:root /usr/local/bin/spark