Unverified Commit 25a1f735 authored by Chris Baker, committed by GitHub

Merge pull request #5548 from hashicorp/cgbaker/update-terraform

Update terraform, add nvidia example
Showing with 261 additions and 33 deletions
variable "name" {
description = "Used to name various infrastructure components"
default = "hashistack"
}
variable "whitelist_ip" {
description = "IP to whitelist for the security groups (set 0.0.0.0/0 for world)"
}
variable "region" {
@@ -10,17 +13,24 @@ variable "region" {
variable "ami" {}
variable "instance_type" {
description = "The AWS instance type to use for both clients and servers."
variable "server_instance_type" {
description = "The AWS instance type to use for servers."
default = "t2.medium"
}
variable "client_instance_type" {
description = "The AWS instance type to use for clients."
default = "t2.medium"
}
variable "root_block_device_size" {
description = "The volume size of the root block device."
-default = 8
+default = 16
}
variable "key_name" {}
variable "key_name" {
description = "Name of the SSH key used to provision EC2 instances."
}
variable "server_count" {
description = "The number of servers to provision."
@@ -32,6 +42,7 @@ variable "client_count" {
default = "4"
}
variable "retry_join" {
description = "Used by Consul to automatically form a cluster."
type = "map"
@@ -58,13 +69,15 @@ module "hashistack" {
name = "${var.name}"
region = "${var.region}"
ami = "${var.ami}"
instance_type = "${var.instance_type}"
server_instance_type = "${var.server_instance_type}"
client_instance_type = "${var.client_instance_type}"
key_name = "${var.key_name}"
server_count = "${var.server_count}"
client_count = "${var.client_count}"
retry_join = "${var.retry_join}"
nomad_binary = "${var.nomad_binary}"
root_block_device_size = "${var.root_block_device_size}"
whitelist_ip = "${var.whitelist_ip}"
}
output "IP_Addresses" {
@@ -89,8 +102,12 @@ executing:
Simply wait a few seconds and rerun the command if this occurs.
-The Nomad UI can be accessed at http://PUBLIC_IP:4646/ui.
-The Consul UI can be accessed at http://PUBLIC_IP:8500/ui.
+The Nomad UI can be accessed at http://${module.hashistack.server_lb_ip}:4646/ui.
+The Consul UI can be accessed at http://${module.hashistack.server_lb_ip}:8500/ui.
+Set the following for access from the Nomad CLI:
+export NOMAD_ADDR=http://${module.hashistack.server_lb_ip}:4646
CONFIGURATION
}
region = "us-east-1"
ami = "ami-066a7f2ffac02f833"
instance_type = "t2.medium"
key_name = "KEY_NAME"
server_count = "3"
client_count = "4"
region = "us-east-1"
ami = "ami-090a41df9e193a506"
server_instance_type = "t2.medium"
client_instance_type = "p3.2xlarge"
server_count = "1"
client_count = "1"
nomad_binary = "https://releases.hashicorp.com/nomad/0.9.0/nomad_0.9.0_linux_amd64.zip"
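With the values above saved to a tfvars file, the cluster is provisioned with the standard Terraform workflow. A minimal sketch, assuming the values are stored as terraform.tfvars in the example's Terraform directory (the filename and path are assumptions, not taken from this diff):

# Provision the hashistack cluster; run from the directory containing main.tf
terraform init
terraform plan -out=hashistack.plan
terraform apply hashistack.plan
# Print the DNS name of the new server load balancer
terraform output server_lb_ip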
variable "name" {}
variable "region" {}
variable "ami" {}
variable "instance_type" {}
variable "server_instance_type" {}
variable "client_instance_type" {}
variable "key_name" {}
variable "server_count" {}
variable "client_count" {}
variable "nomad_binary" {}
variable "root_block_device_size" {}
variable "whitelist_ip" {}
variable "retry_join" {
type = "map"
@@ -22,6 +24,35 @@ data "aws_vpc" "default" {
default = true
}
resource "aws_security_group" "server_lb" {
name = "${var.name}-server-lb"
vpc_id = "${data.aws_vpc.default.id}"
# Nomad
ingress {
from_port = 4646
to_port = 4646
protocol = "tcp"
cidr_blocks = ["${var.whitelist_ip}"]
}
# Consul
ingress {
from_port = 8500
to_port = 8500
protocol = "tcp"
cidr_blocks = ["${var.whitelist_ip}"]
}
egress {
from_port = 0
to_port = 0
protocol = "-1"
cidr_blocks = ["0.0.0.0/0"]
}
}
resource "aws_security_group" "primary" {
name = "${var.name}"
vpc_id = "${data.aws_vpc.default.id}"
@@ -30,7 +61,7 @@ resource "aws_security_group" "primary" {
from_port = 22
to_port = 22
protocol = "tcp"
cidr_blocks = ["0.0.0.0/0"]
cidr_blocks = ["${var.whitelist_ip}"]
}
# Nomad
@@ -38,7 +69,8 @@ resource "aws_security_group" "primary" {
from_port = 4646
to_port = 4646
protocol = "tcp"
cidr_blocks = ["0.0.0.0/0"]
cidr_blocks = ["${var.whitelist_ip}"]
security_groups = ["${aws_security_group.server_lb.id}"]
}
# Fabio
@@ -46,7 +78,7 @@ resource "aws_security_group" "primary" {
from_port = 9998
to_port = 9999
protocol = "tcp"
cidr_blocks = ["0.0.0.0/0"]
cidr_blocks = ["${var.whitelist_ip}"]
}
# Consul
@@ -54,7 +86,8 @@ resource "aws_security_group" "primary" {
from_port = 8500
to_port = 8500
protocol = "tcp"
cidr_blocks = ["0.0.0.0/0"]
cidr_blocks = ["${var.whitelist_ip}"]
security_groups = ["${aws_security_group.server_lb.id}"]
}
# HDFS NameNode UI
@@ -62,7 +95,7 @@ resource "aws_security_group" "primary" {
from_port = 50070
to_port = 50070
protocol = "tcp"
cidr_blocks = ["0.0.0.0/0"]
cidr_blocks = ["${var.whitelist_ip}"]
}
# HDFS DataNode UI
@@ -70,7 +103,7 @@ resource "aws_security_group" "primary" {
from_port = 50075
to_port = 50075
protocol = "tcp"
cidr_blocks = ["0.0.0.0/0"]
cidr_blocks = ["${var.whitelist_ip}"]
}
# Spark history server UI
@@ -78,9 +111,18 @@ resource "aws_security_group" "primary" {
from_port = 18080
to_port = 18080
protocol = "tcp"
cidr_blocks = ["0.0.0.0/0"]
cidr_blocks = ["${var.whitelist_ip}"]
}
# Jupyter
ingress {
from_port = 8888
to_port = 8888
protocol = "tcp"
cidr_blocks = ["${var.whitelist_ip}"]
}
ingress {
from_port = 0
to_port = 0
@@ -119,7 +161,7 @@ data "template_file" "user_data_client" {
resource "aws_instance" "server" {
ami = "${var.ami}"
instance_type = "${var.instance_type}"
instance_type = "${var.server_instance_type}"
key_name = "${var.key_name}"
vpc_security_group_ids = ["${aws_security_group.primary.id}"]
count = "${var.server_count}"
@@ -142,7 +184,7 @@ resource "aws_instance" "server" {
resource "aws_instance" "client" {
ami = "${var.ami}"
instance_type = "${var.instance_type}"
instance_type = "${var.client_instance_type}"
key_name = "${var.key_name}"
vpc_security_group_ids = ["${aws_security_group.primary.id}"]
count = "${var.client_count}"
@@ -213,6 +255,26 @@ data "aws_iam_policy_document" "auto_discover_cluster" {
}
}
resource "aws_elb" "server_lb" {
name = "${var.name}-server-lb"
availability_zones = ["${distinct(aws_instance.server.*.availability_zone)}"]
internal = false
instances = ["${aws_instance.server.*.id}"]
listener {
instance_port = 4646
instance_protocol = "http"
lb_port = 4646
lb_protocol = "http"
}
listener {
instance_port = 8500
instance_protocol = "http"
lb_port = 8500
lb_protocol = "http"
}
security_groups = ["${aws_security_group.server_lb.id}"]
}
output "server_public_ips" {
value = ["${aws_instance.server.*.public_ip}"]
}
@@ -220,3 +282,7 @@ output "server_public_ips" {
output "client_public_ips" {
value = ["${aws_instance.client.*.public_ip}"]
}
output "server_lb_ip" {
value = "${aws_elb.server_lb.dns_name}"
}
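Once applied, the new server_lb_ip output can be used to confirm that the load balancer is forwarding to the Nomad and Consul APIs on the server nodes. A hedged sketch using the standard status endpoints (the verification commands are an assumption, not part of this change):

# Both endpoints should return the address of the current cluster leader
curl -s "http://$(terraform output server_lb_ip):4646/v1/status/leader"
curl -s "http://$(terraform output server_lb_ip):8500/v1/status/leader"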
@@ -5,7 +5,7 @@
"source_ami": "ami-80861296",
"instance_type": "t2.medium",
"ssh_username": "ubuntu",
"ami_name": "hashistack {{timestamp}}",
"ami_name": "cgbaker hashistack-nvidia {{timestamp}}",
"ami_groups": ["all"]
}],
"provisioners": [
@@ -28,6 +28,9 @@
},
{
"type": "shell",
"script": "../shared/scripts/setup.sh"
"script": "../shared/scripts/setup.sh",
"environment_vars": [
"INSTALL_NVIDIA_DOCKER=true"
]
}]
}
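The INSTALL_NVIDIA_DOCKER flag is consumed by setup.sh (see the driver installation block further down). A minimal sketch of building the GPU-enabled AMI from this template, assuming it is saved as packer.json (the filename is an assumption) and AWS credentials are in the environment:

# Validate the template, then build and register the AMI
packer validate packer.json
packer build packer.json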
job "tensorrt" {
datacenters = ["dc1"]
group "back" {
task "rtserver" {
driver = "docker"
config {
image = "nvcr.io/nvidia/tensorrtserver:19.02-py3"
command = "trtserver"
args = [
"--model-store=${NOMAD_TASK_DIR}/models"
]
shm_size = 1024
port_map {
http = 8000
grpc = 8001
metrics = 8002
}
ulimit {
memlock = "-1"
stack = "67108864"
}
}
service {
port = "http"
tags = ["http"]
check {
type = "http"
port = "http"
path = "/api/health/ready"
interval = "5s"
timeout = "1s"
}
check_restart {
grace = "30s"
}
}
# load the example model into ${NOMAD_TASK_DIR}/models
artifact {
source = "http://download.caffe2.ai.s3.amazonaws.com/models/resnet50/predict_net.pb"
destination = "local/models/resnet50_netdef/1/model.netdef"
mode = "file"
}
artifact {
source = "http://download.caffe2.ai.s3.amazonaws.com/models/resnet50/init_net.pb"
destination = "local/models/resnet50_netdef/1/init_model.netdef"
mode = "file"
}
artifact {
source = "https://raw.githubusercontent.com/NVIDIA/tensorrt-inference-server/v1.0.0/docs/examples/model_repository/resnet50_netdef/config.pbtxt"
destination = "local/models/resnet50_netdef/config.pbtxt"
mode = "file"
}
artifact {
source = "https://raw.githubusercontent.com/NVIDIA/tensorrt-inference-server/v1.0.0/docs/examples/model_repository/resnet50_netdef/resnet50_labels.txt"
destination = "local/models/resnet50_netdef/resnet50_labels.txt"
mode = "file"
}
resources {
cpu = 8192
memory = 8192
network {
mbits = 10
port "http" {}
}
# an NVIDIA GPU with >= 4 GiB of memory, preferably a Tesla
device "nvidia/gpu" {
count = 1
constraint {
attribute = "${device.attr.memory}"
operator = ">="
value = "4 GiB"
}
affinity {
attribute = "${device.model}"
operator = "regexp"
value = "Tesla"
}
}
}
}
}
group "front" {
task "web" {
driver = "docker"
config {
image = "renaudwastaken/tensorrt-frontend:latest"
args = [
"main.py", "${RTSERVER}"
]
port_map {
http = 5000
}
}
resources {
cpu = 1024
memory = 1024
network {
mbits = 10
port "http" { static = "8888" }
}
}
template {
data = <<EOH
RTSERVER = {{ with service "tensorrt-back-rtserver" }}{{ with index . 0 }} http://{{.Address }}:{{.Port }} {{ end }}{{ end }}
EOH
destination = "local/rtserver.env"
env = true
}
}
}
}
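A minimal sketch of submitting and checking the example job, assuming the job file is saved as tensorrt.nomad and NOMAD_ADDR points at the cluster (both names are assumptions):

# Submit the job and watch its placement; the client node needs a GPU
# that satisfies the nvidia/gpu device constraint above
nomad job run tensorrt.nomad
nomad job status tensorrt
# The web front end is published on static port 8888 on the client node
curl http://CLIENT_PUBLIC_IP:8888/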
@@ -14,8 +14,8 @@ HOME_DIR=ubuntu
# Wait for network
sleep 15
-# IP_ADDRESS=$(curl http://instance-data/latest/meta-data/local-ipv4)
-IP_ADDRESS="$(/sbin/ifconfig eth0 | grep 'inet addr:' | cut -d: -f2 | awk '{ print $1}')"
+IP_ADDRESS=$(curl http://instance-data/latest/meta-data/local-ipv4)
+# IP_ADDRESS="$(/sbin/ifconfig eth0 | grep 'inet addr:' | cut -d: -f2 | awk '{ print $1}')"
DOCKER_BRIDGE_IP_ADDRESS=(`ifconfig docker0 2>/dev/null|awk '/inet addr:/ {print $2}'|sed 's/addr://'`)
CLOUD=$1
RETRY_JOIN=$2
@@ -15,8 +15,8 @@ HOME_DIR=ubuntu
# Wait for network
sleep 15
-# IP_ADDRESS=$(curl http://instance-data/latest/meta-data/local-ipv4)
-IP_ADDRESS="$(/sbin/ifconfig eth0 | grep 'inet addr:' | cut -d: -f2 | awk '{ print $1}')"
+IP_ADDRESS=$(curl http://instance-data/latest/meta-data/local-ipv4)
+# IP_ADDRESS="$(/sbin/ifconfig eth0 | grep 'inet addr:' | cut -d: -f2 | awk '{ print $1}')"
DOCKER_BRIDGE_IP_ADDRESS=(`ifconfig docker0 2>/dev/null|awk '/inet addr:/ {print $2}'|sed 's/addr://'`)
CLOUD=$1
SERVER_COUNT=$2
@@ -19,7 +19,7 @@ VAULTDOWNLOAD=https://releases.hashicorp.com/vault/${VAULTVERSION}/vault_${VAULT
VAULTCONFIGDIR=/etc/vault.d
VAULTDIR=/opt/vault
-NOMADVERSION=0.8.7
+NOMADVERSION=0.9.0
NOMADDOWNLOAD=https://releases.hashicorp.com/nomad/${NOMADVERSION}/nomad_${NOMADVERSION}_linux_amd64.zip
NOMADCONFIGDIR=/etc/nomad.d
NOMADDIR=/opt/nomad
@@ -113,6 +113,25 @@ sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/${di
sudo apt-get update
sudo apt-get install -y docker-ce
if [[ ! -z ${INSTALL_NVIDIA_DOCKER+x} ]]; then
# Install official NVIDIA driver package
sudo apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub
sudo sh -c 'echo "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list'
sudo apt-get update && sudo apt-get install -y --no-install-recommends linux-headers-generic dkms cuda-drivers
# Install nvidia-docker and nvidia-docker-plugin
# from: https://github.com/NVIDIA/nvidia-docker#ubuntu-140416041804-debian-jessiestretch
wget -P /tmp https://github.com/NVIDIA/nvidia-docker/releases/download/v1.0.1/nvidia-docker_1.0.1-1_amd64.deb
sudo dpkg -i /tmp/nvidia-docker*.deb && rm /tmp/nvidia-docker*.deb
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | \
sudo tee /etc/apt/sources.list.d/nvidia-docker.list
sudo apt-get update
sudo apt-get install -y nvidia-docker2
fi
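# A hedged smoke test (not part of the original script): once nvidia-docker2
# is installed and the instance has a GPU, the NVIDIA container runtime can
# be exercised with the check documented by the nvidia-docker project:
#   sudo docker run --runtime=nvidia --rm nvidia/cuda:9.0-base nvidia-smi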
# rkt
VERSION=1.29.0
DOWNLOAD=https://github.com/rkt/rkt/releases/download/v${VERSION}/rkt-v${VERSION}.tar.gz
@@ -155,7 +174,7 @@ JAVA_HOME=$(readlink -f /usr/bin/java | sed "s:bin/java::")
# Spark
sudo wget -P /ops/examples/spark https://s3.amazonaws.com/nomad-spark/spark-2.2.0-bin-nomad-0.7.0.tgz
-sudo tar -xvf /ops/examples/spark/spark-2.2.0-bin-nomad-0.7.0.tgz --directory /ops/examples/spark
+sudo tar -xf /ops/examples/spark/spark-2.2.0-bin-nomad-0.7.0.tgz --directory /ops/examples/spark
sudo mv /ops/examples/spark/spark-2.2.0-bin-nomad-0.7.0 /usr/local/bin/spark
sudo chown -R root:root /usr/local/bin/spark