Просмотр исходного кода

EMR appears to stay up. Getting closer.

Fred Damstra (Macbook 2015) 2 года назад
Родитель
Сommit
6a33778245
3 изменённых файла, 23 добавления и 4 удаления
  1. 3 0
      config.tf
  2. 12 4
      emr.tf
  3. 8 0
      notes.md

+ 3 - 0
config.tf

@@ -2,6 +2,9 @@ locals {
   # unique id is used for terraform backend state storage. Duplicates _will_ be a problem.
   unique_id = "monkeybox_emr_lab_jupyter"
 
+  # an ssh key pair that must already exist in EC2
+  key_pair = "Fred-IO"
+
   # Everything here should be self-explanatory
   profile = "default"
   region  = "us-east-2"

+ 12 - 4
emr.tf

@@ -1,17 +1,25 @@
+# Note: Instance Types cannot be 'amd epyc', e.g. m5a.* and r5a.*
+# Note: Instance Types cannot be 'graviton', e.g. m6g.*
+
 resource "aws_emr_cluster" "cluster" {
   name          = local.unique_id
   release_label = "emr-6.8.0" # aws emr list-release-labels
-  applications  = ["Spark", "Hadoop", "JupyterEnterpriseGateway", "JupyterHub"]
-  log_uri       = "s3://${aws_s3_bucket.s3_emr.id}/logs/"
+  # per https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-managed-notebooks-considerations.html,
+  # these are the 4 applications that are needed
+  applications = ["Spark", "Hadoop", "Livy", "JupyterEnterpriseGateway", ]
+  # Maybe add : "JupyterHub"]
+
+  log_uri = "s3://${aws_s3_bucket.s3_emr.id}/logs/"
 
   termination_protection            = false # may want 'true' for production
-  keep_job_flow_alive_when_no_steps = false # If true, workers stay active. If false, HDFS is lost.
+  keep_job_flow_alive_when_no_steps = true  # If true, cluster sticks around when nothing to do
 
   ec2_attributes {
     subnet_id                         = aws_subnet.main.id
     emr_managed_master_security_group = aws_security_group.allow_access.id
     emr_managed_slave_security_group  = aws_security_group.allow_access.id
     instance_profile                  = aws_iam_instance_profile.emr_profile.arn
+    key_name                          = local.key_pair
   }
 
   master_instance_fleet {
@@ -155,7 +163,7 @@ resource "aws_emr_cluster" "cluster" {
 resource "aws_emr_managed_scaling_policy" "cluster-asg-policy" {
   cluster_id = aws_emr_cluster.cluster.id
   compute_limits {
-    unit_type                       = "Instances"
+    unit_type                       = "InstanceFleetUnits"
     minimum_capacity_units          = 1
     maximum_capacity_units          = 10
     maximum_ondemand_capacity_units = 2

+ 8 - 0
notes.md

@@ -0,0 +1,8 @@
+Jupyter Notebook and Python versions
+
+EMR Notebooks runs Jupyter Notebook version 6.0.2
+and Python 3.6.5 regardless of the Amazon EMR release version of the attached cluster.
+
+[source](https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-managed-notebooks-considerations.html)
+
+