# Note: instance types cannot be AMD EPYC (e.g., m5a.*, r5a.*)
# Note: instance types cannot be Graviton/ARM (e.g., m6g.*)
resource "aws_emr_cluster" "cluster" {
  name          = local.unique_id
  release_label = "emr-6.8.0" # see `aws emr list-release-labels`
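  # A hedged example of that CLI call for checking which release labels are
  # available; the region value is illustrative, matching the us-east-2
  # references elsewhere in this file:
  #   aws emr list-release-labels --region us-east-2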
  # Per https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-managed-notebooks-considerations.html,
  # these are the four applications required for EMR managed notebooks.
  applications = ["Spark", "Hadoop", "Livy", "JupyterEnterpriseGateway"]
  # Possibly add "JupyterHub" as well.
  log_uri                           = "s3://${aws_s3_bucket.s3_emr.id}/logs/"
  termination_protection            = false # consider 'true' for production
  keep_job_flow_alive_when_no_steps = true  # if true, the cluster keeps running when it has no steps to run
  ec2_attributes {
    subnet_id                         = aws_subnet.main.id
    emr_managed_master_security_group = aws_security_group.allow_access.id
    emr_managed_slave_security_group  = aws_security_group.allow_access.id
    instance_profile                  = aws_iam_instance_profile.emr_profile.arn
    key_name                          = local.key_pair
  }
  master_instance_fleet {
    name                      = "${local.unique_id} Master"
    target_on_demand_capacity = 0
    target_spot_capacity      = 1
    instance_type_configs {
      instance_type                              = "m5.2xlarge"
      bid_price_as_percentage_of_on_demand_price = 50
    }
    instance_type_configs {
      instance_type                              = "m5.xlarge"
      bid_price_as_percentage_of_on_demand_price = 70
    }
    launch_specifications {
      on_demand_specification {
        allocation_strategy = "lowest-price" # the only supported value
      }
      spot_specification {
        allocation_strategy = "capacity-optimized"
        #block_duration_minutes = 0
        timeout_action           = "TERMINATE_CLUSTER"
        timeout_duration_minutes = 15 # give up on Spot after 15 minutes and terminate the cluster
      }
    }
  }
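  # Bid-price arithmetic, for illustration only (the on-demand price below is
  # a hypothetical figure, not a quote): if m5.2xlarge costs $0.384/hr on
  # demand, bid_price_as_percentage_of_on_demand_price = 50 caps the Spot bid
  # at $0.192/hr.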
  core_instance_fleet {
    name                      = "${local.unique_id} Core"
    target_on_demand_capacity = 0
    target_spot_capacity      = 2
    instance_type_configs {
      bid_price_as_percentage_of_on_demand_price = 50
      ebs_config {
        size                 = 60
        type                 = "gp3"
        volumes_per_instance = 1
      }
      instance_type     = "m5.2xlarge"
      weighted_capacity = 1
    }
    instance_type_configs {
      bid_price_as_percentage_of_on_demand_price = 70
      ebs_config {
        size                 = 60
        type                 = "gp3"
        volumes_per_instance = 1
      }
      instance_type     = "m5.xlarge"
      weighted_capacity = 2
    }
    launch_specifications {
      on_demand_specification {
        allocation_strategy = "lowest-price" # the only supported value
      }
      spot_specification {
        allocation_strategy = "capacity-optimized"
        #block_duration_minutes = 0
        timeout_action           = "SWITCH_TO_ON_DEMAND"
        timeout_duration_minutes = 15 # wait up to 15 minutes for Spot, then fall back to On-Demand
      }
    }
  }
  tags         = local.tags
  service_role = aws_iam_role.iam_emr_service_role.arn
}
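# A minimal sketch of surfacing the master node's address for SSH or notebook
# debugging, using the cluster's exported master_public_dns attribute:
output "emr_master_dns" {
  value = aws_emr_cluster.cluster.master_public_dns
}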
# The commented-out blocks below are cluster arguments; if re-enabled, they
# belong inside the aws_emr_cluster resource above.
# bootstrap_action {
#   path = "s3://us-east-2.elasticmapreduce/bootstrap-actions/run-if"
#   name = "runif"
#   args = ["instance.isMaster=true", "echo running on master node"]
# }
# We want a long-running cluster, so no auto-termination policy:
#auto_termination_policy {
#  idle_timeout = 60
#}
# step {
#   name              = "Install modules"
#   action_on_failure = "CONTINUE"
#
#   hadoop_jar_step {
#     jar  = "command-runner.jar"
#     args = ["sudo", "python3", "-m", "pip", "install", "numpy", "matplotlib", "pandas", "seaborn", "pyspark"]
#   }
# }
#
# step {
#   name              = "Copy script file from S3"
#   action_on_failure = "CONTINUE"
#
#   hadoop_jar_step {
#     jar  = "command-runner.jar"
#     args = ["aws", "s3", "cp", "s3://monkeybox-emr-test/health_violations.py", "/home/hadoop/"]
#   }
# }
#
# step {
#   name              = "PySpark job"
#   action_on_failure = "CONTINUE"
#
#   hadoop_jar_step {
#     jar  = "command-runner.jar"
#     args = ["sudo", "python3", "/home/hadoop/health_violations.py", "--data_source", "s3://monkeybox-emr-test/food_establishment_data.csv", "--output_uri", "s3://monkeybox-emr-test/output/"]
#   }
# }
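# Steps like the ones sketched above can also be submitted to the running
# cluster from the CLI rather than baked into the resource; a hedged example
# (the cluster id is a placeholder):
#   aws emr add-steps --cluster-id j-XXXXXXXXXXXXX \
#     --steps Type=CUSTOM_JAR,Name="Install modules",ActionOnFailure=CONTINUE,Jar=command-runner.jar,Args="sudo,python3,-m,pip,install,numpy"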
# configurations_json = <<EOF
# [
#   {
#     "Classification": "hadoop-env",
#     "Configurations": [
#       {
#         "Classification": "export",
#         "Properties": {
#           "JAVA_HOME": "/usr/lib/jvm/java-1.8.0"
#         }
#       }
#     ],
#     "Properties": {}
#   },
#   {
#     "Classification": "spark-env",
#     "Configurations": [
#       {
#         "Classification": "export",
#         "Properties": {
#           "JAVA_HOME": "/usr/lib/jvm/java-1.8.0"
#         }
#       }
#     ],
#     "Properties": {}
#   }
# ]
# EOF
- resource "aws_emr_managed_scaling_policy" "cluster-asg-policy" {
- cluster_id = aws_emr_cluster.cluster.id
- compute_limits {
- unit_type = "InstanceFleetUnits"
- minimum_capacity_units = 1
- maximum_capacity_units = 10
- maximum_ondemand_capacity_units = 2
- maximum_core_capacity_units = 10
- }
- }
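# How InstanceFleetUnits interact with the fleet weights above, worked through
# under this configuration's own numbers: each m5.2xlarge counts as 1 unit
# (weighted_capacity = 1) and each m5.xlarge as 2 units (weighted_capacity = 2),
# so maximum_capacity_units = 10 allows up to ten m5.2xlarge, five m5.xlarge,
# or any mix whose weights sum to 10.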