# Note: instance types cannot be AMD EPYC (e.g., m5a.*, r5a.*)
# Note: instance types cannot be Graviton/ARM (e.g., m6g.*)
resource "aws_emr_cluster" "cluster" {
  name          = local.unique_id
  release_label = "emr-6.8.0" # see `aws emr list-release-labels`
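  # A hedged example of that CLI call for checking which release labels are
  # available; the region value is illustrative, matching the us-east-2
  # references elsewhere in this file:
  #   aws emr list-release-labels --region us-east-2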
  # Per https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-managed-notebooks-considerations.html,
  # these are the four applications required for EMR managed notebooks.
  applications = ["Spark", "Hadoop", "Livy", "JupyterEnterpriseGateway"]
  # Possibly add "JupyterHub" as well.
  log_uri                           = "s3://${aws_s3_bucket.s3_emr.id}/logs/"
  termination_protection            = false # consider 'true' for production
  keep_job_flow_alive_when_no_steps = true  # if true, the cluster keeps running when it has no steps to run
  ec2_attributes {
    subnet_id                         = aws_subnet.main.id
    emr_managed_master_security_group = aws_security_group.allow_access.id
    emr_managed_slave_security_group  = aws_security_group.allow_access.id
    instance_profile                  = aws_iam_instance_profile.emr_profile.arn
    key_name                          = local.key_pair
  }
  master_instance_fleet {
    name                      = "${local.unique_id} Master"
    target_on_demand_capacity = 0
    target_spot_capacity      = 1
    instance_type_configs {
      instance_type                              = "m5.2xlarge"
      bid_price_as_percentage_of_on_demand_price = 50
    }
    instance_type_configs {
      instance_type                              = "m5.xlarge"
      bid_price_as_percentage_of_on_demand_price = 70
    }
    launch_specifications {
      on_demand_specification {
        allocation_strategy = "lowest-price" # the only supported value
      }
      spot_specification {
        allocation_strategy = "capacity-optimized"
        #block_duration_minutes = 0
        timeout_action           = "TERMINATE_CLUSTER"
        timeout_duration_minutes = 15 # give up on Spot after 15 minutes and terminate the cluster
      }
    }
  }
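  # Bid-price arithmetic, for illustration only (the on-demand price below is
  # a hypothetical figure, not a quote): if m5.2xlarge costs $0.384/hr on
  # demand, bid_price_as_percentage_of_on_demand_price = 50 caps the Spot bid
  # at $0.192/hr.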
  core_instance_fleet {
    name                      = "${local.unique_id} Core"
    target_on_demand_capacity = 0
    target_spot_capacity      = 2
    instance_type_configs {
      bid_price_as_percentage_of_on_demand_price = 50
      ebs_config {
        size                 = 60
        type                 = "gp3"
        volumes_per_instance = 1
      }
      instance_type     = "m5.2xlarge"
      weighted_capacity = 1
    }
    instance_type_configs {
      bid_price_as_percentage_of_on_demand_price = 70
      ebs_config {
        size                 = 60
        type                 = "gp3"
        volumes_per_instance = 1
      }
      instance_type     = "m5.xlarge"
      weighted_capacity = 2
    }
    launch_specifications {
      on_demand_specification {
        allocation_strategy = "lowest-price" # the only supported value
      }
      spot_specification {
        allocation_strategy = "capacity-optimized"
        #block_duration_minutes = 0
        timeout_action           = "SWITCH_TO_ON_DEMAND"
        timeout_duration_minutes = 15 # wait up to 15 minutes for Spot, then fall back to On-Demand
      }
    }
  }
  tags         = local.tags
  service_role = aws_iam_role.iam_emr_service_role.arn
}
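# A minimal sketch of surfacing the master node's address for SSH or notebook
# debugging, using the cluster's exported master_public_dns attribute:
output "emr_master_dns" {
  value = aws_emr_cluster.cluster.master_public_dns
}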
# The commented-out blocks below are cluster arguments; if re-enabled, they
# belong inside the aws_emr_cluster resource above.
# bootstrap_action {
#   path = "s3://us-east-2.elasticmapreduce/bootstrap-actions/run-if"
#   name = "runif"
#   args = ["instance.isMaster=true", "echo running on master node"]
# }
# We want a long-running cluster, so no auto-termination policy:
#auto_termination_policy {
#  idle_timeout = 60
#}
# step {
#   name              = "Install modules"
#   action_on_failure = "CONTINUE"
#
#   hadoop_jar_step {
#     jar  = "command-runner.jar"
#     args = ["sudo", "python3", "-m", "pip", "install", "numpy", "matplotlib", "pandas", "seaborn", "pyspark"]
#   }
# }
#
# step {
#   name              = "Copy script file from S3"
#   action_on_failure = "CONTINUE"
#
#   hadoop_jar_step {
#     jar  = "command-runner.jar"
#     args = ["aws", "s3", "cp", "s3://monkeybox-emr-test/health_violations.py", "/home/hadoop/"]
#   }
# }
#
# step {
#   name              = "PySpark job"
#   action_on_failure = "CONTINUE"
#
#   hadoop_jar_step {
#     jar  = "command-runner.jar"
#     args = ["sudo", "python3", "/home/hadoop/health_violations.py", "--data_source", "s3://monkeybox-emr-test/food_establishment_data.csv", "--output_uri", "s3://monkeybox-emr-test/output/"]
#   }
# }
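# Steps like the ones sketched above can also be submitted to the running
# cluster from the CLI rather than baked into the resource; a hedged example
# (the cluster id is a placeholder):
#   aws emr add-steps --cluster-id j-XXXXXXXXXXXXX \
#     --steps Type=CUSTOM_JAR,Name="Install modules",ActionOnFailure=CONTINUE,Jar=command-runner.jar,Args="sudo,python3,-m,pip,install,numpy"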
# configurations_json = <<EOF
# [
#   {
#     "Classification": "hadoop-env",
#     "Configurations": [
#       {
#         "Classification": "export",
#         "Properties": {
#           "JAVA_HOME": "/usr/lib/jvm/java-1.8.0"
#         }
#       }
#     ],
#     "Properties": {}
#   },
#   {
#     "Classification": "spark-env",
#     "Configurations": [
#       {
#         "Classification": "export",
#         "Properties": {
#           "JAVA_HOME": "/usr/lib/jvm/java-1.8.0"
#         }
#       }
#     ],
#     "Properties": {}
#   }
# ]
# EOF
- resource "aws_emr_managed_scaling_policy" "cluster-asg-policy" {
- cluster_id = aws_emr_cluster.cluster.id
- compute_limits {
- unit_type = "InstanceFleetUnits"
- minimum_capacity_units = 1
- maximum_capacity_units = 10
- maximum_ondemand_capacity_units = 2
- maximum_core_capacity_units = 10
- }
- }
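# How InstanceFleetUnits interact with the fleet weights above, worked through
# under this configuration's own numbers: each m5.2xlarge counts as 1 unit
# (weighted_capacity = 1) and each m5.xlarge as 2 units (weighted_capacity = 2),
# so maximum_capacity_units = 10 allows up to ten m5.2xlarge, five m5.xlarge,
# or any mix whose weights sum to 10.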