# Note: instance types cannot be AMD EPYC, e.g. m5a.* and r5a.*
# Note: instance types cannot be Graviton, e.g. m6g.*
resource "aws_emr_cluster" "cluster" {
  name          = local.unique_id
  release_label = "emr-6.8.0" # aws emr list-release-labels

  # Per https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-managed-notebooks-considerations.html,
  # these are the 4 applications that are needed.
  applications = ["Spark", "Hadoop", "Livy", "JupyterEnterpriseGateway"] # Maybe add "JupyterHub"

  log_uri                           = "s3://${aws_s3_bucket.s3_emr.id}/logs/"
  termination_protection            = false # may want 'true' for production
  keep_job_flow_alive_when_no_steps = true  # If true, the cluster stays up when it has no steps to run

  ec2_attributes {
    subnet_id                         = aws_subnet.main.id
    emr_managed_master_security_group = aws_security_group.allow_access.id
    emr_managed_slave_security_group  = aws_security_group.allow_access.id
    instance_profile                  = aws_iam_instance_profile.emr_profile.arn
    key_name                          = local.key_pair
  }

  master_instance_fleet {
    name                      = "${local.unique_id} Master"
    target_on_demand_capacity = 0
    target_spot_capacity      = 1

    instance_type_configs {
      instance_type                              = "m5.2xlarge"
      bid_price_as_percentage_of_on_demand_price = 50
    }

    instance_type_configs {
      instance_type                              = "m5.xlarge"
      bid_price_as_percentage_of_on_demand_price = 70
    }

    launch_specifications {
      on_demand_specification {
        allocation_strategy = "lowest-price" # Only option
      }

      spot_specification {
        allocation_strategy = "capacity-optimized"
        #block_duration_minutes = 0
        timeout_action           = "TERMINATE_CLUSTER"
        timeout_duration_minutes = 15 # Give up on spot pricing after 15 minutes
      }
    }
  }

  core_instance_fleet {
    name                      = "${local.unique_id} Core"
    target_on_demand_capacity = 0
    target_spot_capacity      = 2

    instance_type_configs {
      bid_price_as_percentage_of_on_demand_price = 50

      ebs_config {
        size                 = 60
        type                 = "gp3"
        volumes_per_instance = 1
      }

      instance_type     = "m5.2xlarge"
      weighted_capacity = 1
    }

    instance_type_configs {
      bid_price_as_percentage_of_on_demand_price = 70

      ebs_config {
        size                 = 60
        type                 = "gp3"
        volumes_per_instance = 1
      }

      instance_type     = "m5.xlarge"
      weighted_capacity = 2
    }

    launch_specifications {
      on_demand_specification {
        allocation_strategy = "lowest-price" # Only option
      }

      spot_specification {
        allocation_strategy = "capacity-optimized"
        #block_duration_minutes = 0
        timeout_action           = "SWITCH_TO_ON_DEMAND"
        timeout_duration_minutes = 15 # Wait up to 15 minutes for spot capacity, then fall back to on-demand
      }
    }
  }

  tags = local.tags

  service_role = aws_iam_role.iam_emr_service_role.arn
}

# The commented-out blocks below are aws_emr_cluster arguments kept for
# reference; if enabled, they belong inside the resource block above.

# bootstrap_action {
#   path = "s3://us-east-2.elasticmapreduce/bootstrap-actions/run-if"
#   name = "runif"
#   args = ["instance.isMaster=true", "echo running on master node"]
# }

# We want a long-running cluster, so auto-termination stays disabled.
# auto_termination_policy {
#   idle_timeout = 60
# }

# step {
#   name              = "Install modules"
#   action_on_failure = "CONTINUE"
#
#   hadoop_jar_step {
#     jar  = "command-runner.jar"
#     args = ["sudo", "python3", "-m", "pip", "install", "numpy", "matplotlib", "pandas", "seaborn", "pyspark"]
#   }
# }
#
# step {
#   name              = "Copy script file from S3."
#   action_on_failure = "CONTINUE"
#
#   hadoop_jar_step {
#     jar  = "command-runner.jar"
#     args = ["aws", "s3", "cp", "s3://monkeybox-emr-test/health_violations.py", "/home/hadoop/"]
#   }
# }
#
# step {
#   name              = "PySpark job."
#   action_on_failure = "CONTINUE"
#
#   hadoop_jar_step {
#     jar  = "command-runner.jar"
#     args = ["sudo", "python3", "/home/hadoop/health_violations.py", "--data_source", "s3://monkeybox-emr-test/food_establishment_data.csv", "--output_uri", "s3://monkeybox-emr-test/output/"]
#   }
# }

# configurations_json = <
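# The configurations_json line above was truncated in the source and its
# heredoc body was not recovered. As a minimal, hypothetical sketch of what
# such a block typically looks like, the classification and property below
# are illustrative assumptions, not recovered content:
#
# configurations_json = <<EOF
# [
#   {
#     "Classification": "spark-defaults",
#     "Properties": {
#       "spark.dynamicAllocation.enabled": "true"
#     }
#   }
# ]
# EOF
#
# Like the other commented-out arguments above, this belongs inside the
# aws_emr_cluster resource block if enabled.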