# Note: Instance Types cannot be 'amd epyc', e.g. m5a.* and r5a.*
# Note: Instance Types cannot be 'graviton', e.g. m6g.*
resource "aws_emr_cluster" "cluster" {
  name          = local.unique_id
  release_label = "emr-6.8.0" # see: aws emr list-release-labels
  # Per https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-managed-notebooks-considerations.html,
  # these are the four applications that are needed
  applications = ["Spark", "Hadoop", "Livy", "JupyterEnterpriseGateway"]
  # Maybe also add "JupyterHub"

  log_uri                           = "s3://${aws_s3_bucket.s3_emr.id}/logs/"
  termination_protection            = false # may want 'true' for production
  keep_job_flow_alive_when_no_steps = true  # If true, the cluster sticks around when there is nothing to do

  ec2_attributes {
    subnet_id                         = aws_subnet.main.id
    emr_managed_master_security_group = aws_security_group.allow_access.id
    emr_managed_slave_security_group  = aws_security_group.allow_access.id
    instance_profile                  = aws_iam_instance_profile.emr_profile.arn
    key_name                          = local.key_pair
  }

  master_instance_fleet {
    name                      = "${local.unique_id} Master"
    target_on_demand_capacity = 0
    target_spot_capacity      = 1

    instance_type_configs {
      instance_type                              = "m5.2xlarge"
      bid_price_as_percentage_of_on_demand_price = 50
    }

    instance_type_configs {
      instance_type                              = "m5.xlarge"
      bid_price_as_percentage_of_on_demand_price = 70
    }

    launch_specifications {
      on_demand_specification {
        allocation_strategy = "lowest-price" # Only option
      }
      spot_specification {
        allocation_strategy = "capacity-optimized"
        #block_duration_minutes = 0
        timeout_action           = "TERMINATE_CLUSTER"
        timeout_duration_minutes = 15 # Give up on spot pricing after 15 minutes
      }
    }
  }

  core_instance_fleet {
    name                      = "${local.unique_id} Core"
    target_on_demand_capacity = 0
    target_spot_capacity      = 2

    instance_type_configs {
      instance_type                              = "m5.2xlarge"
      weighted_capacity                          = 1
      bid_price_as_percentage_of_on_demand_price = 50

      ebs_config {
        size                 = 60
        type                 = "gp3"
        volumes_per_instance = 1
      }
    }

    instance_type_configs {
      instance_type                              = "m5.xlarge"
      weighted_capacity                          = 2
      bid_price_as_percentage_of_on_demand_price = 70

      ebs_config {
        size                 = 60
        type                 = "gp3"
        volumes_per_instance = 1
      }
    }

    launch_specifications {
      on_demand_specification {
        allocation_strategy = "lowest-price" # Only option
      }
      spot_specification {
        allocation_strategy = "capacity-optimized"
        #block_duration_minutes = 0
        timeout_action           = "SWITCH_TO_ON_DEMAND"
        timeout_duration_minutes = 15 # Wait up to 15 minutes for a spot instance
      }
    }
  }

  tags         = local.tags
  service_role = aws_iam_role.iam_emr_service_role.arn
}
# Example bootstrap action kept for reference (would go inside the aws_emr_cluster resource):
# bootstrap_action {
#   path = "s3://us-east-2.elasticmapreduce/bootstrap-actions/run-if"
#   name = "runif"
#   args = ["instance.isMaster=true", "echo running on master node"]
# }
# We want a long-running cluster
# auto_termination_policy {
#   idle_timeout = 60
# }
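# If the cluster should eventually shut itself down, an auto_termination_policy block could be
# enabled inside the aws_emr_cluster resource. A minimal sketch, assuming a one-hour idle window
# is acceptable -- note that idle_timeout is expressed in seconds, with a minimum of 60:
# auto_termination_policy {
#   idle_timeout = 3600 # terminate after one idle hour
# }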
# step {
#   name              = "Install modules"
#   action_on_failure = "CONTINUE"
#
#   hadoop_jar_step {
#     jar  = "command-runner.jar"
#     args = ["sudo", "python3", "-m", "pip", "install", "numpy", "matplotlib", "pandas", "seaborn", "pyspark"]
#   }
# }
#
# step {
#   name              = "Copy script file from s3."
#   action_on_failure = "CONTINUE"
#
#   hadoop_jar_step {
#     jar  = "command-runner.jar"
#     args = ["aws", "s3", "cp", "s3://monkeybox-emr-test/health_violations.py", "/home/hadoop/"]
#   }
# }
#
# step {
#   name              = "Pyspark Job."
#   action_on_failure = "CONTINUE"
#
#   hadoop_jar_step {
#     jar  = "command-runner.jar"
#     args = ["sudo", "python3", "/home/hadoop/health_violations.py", "--data_source", "s3://monkeybox-emr-test/food_establishment_data.csv", "--output_uri", "s3://monkeybox-emr-test/output/"]
#   }
# }
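# The "Install modules" step above runs only on the primary node, so worker nodes would not get
# the Python packages. A bootstrap_action (declared inside the aws_emr_cluster resource) runs on
# every node at launch instead. A rough sketch -- the install_python_libs.sh object is
# hypothetical and would need to be uploaded to the EMR bucket with the pip install commands in it:
# bootstrap_action {
#   name = "Install Python libraries on all nodes"
#   path = "s3://${aws_s3_bucket.s3_emr.id}/bootstrap/install_python_libs.sh"
# }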
# configurations_json = <<EOF
# [
#   {
#     "Classification": "hadoop-env",
#     "Configurations": [
#       {
#         "Classification": "export",
#         "Properties": {
#           "JAVA_HOME": "/usr/lib/jvm/java-1.8.0"
#         }
#       }
#     ],
#     "Properties": {}
#   },
#   {
#     "Classification": "spark-env",
#     "Configurations": [
#       {
#         "Classification": "export",
#         "Properties": {
#           "JAVA_HOME": "/usr/lib/jvm/java-1.8.0"
#         }
#       }
#     ],
#     "Properties": {}
#   }
# ]
# EOF
resource "aws_emr_managed_scaling_policy" "cluster-asg-policy" {
  cluster_id = aws_emr_cluster.cluster.id

  compute_limits {
    unit_type                       = "InstanceFleetUnits"
    minimum_capacity_units          = 1
    maximum_capacity_units          = 10
    maximum_ondemand_capacity_units = 2
    maximum_core_capacity_units     = 10
  }
}
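
# For convenience when connecting to the cluster (SSH, pointing notebooks at it), the cluster id
# and the primary node's DNS name can be exported. A small sketch using attributes the
# aws_emr_cluster resource exposes:
output "emr_cluster_id" {
  description = "Id of the EMR cluster"
  value       = aws_emr_cluster.cluster.id
}

output "emr_master_public_dns" {
  description = "Public DNS name of the EMR primary node"
  value       = aws_emr_cluster.cluster.master_public_dns
}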