emr.tf

#resource "aws_emr_cluster" "cluster" {
#  name          = "monkeybox-emr-lab"
#  release_label = "emr-5.36.0"
#  applications  = ["Spark"]
#  log_uri       = "s3://monkeybox-emr-test/logs"
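#
#  # Assumption: the monkeybox-emr-test bucket already exists; EMR writes its
#  # logs there but does not create the bucket itself.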
#
#  ec2_attributes {
#    subnet_id                         = aws_subnet.main.id
#    emr_managed_master_security_group = aws_security_group.allow_access.id
#    emr_managed_slave_security_group  = aws_security_group.allow_access.id
#    instance_profile                  = aws_iam_instance_profile.emr_profile.arn
#  }
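#  # The subnet, security group, and instance profile above are declared
#  # elsewhere in this configuration; a sketch of the IAM pieces follows
#  # this resource block.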
#
#  master_instance_group {
#    name          = "monkeybox-emr-lab-master"
#    bid_price     = "0.07" # Spot bid in USD; don't set this above the on-demand hourly rate!
#    instance_type = "m5.xlarge"
#  }
#
#  core_instance_group {
#    name           = "monkeybox-emr-lab-core"
#    bid_price      = "0.07" # Spot bid in USD; don't set this above the on-demand hourly rate!
#    instance_count = 2
#    instance_type  = "m5.xlarge"
#  }
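#
#  # Note: setting bid_price makes both instance groups request Spot capacity;
#  # drop the attribute to run on-demand instances instead.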
#
#  tags = {
#    project = "monkeybox_emr_lab"
#  }
#
#  bootstrap_action {
#    path = "s3://us-east-2.elasticmapreduce/bootstrap-actions/run-if"
#    name = "runif"
#    args = ["instance.isMaster=true", "echo running on master node"]
#  }
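#
#  # run-if is an AWS-provided bootstrap script that executes its command only
#  # where the condition holds, so the echo above runs on the master node only.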
#
#  auto_termination_policy {
#    idle_timeout = 60 # seconds of idle time before EMR terminates the cluster (60 is the minimum)
#  }
#
#  step {
#    name              = "Install modules"
#    action_on_failure = "CONTINUE"
#
#    hadoop_jar_step {
#      jar  = "command-runner.jar"
#      args = ["sudo", "python3", "-m", "pip", "install", "numpy", "matplotlib", "pandas", "seaborn", "pyspark"]
#    }
#  }
#
#  step {
#    name              = "Copy script file from S3"
#    action_on_failure = "CONTINUE"
#
#    hadoop_jar_step {
#      jar  = "command-runner.jar"
#      args = ["aws", "s3", "cp", "s3://monkeybox-emr-test/health_violations.py", "/home/hadoop/"]
#    }
#  }
#
#  step {
#    name              = "PySpark job"
#    action_on_failure = "CONTINUE"
#
#    hadoop_jar_step {
#      jar  = "command-runner.jar"
#      args = ["sudo", "python3", "/home/hadoop/health_violations.py", "--data_source", "s3://monkeybox-emr-test/food_establishment_data.csv", "--output_uri", "s3://monkeybox-emr-test/output/"]
#    }
#  }
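#
#  # Steps run sequentially in the order declared; command-runner.jar executes
#  # each command on the master node, which is why the modules are installed
#  # before the script is copied and run.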
#
#  # Optional: uncomment to pin Hadoop and Spark to Java 8 via JAVA_HOME.
#  # configurations_json = <<-EOF
#  # [
#  #   {
#  #     "Classification": "hadoop-env",
#  #     "Configurations": [
#  #       {
#  #         "Classification": "export",
#  #         "Properties": {
#  #           "JAVA_HOME": "/usr/lib/jvm/java-1.8.0"
#  #         }
#  #       }
#  #     ],
#  #     "Properties": {}
#  #   },
#  #   {
#  #     "Classification": "spark-env",
#  #     "Configurations": [
#  #       {
#  #         "Classification": "export",
#  #         "Properties": {
#  #           "JAVA_HOME": "/usr/lib/jvm/java-1.8.0"
#  #         }
#  #       }
#  #     ],
#  #     "Properties": {}
#  #   }
#  # ]
#  # EOF
#
#  service_role = aws_iam_role.iam_emr_service_role.arn
#}
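#
# The IAM objects referenced above (aws_iam_instance_profile.emr_profile and
# aws_iam_role.iam_emr_service_role) must be declared elsewhere in this
# configuration. A minimal sketch, assuming the stock AWS-managed EMR policies
# are acceptable (the profile's backing role name is illustrative); kept
# commented out like the rest of this file:
#
#resource "aws_iam_role" "iam_emr_service_role" {
#  name = "iam_emr_service_role"
#
#  assume_role_policy = jsonencode({
#    Version = "2012-10-17"
#    Statement = [{
#      Action    = "sts:AssumeRole"
#      Effect    = "Allow"
#      Principal = { Service = "elasticmapreduce.amazonaws.com" }
#    }]
#  })
#}
#
#resource "aws_iam_role_policy_attachment" "emr_service" {
#  role       = aws_iam_role.iam_emr_service_role.name
#  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonElasticMapReduceRole"
#}
#
#resource "aws_iam_role" "iam_emr_profile_role" {
#  name = "iam_emr_profile_role"
#
#  assume_role_policy = jsonencode({
#    Version = "2012-10-17"
#    Statement = [{
#      Action    = "sts:AssumeRole"
#      Effect    = "Allow"
#      Principal = { Service = "ec2.amazonaws.com" }
#    }]
#  })
#}
#
#resource "aws_iam_role_policy_attachment" "emr_profile" {
#  role       = aws_iam_role.iam_emr_profile_role.name
#  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonElasticMapReduceforEC2Role"
#}
#
#resource "aws_iam_instance_profile" "emr_profile" {
#  name = "emr_profile"
#  role = aws_iam_role.iam_emr_profile_role.name
#}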