emr.tf

#resource "aws_emr_cluster" "cluster" {
#  name          = "monkeybox-emr-lab"
#  release_label = "emr-5.36.0"
#  applications  = ["Spark"]
#  log_uri       = "s3://monkeybox-emr-test/logs"
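#
#  # Assumption: the monkeybox-emr-test bucket already exists; EMR writes its
#  # logs there but does not create the bucket itself.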
#
#  ec2_attributes {
#    subnet_id                         = aws_subnet.main.id
#    emr_managed_master_security_group = aws_security_group.allow_access.id
#    emr_managed_slave_security_group  = aws_security_group.allow_access.id
#    instance_profile                  = aws_iam_instance_profile.emr_profile.arn
#  }
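#  # The subnet, security group, and instance profile above are declared
#  # elsewhere in this configuration; a sketch of the IAM pieces follows
#  # this resource block.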
#
#  master_instance_group {
#    name          = "monkeybox-emr-lab-master"
#    bid_price     = "0.07" # Spot bid in USD; don't set this above the on-demand hourly rate!
#    instance_type = "m5.xlarge"
#  }
#
#  core_instance_group {
#    name           = "monkeybox-emr-lab-core"
#    bid_price      = "0.07" # Spot bid in USD; don't set this above the on-demand hourly rate!
#    instance_count = 2
#    instance_type  = "m5.xlarge"
#  }
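#
#  # Note: setting bid_price makes both instance groups request Spot capacity;
#  # drop the attribute to run on-demand instances instead.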
#
#  tags = {
#    project = "monkeybox_emr_lab"
#  }
#
#  bootstrap_action {
#    path = "s3://us-east-2.elasticmapreduce/bootstrap-actions/run-if"
#    name = "runif"
#    args = ["instance.isMaster=true", "echo running on master node"]
#  }
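#
#  # run-if is an AWS-provided bootstrap script that executes its command only
#  # where the condition holds, so the echo above runs on the master node only.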
#
#  auto_termination_policy {
#    idle_timeout = 60 # seconds of idle time before EMR terminates the cluster (60 is the minimum)
#  }
#
#  step {
#    name              = "Install modules"
#    action_on_failure = "CONTINUE"
#
#    hadoop_jar_step {
#      jar  = "command-runner.jar"
#      args = ["sudo", "python3", "-m", "pip", "install", "numpy", "matplotlib", "pandas", "seaborn", "pyspark"]
#    }
#  }
#
#  step {
#    name              = "Copy script file from S3"
#    action_on_failure = "CONTINUE"
#
#    hadoop_jar_step {
#      jar  = "command-runner.jar"
#      args = ["aws", "s3", "cp", "s3://monkeybox-emr-test/health_violations.py", "/home/hadoop/"]
#    }
#  }
#
#  step {
#    name              = "PySpark job"
#    action_on_failure = "CONTINUE"
#
#    hadoop_jar_step {
#      jar  = "command-runner.jar"
#      args = ["sudo", "python3", "/home/hadoop/health_violations.py", "--data_source", "s3://monkeybox-emr-test/food_establishment_data.csv", "--output_uri", "s3://monkeybox-emr-test/output/"]
#    }
#  }
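#
#  # Steps run sequentially in the order declared; command-runner.jar executes
#  # each command on the master node, which is why the modules are installed
#  # before the script is copied and run.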
#
#  # Optional: uncomment to pin Hadoop and Spark to Java 8 via JAVA_HOME.
#  # configurations_json = <<-EOF
#  # [
#  #   {
#  #     "Classification": "hadoop-env",
#  #     "Configurations": [
#  #       {
#  #         "Classification": "export",
#  #         "Properties": {
#  #           "JAVA_HOME": "/usr/lib/jvm/java-1.8.0"
#  #         }
#  #       }
#  #     ],
#  #     "Properties": {}
#  #   },
#  #   {
#  #     "Classification": "spark-env",
#  #     "Configurations": [
#  #       {
#  #         "Classification": "export",
#  #         "Properties": {
#  #           "JAVA_HOME": "/usr/lib/jvm/java-1.8.0"
#  #         }
#  #       }
#  #     ],
#  #     "Properties": {}
#  #   }
#  # ]
#  # EOF
#
#  service_role = aws_iam_role.iam_emr_service_role.arn
#}
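#
# The IAM objects referenced above (aws_iam_instance_profile.emr_profile and
# aws_iam_role.iam_emr_service_role) must be declared elsewhere in this
# configuration. A minimal sketch, assuming the stock AWS-managed EMR policies
# are acceptable (the profile's backing role name is illustrative); kept
# commented out like the rest of this file:
#
#resource "aws_iam_role" "iam_emr_service_role" {
#  name = "iam_emr_service_role"
#
#  assume_role_policy = jsonencode({
#    Version = "2012-10-17"
#    Statement = [{
#      Action    = "sts:AssumeRole"
#      Effect    = "Allow"
#      Principal = { Service = "elasticmapreduce.amazonaws.com" }
#    }]
#  })
#}
#
#resource "aws_iam_role_policy_attachment" "emr_service" {
#  role       = aws_iam_role.iam_emr_service_role.name
#  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonElasticMapReduceRole"
#}
#
#resource "aws_iam_role" "iam_emr_profile_role" {
#  name = "iam_emr_profile_role"
#
#  assume_role_policy = jsonencode({
#    Version = "2012-10-17"
#    Statement = [{
#      Action    = "sts:AssumeRole"
#      Effect    = "Allow"
#      Principal = { Service = "ec2.amazonaws.com" }
#    }]
#  })
#}
#
#resource "aws_iam_role_policy_attachment" "emr_profile" {
#  role       = aws_iam_role.iam_emr_profile_role.name
#  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonElasticMapReduceforEC2Role"
#}
#
#resource "aws_iam_instance_profile" "emr_profile" {
#  name = "emr_profile"
#  role = aws_iam_role.iam_emr_profile_role.name
#}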