# Terraform Splunk ASG Notes.md

5/28/2020

These are the notes for the migration to the indexer ASG.

---

module.moose_cluster.module.indexer_cluster.module.indexer2.aws_launch_configuration.splunk_indexer
module.moose_cluster.module.indexer_cluster.module.indexer2.aws_autoscaling_group.splunk_indexer_asg

```
terraform destroy \
  -target=module.moose_cluster.module.indexer_cluster.module.indexer1.aws_launch_configuration.splunk_indexer \
  -target=module.moose_cluster.module.indexer_cluster.module.indexer1.aws_autoscaling_group.splunk_indexer_asg \
  -target=module.moose_cluster.module.indexer_cluster.module.indexer2.aws_launch_configuration.splunk_indexer \
  -target=module.moose_cluster.module.indexer_cluster.module.indexer2.aws_autoscaling_group.splunk_indexer_asg \
  -target=module.moose_cluster.module.indexer_cluster.module.indexer0.aws_launch_configuration.splunk_indexer \
  -target=module.moose_cluster.module.indexer_cluster.module.indexer0.aws_autoscaling_group.splunk_indexer_asg
```

```
terraform destroy \
  -target=module.moose_cluster.module.indexer_cluster.module.indexer1.aws_launch_template.splunk_indexer \
  -target=module.moose_cluster.module.indexer_cluster.module.indexer1.aws_autoscaling_group.splunk_indexer_asg \
  -target=module.moose_cluster.module.indexer_cluster.module.indexer2.aws_launch_template.splunk_indexer \
  -target=module.moose_cluster.module.indexer_cluster.module.indexer2.aws_autoscaling_group.splunk_indexer_asg \
  -target=module.moose_cluster.module.indexer_cluster.module.indexer0.aws_launch_template.splunk_indexer \
  -target=module.moose_cluster.module.indexer_cluster.module.indexer0.aws_autoscaling_group.splunk_indexer_asg
```

Current moose subnet: subnet-07312c554fb87e4b5 (main-infrastructure-public-us-east-1c)
ASG subnet: subnet-0b1e9d82bcd8c0a2c (main-infrastructure-public-us-east-1a)

```
resource "aws_launch_configuration" "splunk_indexer" {
  name                        = "${var.launch_conf_name}"
  instance_type               = "${var.idx_instance_type}"
  image_id                    = "${var.ami}"
  user_data                   = "${var.user_data}"
  security_groups             = ["${var.indexer_security_group_ids}"]
  associate_public_ip_address = false
  key_name                    = "${var.key_name}"
  iam_instance_profile        = "${var.iam_instance_profile}"
  root_block_device           = "${var.root_block_device}"
  ebs_block_device            = "${local.ebs_block_device}"
  ebs_optimized               = true

  ephemeral_block_device = [
    { device_name = "xvdaa", virtual_name = "ephemeral0" },
    { device_name = "xvdab", virtual_name = "ephemeral1" },
    { device_name = "xvdac", virtual_name = "ephemeral2" },
    { device_name = "xvdad", virtual_name = "ephemeral3" },
    { device_name = "xvdae", virtual_name = "ephemeral4" },
    { device_name = "xvdaf", virtual_name = "ephemeral5" },
    { device_name = "xvdag", virtual_name = "ephemeral6" },
    { device_name = "xvdah", virtual_name = "ephemeral7" },
    { device_name = "xvdai", virtual_name = "ephemeral8" },
    { device_name = "xvdaj", virtual_name = "ephemeral9" },
    { device_name = "xvdak", virtual_name = "ephemeral10" },
    { device_name = "xvdal", virtual_name = "ephemeral11" },
    { device_name = "xvdam", virtual_name = "ephemeral12" },
    { device_name = "xvdan", virtual_name = "ephemeral13" },
    { device_name = "xvdao", virtual_name = "ephemeral14" },
    { device_name = "xvdap", virtual_name = "ephemeral15" },
    { device_name = "xvdaq", virtual_name = "ephemeral16" },
    { device_name = "xvdar", virtual_name = "ephemeral17" },
    { device_name = "xvdas", virtual_name = "ephemeral18" },
    { device_name = "xvdat", virtual_name = "ephemeral19" },
    { device_name = "xvdau", virtual_name = "ephemeral20" },
    { device_name = "xvdav", virtual_name = "ephemeral21" },
    { device_name = "xvdaw", virtual_name = "ephemeral22" },
    { device_name = "xvdax", virtual_name = "ephemeral23" },
  ]

  lifecycle {
    create_before_destroy = true
  }
}
```
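For reference, the `aws_launch_template.splunk_indexer` targeted by the second destroy command replaces this launch configuration. A minimal sketch of what such a template could look like, reusing the variables from the launch configuration above; the root/EBS volume mappings and most of the 24 instance-store mappings are omitted, and this is an illustration, not the module's actual code:

```
# Sketch only: variable names are carried over from the launch configuration
# above and are assumptions about the real module.
resource "aws_launch_template" "splunk_indexer" {
  name_prefix   = "${var.launch_conf_name}-"
  image_id      = "${var.ami}"
  instance_type = "${var.idx_instance_type}"
  key_name      = "${var.key_name}"

  # aws_launch_template expects user_data to be base64-encoded.
  user_data = "${base64encode(var.user_data)}"

  vpc_security_group_ids = ["${var.indexer_security_group_ids}"]
  ebs_optimized          = true

  # iam_instance_profile becomes a nested block instead of a plain argument.
  iam_instance_profile {
    name = "${var.iam_instance_profile}"
  }

  # ephemeral_block_device entries become block_device_mappings;
  # only the first two of the 24 instance-store devices are shown.
  block_device_mappings {
    device_name  = "xvdaa"
    virtual_name = "ephemeral0"
  }

  block_device_mappings {
    device_name  = "xvdab"
    virtual_name = "ephemeral1"
  }

  lifecycle {
    create_before_destroy = true
  }
}
```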
"ephemeral22" }, { device_name = "xvdax" virtual_name = "ephemeral23" }, ] lifecycle { create_before_destroy = true } } ## ASG not launching 05/07/2020 ERROR: * module.moose_cluster.module.indexer_cluster.module.indexer0.aws_autoscaling_group.splunk_indexer_asg: 1 error(s) occurred: * aws_autoscaling_group.splunk_indexer_asg: "moose-splunk-asg-0": Waiting up to 10m0s: Need at least 1 healthy instances in ASG, have 0. Most recent activity: { ActivityId: "71d5c796-f6b8-7b06-600c-167c09da9b56", AutoScalingGroupName: "moose-splunk-asg-0", Cause: "At 2020-05-05T16:49:03Z an instance was started in response to a difference between desired and actual capacity, increasing the capacity from 0 to 1.", Description: "Launching a new EC2 instance. Status Reason: The requested configuration is currently not supported. Please check the documentation for supported configurations. Launching EC2 instance failed.", Details: "{\"Subnet ID\":\"subnet-0b1e9d82bcd8c0a2c\",\"Availability Zone\":\"us-east-1a\"}", EndTime: 2020-05-05 16:49:05 +0000 UTC, Progress: 100, StartTime: 2020-05-05 16:49:05.566 +0000 UTC, StatusCode: "Failed", StatusMessage: "The requested configuration is currently not supported. Please check the documentation for supported configurations. Launching EC2 instance failed." } FIX: ebs optimize needs to be set to false for t2.small instance size. --- # Salt issues 05/06/2020 Run the salt high state twice first time it gets 'stuck' when run with salt-call in the cloud-init??. kill it with saltutil.kill_job 20200528224000719269 RHEL subscription failing (Error: Must specify an activation key) Pillar must be bad! salt-call state.sls os_modifications.rhel_registration splunk install failing salt-call state.sls splunk.new_install salt moose-splunk-indexer-i* cmd.run 'systemctl restart splunkuf' # load balancer 05/08/2020 The internal UFs are pointing to moose-splunk-indexer-1.msoc.defpoint.local:9998, moose-splunk-indexer-2.msoc.defpoint.local:9998, moose-splunk-indexer-3.msoc.defpoint.local:9998. This is not going to work for a ASG. Switched to IDXC Discovery. Collectd is pointing to moose-splunk-indexers DNS > moose-splunk-indexers.msoc.defpoint.local > internal IPs :sad-face: Change to internal ELB pointing to target group. --- MOOSE LB #1 ( terraform/100-moose/elb.tf > moose_ext ) moose20190919200450791200000004 ALB INTERNET 443/8088 443 points to 8088. target group moose20190919200449849800000003 resource "aws_lb_target_group" "moose_ext_8088" pointing to dead moose indexers --- MOOSE LB #2 ( terraform/100-moose/elb.tf > moose ) Originally, this was setup this way for Phantom. Phantom only supported one DNS for all of Splunk. It now supports distributed Splunk. moose20190919200454975400000005 ALB INTERNAL 8088 target group Can i just point this at the moose-targets? moose20190919200449849000000002 resource "aws_lb_target_group" "moose_8088" pointing to dead moose indexers **PROPOSED** : Create ALB internal just for this thing and leave it in 100-moose. NOPE: iratemoses route53 points to ONE ELB with two listening ports. IDEA: create ireatemoses DNS in 100-moose and output the stuff upto moose. 8089 target group moose20190919200449848700000001 pointing to moose-splunk-sh moose_int_target_group = "${aws_lb_target_group.moose_8088}" moose_ext_target_group = "${aws_lb_target_group.moose_ext_8088}" "${var.create_hec_lb == 1 ? 1 : 0 }" "${var.create_hec_lb == 1 ? aws_lb_target_group.hec_8088.arn : "" }" --- CUSTOMER LB #1 ( terraform/modules/splunk_cluster/elb.tf > hec ) now working! 
---

CUSTOMER LB #1 ( terraform/modules/splunk_cluster/elb.tf > hec )

now working!

---

CUSTOMER LB #2 ( terraform/modules/splunk_cluster/elb.tf > nlb )

9998 target group: -target

Already working for Moose with the TF variable var.lb_target_group from splunk_cluster/elb.tf.

---

CUSTOMER LB #3 ( terraform/modules/splunk_cluster/elb-classic-hec.tf > hec_classiclb )

8088 (now working!)

target_group_arns is for ALBs or NLBs ONLY; create new target groups for port 8088.

IDXC Discovery: salt/fileroots/splunk/master/init.sls

Then, update Route 53 to point to the ELBs.

PLAN: move the code to splunk_cluster, grab the output from splunk_indexer_asg, then set the variables to false for the customers.

```
count = "${var.make-lb == "true" ? 1 : 0 }"
count = "${ var.create_private_dns == 1 ? var.count : 0 }"
count = "${var.create_hec_lb == 1 ? 1 : 0 }"
count = "${var.create_moose_ext_lb == 1 ? 1 : 0 }"
count = "${var.create_moose_int_lb == 1 ? 1 : 0 }"
count = "${var.create_moose_int_lb == 1 ? local.search_head_count : 0 }"
count = "${var.create_moose_int_lb == 1 ? local.indexer_count : 0 }"
```

```
resource "aws_lb_target_group_attachment" "moose_ext_8088" {
  count            = "${local.indexer_count}"
  target_group_arn = "${aws_lb_target_group.moose_ext_8088.arn}"
  target_id        = "${element(module.moose_cluster.idx_instances,count.index)}"
}
```

```
terraform apply \
  -target=module.afs_cluster.module.indexer_cluster.aws_autoscaling_attachment.hec_classic_asg[2] \
  -target=module.afs_cluster.module.indexer_cluster.aws_autoscaling_attachment.hec_classic_asg[1] \
  -target=module.afs_cluster.module.indexer_cluster.aws_autoscaling_attachment.hec_classic_asg[0]

terraform apply -target=module.afs_cluster.module.indexer_cluster.aws_elb.hec_classiclb

terraform apply \
  -target=module.afs_cluster.module.indexer_cluster.aws_autoscaling_attachment.hec_classic_asg0 \
  -target=module.afs_cluster.module.indexer_cluster.aws_autoscaling_attachment.hec_classic_asg1 \
  -target=module.afs_cluster.module.indexer_cluster.aws_autoscaling_attachment.hec_classic_asg2

terraform destroy \
  -target=module.afs_cluster.module.indexer_cluster.aws_autoscaling_attachment.hec_classic_asg0 \
  -target=module.afs_cluster.module.indexer_cluster.aws_autoscaling_attachment.hec_classic_asg1 \
  -target=module.afs_cluster.module.indexer_cluster.aws_autoscaling_attachment.hec_classic_asg2
```

---

Internal DNS for -splunk-indexers

Does anything use the customer-splunk-indexers DNS entry? Collectd uses moose-splunk-indexers.

PROPOSED: Let's create a new Route 53 record that points to the internal ALB instead of the static route53 entries (a sketch follows below).

resource "aws_route53_record" "indexers"

# changes to each customer's code in customer.tf

Add additional variables for the new module:

```
asg_size_0 = 1
asg_size_1 = 1
asg_size_2 = 1
```

in customer_env module

in cust_variables.sls

discovery-pass4SymmKey in outputs.conf for splunk nodes
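A rough sketch of the `aws_route53_record "indexers"` proposed above, pointing the iratemoses name at an internal ALB; `var.dns_zone_id` and `aws_lb.moose_int` are placeholder names, not the repo's actual resources:

```
# Hypothetical shape of the proposed record. "var.dns_zone_id" and
# "aws_lb.moose_int" are placeholder names, not the real resources.
resource "aws_route53_record" "indexers" {
  zone_id = "${var.dns_zone_id}"
  name    = "iratemoses.msoc.defpoint.local"
  type    = "A"

  alias {
    name                   = "${aws_lb.moose_int.dns_name}"
    zone_id                = "${aws_lb.moose_int.zone_id}"
    evaluate_target_health = false
  }
}
```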
# Steps for migration to PROD

0. CHECK TF to ensure attachments are in place.
0.1 CHECK indexers to ensure they are set up with IDXC discovery (to avoid emails from SOC).
1. Set up IDXC Discovery on the CM.
1.1 Prep the Cluster Master for IDXC by rm -rf'ing the current outputs app.
1.1.2 `rm /opt/splunk/etc/apps/{{ pillar['cluster_name'] }}_cm_outputs/`
1.5 Run the salt state on the CM to enable IDXC discovery and IDXC outputs. This will replace the deleted outputs app.
1.5.1 `salt moose*cm* state.sls splunk.master test=true --state-output=changes`
1.6 Adjust SH/HF outputs.conf to point to IDXC Discovery.
1.7 `rm /opt/splunk/etc/apps/{{ salt['pillar.get']('cluster_name') }}_sh_outputs/local/outputs.conf`
1.8 `rm /opt/splunk/etc/apps/{{ salt['pillar.get']('splunk:cluster_name') }}_hf_ec2_outputs/local/outputs.conf`
1.9 Run salt states to change the outputs.
1.11 `salt moose*sh* state.sls splunk.search_head test=true --state-output=changes`
1.13 `salt moose*hf* state.sls splunk.heavy_forwarder test=true --state-output=changes`
1.14 Update all minions to IDXC discovery.
1.14.1 `salt mail* state.sls internal_splunk_forwarder test=true --state-output=changes`
2. Set up the new iratemoses endpoint in TF.
2.0.1 TF apply in 05-customer_portal (open SGs) and 100-moose (create DNS).
2.0 Update all minions to the new collectd internal endpoint: iratemoses.msoc.defpoint.local.
2.1 Ensure the pillar is refreshed: `salt minion pillar.item collectd:hec_hostname`
2.2 `salt minion network.connect iratemoses.msoc.defpoint.local 8088`
2.3 `salt minion state.sls collectd test=true --state-output=changes`
2.3 `salt-run survey.diff *.local cp.get_file_str file:///etc/collectd.conf`
2.4 Ensure collectd metrics are in moose splunk.
2.4.1 `| mstats count WHERE index=collectd metric_name=* by host, metric_name`
2.5 Ensure Splunk UFs are in moose splunk.
2.5.1 `index="_internal" sourcetype=splunkd source="/opt/splunkforwarder/var/log/splunk/splunkd.log" | stats count by host`
3. Stand up the new templates and ASGs (see the ASG sketch after this list).
3.1 Launch the new ASG instances.
3.1.1 Run highstate on the new indexers.
3.1.2 Kill a defunct highstate with this:
3.1.3 `salt '*' saltutil.kill_job `
3.1.4 pkg.upgrade to patch the server.
3.2 Ensure three green checkmarks in the CM.
4. Change the salt master to the new outputs (make sure it is working).
4.1 Ensure they can connect first.
4.2 `salt '*.local' network.connect moose-splunk-cm.msoc.defpoint.local 8089`
5. Silence sensu.
6. Manually take the non-ASG indexers offline but don't stop the instances (yes, you can offline 2 indexers at a time).
6.1 `/opt/splunk/bin/splunk offline --enforce-counts`
6.1.1 No users? Create etc/system/local/user-seed.conf then restart ( https://answers.splunk.com/answers/834/how-to-reset-the-admin-password.html )

```
systemctl stop sensu-agent
cd /opt/splunk/etc
mv passwd passwd.bak
vim system/local/user-seed.conf
```

```
[user_info]
PASSWORD = KbxvB97DBTXFcxKOqm0P
```

6.2 Disable the service to prevent it from starting back up.
7. Terraform apply to remove unneeded objects.
7. Use TF to destroy the instances then remove the code from TF. <- this is tricky; create a new git branch.
7.1 `terraform destroy -target=module.moose_cluster.module.indexer_cluster.module.indexers.aws_instance.this[0] -target=module.moose_cluster.module.indexer_cluster.module.indexers.aws_instance.this[1] -target=module.moose_cluster.module.indexer_cluster.module.indexers.aws_instance.this[2]`
8. Ensure all LBs are pointing to the new indexers.
8.1 Run from laptop NOT on VPN: `curl https://iratemoses.mdr-test.defpoint.com:8088 --insecure`
8.2 Run from laptop ON VPN:

```
salt-run survey.diff '*.local' cp.get_file_str file:////opt/splunkforwarder/etc/apps/moose_outputs/default/outputs.conf
salt sensu* cmd.run 'tail -50 /opt/splunkforwarder/var/log/splunk/splunkd.log'
salt phantom* cmd.run 'tail -200 /opt/splunkforwarder/var/log/splunk/splunkd.log | grep TcpOutputProc'
```
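For step 3 of the runbook, each per-AZ ASG (the `aws_autoscaling_group.splunk_indexer_asg` in the destroy targets at the top of these notes) ties a launch template to the `asg_size_N` variables and the existing 9998 target group. A rough sketch under those assumptions; `var.cluster_name` and `var.subnet_id` are placeholder inputs, and pinning min/max/desired to the same `asg_size_0` value is an assumption:

```
# Sketch of one per-AZ indexer ASG. var.asg_size_0, var.lb_target_group and
# the "moose-splunk-asg-0" naming appear in these notes; var.cluster_name and
# var.subnet_id are placeholder inputs.
resource "aws_autoscaling_group" "splunk_indexer_asg" {
  name                = "${var.cluster_name}-splunk-asg-0"
  min_size            = "${var.asg_size_0}"
  max_size            = "${var.asg_size_0}"
  desired_capacity    = "${var.asg_size_0}"
  vpc_zone_identifier = ["${var.subnet_id}"]

  # 9998 NLB target group from splunk_cluster/elb.tf
  target_group_arns = ["${var.lb_target_group}"]

  launch_template {
    id      = "${aws_launch_template.splunk_indexer.id}"
    version = "$Latest"
  }

  tag {
    key                 = "Name"
    value               = "${var.cluster_name}-splunk-indexer"
    propagate_at_launch = true
  }
}
```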
### Splunk Searches

Check in `Splunk Forwarder` for indexer discovery errors:

```
index=_internal sourcetype="splunkd" source="/opt/splunkforwarder/var/log/splunk/splunkd.log" component=IndexerDiscoveryHeartbeatThread

index=_internal host=*-splunk-cm* sourcetype=splunkd source="/opt/splunkforwarder/var/log/splunk/splunkd.log" 10.96.103.34 OR 10.96.101.248 OR 10.96.102.23 component=TcpOutputProc
```

Check in the CM and SH for indexer discovery:

```
index=_internal host=*-splunk-cm* sourcetype=splunkd source="/opt/splunk/var/log/splunk/splunkd.log" component=CMIndexerDiscovery

index=_internal sourcetype="splunkd" source="/opt/splunk/var/log/splunk/splunkd.log" host="moose-splunk-sh.msoc.defpoint.local" component=TcpOutputProc "Initialization time for indexer discovery service"
```

```
terraform apply \
  -target=module.moose_cluster.module.indexer_cluster.module.indexer2.aws_launch_template.splunk_indexer \
  -target=module.moose_cluster.module.indexer_cluster.module.indexer2.aws_autoscaling_group.splunk_indexer_asg
```

## PROD

http://reposerver.msoc.defpoint.local/splunk/7.2/repodata/repomd.xml: [Errno 14] HTTP Error 404 - Not Found

After the salt highstate, splunkuf is running, but splunk is not.

```
salt moose-splunk-indexer-i* cmd.run 'systemctl stop splunkuf'
```

"/opt/splunk/etc/slave-apps/TA-Frozen-S3/bin/coldToFrozenS3.py"; no such script could be found on the filesystem, even after multiple attempts. Exiting..

^^^ this is a bug in splunk! Added python3 to ALL servers including indexers and moved coldToFrozenS3.py to /usr/local/bin to fix the issue.

- moose-splunk-indexer-i-01dc07f6a5f9dbeff.msoc.defpoint.local
- moose-splunk-indexer-i-0161555a167154cb0.msoc.defpoint.local - 50G - done; terminated, salt, sensu, victorops, scaleft, redhat
- moose-splunk-indexer-i-0b9f30ce615c67c24.msoc.defpoint.local
- moose-splunk-indexer-i-087ecc377c008a0a3.msoc.defpoint.local
- moose-splunk-indexer-i-0bada91cd6ced80f7.msoc.defpoint.local - 50G - done; terminated, salt, sensu, victorops, scaleft, redhat
- moose-splunk-indexer-i-055a31767d05fb053.msoc.defpoint.local - 50G - done; terminated, salt, sensu, victorops, scaleft, redhat

ERROR:

```
ERROR IndexerDiscoveryHeartbeatThread - failed to parse response payload for group=afs-cluster, err=failed to extract FwdTarget from json node={"hostport":"?","ssl":false,"indexing_disk_space":-1}http_response=OK
```

SOLUTION: the indexers had no inputs.conf! The indexers were not listening for incoming connections.

ERROR: coldToFrozen script not working

```
pip3 install awscli
chmod +x /usr/local/bin/aws
```