
Patching And New Customer Notes

Brad Poulton 4 years ago
parent
commit
e8d2d10ba1

+ 29 - 4
New Customer Setup Notes - GovCloud.md

@@ -525,7 +525,7 @@ Commit all the changes to git and open PR. Once the settings are in the master b
 
 CUSTOMERPREFIX=modelclient
 salt -C "${CUSTOMERPREFIX}* and G@msoc_pop:True" test.ping
-#are the LCP images up-to-date on the salt minion version? See Salt Upgrade Notes.md
+#are the LCP images up-to-date on the salt minion version? See Salt Upgrade Notes.md. Upgrade the salt minions before syncing ec2_tags; it needs py3. Make sure the environment grain is set before trying to upgrade salt.
 salt -C "${CUSTOMERPREFIX}* and G@msoc_pop:True" test.version
 salt -C "${CUSTOMERPREFIX}* and G@msoc_pop:True" saltutil.sync_all
 salt -C "${CUSTOMERPREFIX}* and G@msoc_pop:True" saltutil.refresh_pillar
@@ -540,9 +540,9 @@ salt -C "${CUSTOMERPREFIX}* and G@msoc_pop:True" grains.get environment
 salt -C "${CUSTOMERPREFIX}* and G@msoc_pop:True" pillar.get os_settings:rhel:rh_subscription:activation-key
 #LCP nodes need manual RH Subscription enrollment; before removing test=true, ensure the command is filled out with the pillar
 salt -C "${CUSTOMERPREFIX}* and G@msoc_pop:True" state.sls os_modifications.rhel_registration test=true
-#if needed upgrade salt
+salt -C "${CUSTOMERPREFIX}* and G@msoc_pop:True" state.sls os_modifications
+
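+#once the registration state runs clean (test=true removed), a hedged way to confirm enrollment directly:
+salt -C "${CUSTOMERPREFIX}* and G@msoc_pop:True" cmd.run 'subscription-manager status'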
 
-Register them with RH entitlement server (LCP nodes can't use AWS license)
 
 
 salt ${CUSTOMERPREFIX}-splunk
@@ -568,4 +568,29 @@ SOLUTION: python 3 needed upgrade salt!!
 ISSUE:
 http://pkg.scaleft.com/rpm/repodata/repomd.xml: [Errno 12] Timeout on http://pkg.scaleft.com/rpm/repodata/repomd.xml: (28, 'Operation too slow. Less than 1000 bytes/sec transferred the last 30 seconds')
 Trying other mirror.
-SOLUTION: Repomove scaleft repo?
+SOLUTION: Fix connectivity issues to scaleft
+TEMP FIX: yum --disablerepo=okta_asa_repo_add pkg.upgrade
+`cmd.run 'yum install python-virtualenv -y --disablerepo=okta_asa_repo_add'`
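+(With salt, the upgrade can likely also be run with the repo disabled directly, e.g. `pkg.upgrade disablerepo=okta_asa_repo_add`, assuming the yum-based pkg module honors that kwarg.)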
+
+
+ISSUE:
+```
+2021-02-16 21:25:51,126 [salt.loaded.int.module.cmdmod:854 ][ERROR   ][26641] Command '['useradd', '-U', '-M', '-d', '/opt/splunk', 'splunk']' failed with return code: 9
+    2021-02-16 21:25:51,127 [salt.loaded.int.module.cmdmod:858 ][ERROR   ][26641] stderr: useradd: group splunk exists - if you want to add this user to that group, use -g.
+    2021-02-16 21:25:51,127 [salt.loaded.int.module.cmdmod:860 ][ERROR   ][26641] retcode: 9
+    2021-02-16 21:25:51,127 [salt.state       :328 ][ERROR   ][26641] Failed to create new user splunk
+```
+SOLUTION: Manually create user and add to splunk group OR delete group and create user+group in one command. 
+`cmd.run 'useradd -M -g splunk -d /opt/splunk splunk'`
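+Or, if nothing else owns the splunk group yet, delete it and let useradd recreate user+group in one shot (a sketch; check for files owned by the group before deleting it):
+`cmd.run 'groupdel splunk'` then `cmd.run 'useradd -U -M -d /opt/splunk splunk'`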
+
+ISSUE:
+```
+splunk pkg.install 
+Public key for splunk-8.0.5-a1a6394cc5ae-linux-2.6-x86_64.rpm is not installed
+    Retrieving key from https://docs.splunk.com/images/6/6b/SplunkPGPKey.pub
+
+
+    GPG key retrieval failed: [Errno 14] curl#35 - "TCP connection reset by peer"
+```
+TEMP FIX: `cmd.run 'yum --disablerepo=okta_asa_repo_add -y --nogpgcheck install splunk'` 
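+A possible longer-term fix (assuming the minions can reach docs.splunk.com over HTTPS) is to import the Splunk signing key once so --nogpgcheck isn't needed: `cmd.run 'rpm --import https://docs.splunk.com/images/6/6b/SplunkPGPKey.pub'`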
+

+ 39 - 30
Patching Notes.md

@@ -407,6 +407,8 @@ Post to Slack:
 Resuming today's patching with the reboots of customer POPs.
 ```
 
+Remember to silence Sensu alerts before restarting servers.
+
 NOTE: Restart POPs one server at a time in order to minimize risk of concurrent outages.
 
 #### First syslog servers
@@ -473,10 +475,12 @@ salt -C 'afs-splunk-syslog*'  grains.item location
 salt -L 'afs-splunk-syslog-3, afs-splunk-syslog-7' cmd.run 'uptime'
 date; salt -L 'afs-splunk-syslog-3, afs-splunk-syslog-7' system.reboot
 watch "salt -L 'afs-splunk-syslog-3, afs-splunk-syslog-7' test.ping"
+salt -L 'afs-splunk-syslog-3, afs-splunk-syslog-7' cmd.run 'ps -ef | grep syslog-ng | grep -v grep'
 
 salt -L 'afs-splunk-syslog-4, afs-splunk-syslog-8' cmd.run 'uptime'
 date; salt -L 'afs-splunk-syslog-4, afs-splunk-syslog-8' system.reboot
 watch "salt -L 'afs-splunk-syslog-4, afs-splunk-syslog-8' test.ping"
+salt -L 'afs-splunk-syslog-4, afs-splunk-syslog-8' cmd.run 'ps -ef | grep syslog-ng | grep -v grep'
 ```
 
 #### Verify logs are flowing
@@ -556,13 +560,17 @@ salt -C 'afs*local or ma-* or mo-*local or la-*local or nga*local or dc*local' c
 salt -C 'afs*local or ma-* or mo-*local or la-*local or nga*local or dc*local' pkg.upgrade
 ```
 
-Don't forget to patch nihors* on gc-prod-salt-master!
+NOTE: Some Splunk Indexers always have high disk space usage (83%). This is normal. 
+
+Don't forget to patch Splunk clusters on gc-prod-salt-master!
 
 ```
-salt -C 'nihor*com' cmd.run 'df -h | egrep "[890][0-9]\%"' 
-salt -C 'nihor*com' pkg.upgrade
+salt -C 'nihor*com or bp-ot-demo*com' cmd.run 'df -h | egrep "[890][0-9]\%"' 
+salt -C 'nihor*com or bp-ot-demo*com' pkg.upgrade
 ```
 
+Don't forget to un-silence Sensu.
+
 #### Troubleshooting
 
 EPEL repo is enabled on afs-splunk-hf (I don't know why); had to run this to avoid an issue with the collectd package on msoc-repo
@@ -589,11 +597,11 @@ salt -C '*-sh* and not *moose* and not qcompliance* and not fm-shared-search*' s
 watch "salt -C '*-sh* and not *moose* and not qcompliance* and not fm-shared-search*' cmd.run 'uptime'"
 ```
 
-Don't forget to reboot nihors-splunk-sh* on gc-prod-salt-master!
+Don't forget to reboot customer Splunk search heads on gc-prod-salt-master!
 
 ```
-salt -C 'nihor-splunk-sh*' cmd.run 'df -h | egrep "[890][0-9]\%"' 
-salt -C 'nihor-splunk-sh*' system.reboot
+salt -C 'nihors-splunk-sh* or bp-ot-demo-splunk-sh*' cmd.run 'df -h | egrep "[890][0-9]\%"' 
+salt -C 'nihors-splunk-sh* or bp-ot-demo-splunk-sh*' system.reboot
 ```
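+
+After the search heads come back, a quick sanity check that splunkd restarted (a hedged addition, mirroring the cluster checks later in these notes):
+```
+salt -C 'nihors-splunk-sh* or bp-ot-demo-splunk-sh*' cmd.run 'systemctl status splunk'
+```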
 
 Don't forget to un-silence Sensu. 
@@ -601,6 +609,7 @@ Don't forget to un-silence Sensu.
 ### Day 4 (Tuesday), Step 1 of 1, Customer Slices CMs Reboots
 Long Day of Reboots!
 
+Post to Slack in xdr-patching:
 ```
 Today's patching is the indexing clusters for all XDR customer environments. Cluster masters and indexers will be rebooted this morning. Thank you for your cooperation.
 ```
@@ -661,30 +670,30 @@ watch "salt -C '*splunk-indexer-* and G@ec2:placement:availability_zone:us-east-
 
 NGA had a hard time getting 3 checkmarks. The CM was waiting on stuck buckets; force-rolled the buckets to get green checkmarks.
 
-Don't forget nihors on GC salt-master
-```
-salt -C 'nihors*' test.ping --out=txt
-salt -C 'nihors*' cmd.run 'df -h | egrep "[890][0-9]\%"'
-salt -C 'nihors-splunk-hf* or nihors-splunk-cm*' system.reboot
-watch "salt -C 'nihors-splunk-hf* or nihors-splunk-cm*' test.ping --out=txt"
-salt -C 'nihors-splunk-hf* or nihors-splunk-cm*' cmd.run 'systemctl status splunk'
-salt -C 'nihors-splunk-hf* or nihors-splunk-cm*' cmd.run 'uptime'
-
-salt -C 'nihors-splunk-idx*' test.ping --out=txt
-salt -C 'nihors-splunk-idx* and G@ec2:placement:availability_zone:us-gov-east-1a' test.ping --out=txt
-salt -C 'nihors-splunk-idx* and G@ec2:placement:availability_zone:us-gov-east-1a' cmd.run 'df -h | egrep "[890][0-9]\%"'
-salt -C 'nihors-splunk-idx* and G@ec2:placement:availability_zone:us-gov-east-1a' system.reboot
-watch "salt -C 'nihors-splunk-idx* and G@ec2:placement:availability_zone:us-gov-east-1a' test.ping --out=txt"
-
-salt -C 'nihors-splunk-idx* and G@ec2:placement:availability_zone:us-gov-east-1b' test.ping --out=txt
-salt -C 'nihors-splunk-idx* and G@ec2:placement:availability_zone:us-gov-east-1b' cmd.run 'df -h | egrep "[890][0-9]\%"'
-salt -C 'nihors-splunk-idx* and G@ec2:placement:availability_zone:us-gov-east-1b' system.reboot
-watch "salt -C 'nihors-splunk-idx* and G@ec2:placement:availability_zone:us-gov-east-1b' test.ping --out=txt"
-
-salt -C 'nihors-splunk-idx* and G@ec2:placement:availability_zone:us-gov-east-1c' test.ping --out=txt
-salt -C 'nihors-splunk-idx* and G@ec2:placement:availability_zone:us-gov-east-1c' cmd.run 'df -h | egrep "[890][0-9]\%"'
-salt -C 'nihors-splunk-idx* and G@ec2:placement:availability_zone:us-gov-east-1c' system.reboot
-watch "salt -C 'nihors-splunk-idx* and G@ec2:placement:availability_zone:us-gov-east-1c' test.ping --out=txt"
+Don't forget the Splunk clusters on the GC salt-master!
+```
+salt -C '( *splunk-cm*com or *splunk-hf*com ) and not moose*' test.ping --out=txt
+salt -C '( *splunk-cm*com or *splunk-hf*com ) and not moose*' cmd.run 'df -h | egrep "[890][0-9]\%"'
+salt -C '( *splunk-cm*com or *splunk-hf*com ) and not moose*' system.reboot
+watch "salt -C '( *splunk-cm*com or *splunk-hf*com ) and not moose*' test.ping --out=txt"
+salt -C '( *splunk-cm*com or *splunk-hf*com ) and not moose*' cmd.run 'systemctl status splunk'
+salt -C '( *splunk-cm*com or *splunk-hf*com ) and not moose*' cmd.run 'uptime'
+
+salt -C '*splunk-idx-*com and not moose*' test.ping --out=txt
+salt -C '*splunk-idx-*com and G@ec2:placement:availability_zone:us-gov-east-1a and not moose*' test.ping --out=txt
+salt -C '*splunk-idx-*com and G@ec2:placement:availability_zone:us-gov-east-1a and not moose*' cmd.run 'df -h | egrep "[890][0-9]\%"'
+salt -C '*splunk-idx-*com and G@ec2:placement:availability_zone:us-gov-east-1a and not moose*' system.reboot
+watch "salt -C '*splunk-idx-*com and G@ec2:placement:availability_zone:us-gov-east-1a and not moose*' test.ping --out=txt"
+
+salt -C '*splunk-idx-*com and G@ec2:placement:availability_zone:us-gov-east-1b and not moose*' test.ping --out=txt
+salt -C '*splunk-idx-*com and G@ec2:placement:availability_zone:us-gov-east-1b and not moose*' cmd.run 'df -h | egrep "[890][0-9]\%"'
+salt -C '*splunk-idx-*com and G@ec2:placement:availability_zone:us-gov-east-1b and not moose*' system.reboot
+watch "salt -C '*splunk-idx-*com and G@ec2:placement:availability_zone:us-gov-east-1b and not moose*' test.ping --out=txt"
+
+salt -C '*splunk-idx-*com and G@ec2:placement:availability_zone:us-gov-east-1c and not moose*' test.ping --out=txt
+salt -C '*splunk-idx-*com and G@ec2:placement:availability_zone:us-gov-east-1c and not moose*' cmd.run 'df -h | egrep "[890][0-9]\%"'
+salt -C '*splunk-idx-*com and G@ec2:placement:availability_zone:us-gov-east-1c and not moose*' system.reboot
+watch "salt -C '*splunk-idx-*com and G@ec2:placement:availability_zone:us-gov-east-1c and not moose*' test.ping --out=txt"
 ```
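+
+After all three AZs are back, a quick check that splunkd restarted on the indexers (a sketch matching the CM/HF checks above):
+```
+salt -C '*splunk-idx-*com and not moose*' cmd.run 'systemctl status splunk'
+salt -C '*splunk-idx-*com and not moose*' cmd.run 'uptime'
+```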
 
 #### Verify you got everything