hmi.sh 2.7 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889
  1. #!/bin/sh
  2. #
  3. # Copyright 2015, Daniel Axtens, IBM Corporation
  4. #
  5. # This program is free software; you can redistribute it and/or modify
  6. # it under the terms of the GNU General Public License as published by
  7. # the Free Software Foundation; version 2 of the License.
  8. #
  9. # This program is distributed in the hope that it will be useful,
  10. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. # GNU General Public License for more details.
  13. # do we have ./getscom, ./putscom?
  14. if [ -x ./getscom ] && [ -x ./putscom ]; then
  15. GETSCOM=./getscom
  16. PUTSCOM=./putscom
  17. elif which getscom > /dev/null; then
  18. GETSCOM=$(which getscom)
  19. PUTSCOM=$(which putscom)
  20. else
  21. cat <<EOF
  22. Can't find getscom/putscom in . or \$PATH.
  23. See https://github.com/open-power/skiboot.
  24. The tool is in external/xscom-utils
  25. EOF
  26. exit 1
  27. fi
  28. # We will get 8 HMI events per injection
  29. # todo: deal with things being offline
  30. expected_hmis=8
  31. COUNT_HMIS() {
  32. dmesg | grep -c 'Harmless Hypervisor Maintenance interrupt'
  33. }
  34. # massively expand snooze delay, allowing injection on all cores
  35. ppc64_cpu --smt-snooze-delay=1000000000
  36. # when we exit, restore it
  37. trap "ppc64_cpu --smt-snooze-delay=100" 0 1
  38. # for each chip+core combination
  39. # todo - less fragile parsing
  40. egrep -o 'OCC: Chip [0-9a-f]+ Core [0-9a-f]' < /sys/firmware/opal/msglog |
  41. while read chipcore; do
  42. chip=$(echo "$chipcore"|awk '{print $3}')
  43. core=$(echo "$chipcore"|awk '{print $5}')
  44. fir="0x1${core}013100"
  45. # verify that Core FIR is zero as expected
  46. if [ "$($GETSCOM -c 0x${chip} $fir)" != 0 ]; then
  47. echo "FIR was not zero before injection for chip $chip, core $core. Aborting!"
  48. echo "Result of $GETSCOM -c 0x${chip} $fir:"
  49. $GETSCOM -c 0x${chip} $fir
  50. echo "If you get a -5 error, the core may be in idle state. Try stress-ng."
  51. echo "Otherwise, try $PUTSCOM -c 0x${chip} $fir 0"
  52. exit 1
  53. fi
  54. # keep track of the number of HMIs handled
  55. old_hmis=$(COUNT_HMIS)
  56. # do injection, adding a marker to dmesg for clarity
  57. echo "Injecting HMI on core $core, chip $chip" | tee /dev/kmsg
  58. # inject a RegFile recoverable error
  59. if ! $PUTSCOM -c 0x${chip} $fir 2000000000000000 > /dev/null; then
  60. echo "Error injecting. Aborting!"
  61. exit 1
  62. fi
  63. # now we want to wait for all the HMIs to be processed
  64. # we expect one per thread on the core
  65. i=0;
  66. new_hmis=$(COUNT_HMIS)
  67. while [ $new_hmis -lt $((old_hmis + expected_hmis)) ] && [ $i -lt 12 ]; do
  68. echo "Seen $((new_hmis - old_hmis)) HMI(s) out of $expected_hmis expected, sleeping"
  69. sleep 5;
  70. i=$((i + 1))
  71. new_hmis=$(COUNT_HMIS)
  72. done
  73. if [ $i = 12 ]; then
  74. echo "Haven't seen expected $expected_hmis recoveries after 1 min. Aborting."
  75. exit 1
  76. fi
  77. echo "Processed $expected_hmis events; presumed success. Check dmesg."
  78. echo ""
  79. done