You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

160 lines
5.0 KiB

  1. from datetime import timedelta as td
  2. import time
  3. import requests
  4. from threading import Thread
  5. from django.core.management.base import BaseCommand
  6. from django.utils import timezone
  7. from hc.api.models import Check, Flip
  8. from statsd.defaults.env import statsd
# Log line written by notify() just before a flip's alerts are sent.
# Placeholders: the flip's new status, then the check's code.
SENDING_TMPL = "Sending alert, status=%s, code=%s\n"
# Log line written by notify() when sending took more than 5 seconds.
# Placeholders: the elapsed time in seconds, then the check's code.
SEND_TIME_TMPL = "Sending took %.1fs, code=%s\n"
  11. def notify(flip_id, stdout):
  12. flip = Flip.objects.get(id=flip_id)
  13. check = flip.owner
  14. # Set the historic status here but *don't save it*.
  15. # It would be nicer to pass the status explicitly, as a separate parameter.
  16. check.status = flip.new_status
  17. # And just to make sure it doesn't get saved by a future coding accident:
  18. setattr(check, "save", None)
  19. stdout.write(SENDING_TMPL % (flip.new_status, check.code))
  20. # Set dates for followup nags
  21. if flip.new_status == "down":
  22. check.project.set_next_nag_date()
  23. # Send notifications
  24. send_start = timezone.now()
  25. errors = flip.send_alerts()
  26. for ch, error in errors:
  27. stdout.write("ERROR: %s %s %s\n" % (ch.kind, ch.value, error))
  28. # If sending took more than 5s, log it
  29. send_time = timezone.now() - send_start
  30. if send_time.total_seconds() > 5:
  31. stdout.write(SEND_TIME_TMPL % (send_time.total_seconds(), check.code))
  32. statsd.timing("hc.sendalerts.dwellTime", send_start - flip.created)
  33. statsd.timing("hc.sendalerts.sendTime", send_time)
  34. def notify_on_thread(flip_id, stdout):
  35. t = Thread(target=notify, args=(flip_id, stdout))
  36. t.start()
  37. class Command(BaseCommand):
  38. help = "Sends UP/DOWN email alerts"
  39. def add_arguments(self, parser):
  40. parser.add_argument(
  41. "--no-loop",
  42. action="store_false",
  43. dest="loop",
  44. default=True,
  45. help="Do not keep running indefinitely in a 2 second wait loop",
  46. )
  47. parser.add_argument(
  48. "--no-threads",
  49. action="store_false",
  50. dest="use_threads",
  51. default=False,
  52. help="Send alerts synchronously, without using threads",
  53. )
  54. def process_one_flip(self, use_threads=True):
  55. """ Find unprocessed flip, send notifications. """
  56. # Order by processed, otherwise Django will automatically order by id
  57. # and make the query less efficient
  58. q = Flip.objects.filter(processed=None).order_by("processed")
  59. flip = q.first()
  60. if flip is None:
  61. return False
  62. q = Flip.objects.filter(id=flip.id, processed=None)
  63. num_updated = q.update(processed=timezone.now())
  64. if num_updated != 1:
  65. # Nothing got updated: another worker process got there first.
  66. return True
  67. if use_threads:
  68. notify_on_thread(flip.id, self.stdout)
  69. else:
  70. notify(flip.id, self.stdout)
  71. return True
  72. def handle_going_down(self):
  73. """ Process a single check going down. """
  74. now = timezone.now()
  75. q = Check.objects.filter(alert_after__lt=now).exclude(status="down")
  76. # Sort by alert_after, to avoid unnecessary sorting by id:
  77. check = q.order_by("alert_after").first()
  78. if check is None:
  79. return False
  80. old_status = check.status
  81. q = Check.objects.filter(id=check.id, status=old_status)
  82. try:
  83. status = check.get_status(with_started=False)
  84. except Exception as e:
  85. # Make sure we don't trip on this check again for an hour:
  86. # Otherwise sendalerts may end up in a crash loop.
  87. q.update(alert_after=now + td(hours=1))
  88. # Then re-raise the exception:
  89. raise e
  90. if status != "down":
  91. # It is not down yet. Update alert_after
  92. q.update(alert_after=check.going_down_after())
  93. return True
  94. # Atomically update status
  95. flip_time = check.going_down_after()
  96. num_updated = q.update(alert_after=None, status="down")
  97. if num_updated != 1:
  98. # Nothing got updated: another worker process got there first.
  99. return True
  100. flip = Flip(owner=check)
  101. flip.created = flip_time
  102. flip.old_status = old_status
  103. flip.new_status = "down"
  104. flip.save()
  105. return True
  106. def handle(self, use_threads=True, loop=True, *args, **options):
  107. self.stdout.write("sendalerts is now running\n")
  108. i, sent = 0, 0
  109. while True:
  110. # Create flips for any checks going down
  111. while self.handle_going_down():
  112. pass
  113. # Process the unprocessed flips
  114. while self.process_one_flip(use_threads):
  115. sent += 1
  116. if not loop:
  117. break
  118. time.sleep(2)
  119. i += 1
  120. if i % 60 == 0:
  121. timestamp = timezone.now().isoformat()
  122. self.stdout.write("-- MARK %s --\n" % timestamp)
  123. requests.get('https://hc-ping.com/8bbf4b71-45cd-47d5-8425-4d5982419823')
  124. return "Sent %d alert(s)" % sent