from datetime import timedelta as td
from threading import Thread
import time

from django.core.management.base import BaseCommand
from django.utils import timezone
import requests
from statsd.defaults.env import statsd

from hc.api.models import Check, Flip

SENDING_TMPL = "Sending alert, status=%s, code=%s\n"
SEND_TIME_TMPL = "Sending took %.1fs, code=%s\n"

# URL pinged from the main loop each time a MARK line is written.
HEARTBEAT_URL = "https://hc-ping.com/8bbf4b71-45cd-47d5-8425-4d5982419823"
# Upper bound, in seconds, for the heartbeat request so that a stalled
# connection cannot block alert processing.
HEARTBEAT_TIMEOUT = 10


def notify(flip_id, stdout):
    """Send the alerts for one flip and report progress on *stdout*.

    Loads the Flip with the given id, sends its alerts, writes any
    per-channel errors to *stdout* and records timing metrics in statsd.

    Args:
        flip_id: primary key of the Flip to process.
        stdout: object with a ``write(str)`` method used for log output.
    """
    flip = Flip.objects.get(id=flip_id)

    check = flip.owner
    # Set the historic status here but *don't save it*.
    # It would be nicer to pass the status explicitly, as a separate parameter.
    check.status = flip.new_status
    # And just to make sure it doesn't get saved by a future coding accident:
    setattr(check, "save", None)

    stdout.write(SENDING_TMPL % (flip.new_status, check.code))

    # Set dates for followup nags
    if flip.new_status == "down":
        check.project.set_next_nag_date()

    # Send notifications
    send_start = timezone.now()
    errors = flip.send_alerts()
    for ch, error in errors:
        stdout.write("ERROR: %s %s %s\n" % (ch.kind, ch.value, error))

    # If sending took more than 5s, log it
    send_time = timezone.now() - send_start
    if send_time.total_seconds() > 5:
        stdout.write(SEND_TIME_TMPL % (send_time.total_seconds(), check.code))

    statsd.timing("hc.sendalerts.dwellTime", send_start - flip.created)
    statsd.timing("hc.sendalerts.sendTime", send_time)


def notify_on_thread(flip_id, stdout):
    """Run :func:`notify` for *flip_id* on a newly started thread.

    The thread is started and not joined; the call returns immediately.
    """
    t = Thread(target=notify, args=(flip_id, stdout))
    t.start()


class Command(BaseCommand):
    help = "Sends UP/DOWN email alerts"

    def add_arguments(self, parser):
        """Register the ``--no-loop`` and ``--no-threads`` options."""
        parser.add_argument(
            "--no-loop",
            action="store_false",
            dest="loop",
            default=True,
            help="Do not keep running indefinitely in a 2 second wait loop",
        )

        # NOTE(review): with action="store_false" and default=False the
        # parsed value of use_threads is False whether or not the flag is
        # given, so alerts are sent synchronously when the options come from
        # this parser. Left unchanged because flipping the default would
        # switch the command to threaded sending -- confirm the intent.
        parser.add_argument(
            "--no-threads",
            action="store_false",
            dest="use_threads",
            default=False,
            help="Send alerts synchronously, without using threads",
        )

    def process_one_flip(self, use_threads=True):
        """ Find unprocessed flip, send notifications.

        Returns False when there is no unprocessed flip, True otherwise
        (including when another worker claimed the flip first).
        """

        # Order by processed, otherwise Django will automatically order by id
        # and make the query less efficient
        q = Flip.objects.filter(processed=None).order_by("processed")
        flip = q.first()
        if flip is None:
            return False

        # Claim the flip: the update only matches while it is still
        # unprocessed.
        q = Flip.objects.filter(id=flip.id, processed=None)
        num_updated = q.update(processed=timezone.now())
        if num_updated != 1:
            # Nothing got updated: another worker process got there first.
            return True

        if use_threads:
            notify_on_thread(flip.id, self.stdout)
        else:
            notify(flip.id, self.stdout)

        return True

    def handle_going_down(self):
        """ Process a single check going down.

        Returns False when no check is due, True otherwise. Re-raises any
        exception raised by ``check.get_status`` after pushing the check's
        ``alert_after`` one hour into the future.
        """

        now = timezone.now()

        q = Check.objects.filter(alert_after__lt=now).exclude(status="down")
        # Sort by alert_after, to avoid unnecessary sorting by id:
        check = q.order_by("alert_after").first()
        if check is None:
            return False

        old_status = check.status
        q = Check.objects.filter(id=check.id, status=old_status)

        try:
            status = check.get_status(with_started=False)
        except Exception:
            # Make sure we don't trip on this check again for an hour:
            # Otherwise sendalerts may end up in a crash loop.
            q.update(alert_after=now + td(hours=1))
            # Then re-raise the exception:
            raise

        if status != "down":
            # It is not down yet. Update alert_after
            q.update(alert_after=check.going_down_after())
            return True

        # Atomically update status
        flip_time = check.going_down_after()
        num_updated = q.update(alert_after=None, status="down")
        if num_updated != 1:
            # Nothing got updated: another worker process got there first.
            return True

        flip = Flip(owner=check)
        flip.created = flip_time
        flip.old_status = old_status
        flip.new_status = "down"
        flip.save()

        return True

    def _ping_heartbeat(self):
        """Send a best-effort GET to HEARTBEAT_URL.

        The request is bounded by HEARTBEAT_TIMEOUT, and request failures are
        written to stdout instead of being raised, so a monitoring outage
        cannot stall or terminate alert delivery.
        """
        try:
            requests.get(HEARTBEAT_URL, timeout=HEARTBEAT_TIMEOUT)
        except requests.RequestException as e:
            self.stdout.write("ERROR: heartbeat ping failed: %s\n" % e)

    def handle(self, use_threads=True, loop=True, *args, **options):
        """Create flips for checks going down and send pending alerts.

        Args:
            use_threads: passed to ``process_one_flip``; when true each flip
                is notified on its own thread.
            loop: when true, repeat forever with a 2 second sleep between
                passes; when false, make a single pass and return.

        Returns:
            A summary string with the number of flips processed. Only
            reached when *loop* is false.
        """
        self.stdout.write("sendalerts is now running\n")

        i, sent = 0, 0
        while True:
            # Create flips for any checks going down
            while self.handle_going_down():
                pass

            # Process the unprocessed flips
            while self.process_one_flip(use_threads):
                sent += 1

            if not loop:
                break

            time.sleep(2)
            i += 1
            # Every 60th pass: write a MARK line and ping the heartbeat URL.
            if i % 60 == 0:
                timestamp = timezone.now().isoformat()
                self.stdout.write("-- MARK %s --\n" % timestamp)
                self._ping_heartbeat()

        return "Sent %d alert(s)" % sent