Browse Source

Handling database connectivity issues

pull/490/head
James Kirsop 4 years ago
parent
commit
ab1583dec4
1 changed files with 35 additions and 26 deletions
  1. +35
    -26
      hc/api/management/commands/sendalerts.py

+ 35
- 26
hc/api/management/commands/sendalerts.py View File

@ -4,6 +4,7 @@ from threading import Thread
from django.core.management.base import BaseCommand from django.core.management.base import BaseCommand
from django.utils import timezone from django.utils import timezone
from django.db.utils import OperationalError
from hc.api.models import Check, Flip from hc.api.models import Check, Flip
from statsd.defaults.env import statsd from statsd.defaults.env import statsd
@ -109,33 +110,41 @@ class Command(BaseCommand):
old_status = check.status old_status = check.status
q = Check.objects.filter(id=check.id, status=old_status) q = Check.objects.filter(id=check.id, status=old_status)
try:
status = check.get_status()
except Exception as e:
# Make sure we don't trip on this check again for an hour:
# Otherwise sendalerts may end up in a crash loop.
q.update(alert_after=now + td(hours=1))
# Then re-raise the exception:
raise e
if status != "down":
# It is not down yet. Update alert_after
q.update(alert_after=check.going_down_after())
return True
# Atomically update status
flip_time = check.going_down_after()
num_updated = q.update(alert_after=None, status="down")
if num_updated != 1:
# Nothing got updated: another worker process got there first.
return True
flip = Flip(owner=check)
flip.created = flip_time
flip.old_status = old_status
flip.new_status = "down"
flip.save()
counter = 0
while counter < 5:
try:
try:
status = check.get_status()
except Exception as e:
# Make sure we don't trip on this check again for an hour:
# Otherwise sendalerts may end up in a crash loop.
q.update(alert_after=now + td(hours=1))
# Then re-raise the exception:
raise e
if status != "down":
# It is not down yet. Update alert_after
q.update(alert_after=check.going_down_after())
return True
# Atomically update status
flip_time = check.going_down_after()
num_updated = q.update(alert_after=None, status="down")
if num_updated != 1:
# Nothing got updated: another worker process got there first.
return True
flip = Flip(owner=check)
flip.created = flip_time
flip.old_status = old_status
flip.new_status = "down"
flip.save()
except OperationalError:
counter += 1
time.sleep(2*counter)
else:
break
return True return True
def handle(self, use_threads=True, loop=True, *args, **options): def handle(self, use_threads=True, loop=True, *args, **options):


Loading…
Cancel
Save