From 21a042aa16eb14487a68a8e43a5a88994de7dcb5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?P=C4=93teris=20Caune?=
Date: Sun, 31 Jan 2016 18:35:24 +0200
Subject: [PATCH] Move notification logic to hc.api.transports. Don't use "paused" state for checks.

---
 hc/api/admin.py                              |   2 +-
 hc/api/management/commands/sendalerts.py     |  22 +--
 hc/api/migrations/0022_auto_20160130_2042.py |  24 +++
 hc/api/models.py                             | 124 ++++------------
 hc/api/tests/test_notify.py                  |  25 ++--
 hc/api/transports.py                         | 145 +++++++++++++++++++
 templates/integrations/pd_description.html   |   5 +
 templates/integrations/pushover_title.html   |   5 +
 8 files changed, 230 insertions(+), 122 deletions(-)
 create mode 100644 hc/api/migrations/0022_auto_20160130_2042.py
 create mode 100644 hc/api/transports.py
 create mode 100644 templates/integrations/pd_description.html
 create mode 100644 templates/integrations/pushover_title.html

diff --git a/hc/api/admin.py b/hc/api/admin.py
index cde781dc..6538d083 100644
--- a/hc/api/admin.py
+++ b/hc/api/admin.py
@@ -182,7 +182,7 @@ class NotificationsAdmin(admin.ModelAdmin):
     search_fields = ["owner__name", "owner__code", "channel__value"]
     list_select_related = ("owner", "channel")
     list_display = ("id", "created", "check_status", "check_name",
-                    "channel_kind", "channel_value", "status")
+                    "channel_kind", "channel_value")
     list_filter = ("created", "check_status", "channel__kind")

     def check_name(self, obj):
diff --git a/hc/api/management/commands/sendalerts.py b/hc/api/management/commands/sendalerts.py
index 68666a8b..ca9b33c8 100644
--- a/hc/api/management/commands/sendalerts.py
+++ b/hc/api/management/commands/sendalerts.py
@@ -39,27 +39,17 @@ class Command(BaseCommand):
         Return False if no checks need to be processed.

         """
+
+        # Save the new status. If sendalerts crashes,
+        # it won't process this check again.
         check.status = check.get_status()
+        check.save()

         tmpl = "\nSending alert, status=%s, code=%s\n"
         self.stdout.write(tmpl % (check.status, check.code))
+        check.send_alert()

-        try:
-            check.send_alert()
-        except:
-            # Catch EVERYTHING. If we crash here, what can happen is:
-            # - the sendalerts command will crash
-            # - supervisor will respawn sendalerts command
-            # - sendalerts will try same thing again, resulting in
-            # infinite loop
-            # So instead we catch and log all exceptions, and mark
-            # the checks as paused so they are not retried.
-            logger.error("Could not alert %s" % check.code, exc_info=True)
-            check.status = "paused"
-        finally:
-            check.save()
-            connection.close()
-
+        connection.close()
         return True

     def handle(self, *args, **options):
diff --git a/hc/api/migrations/0022_auto_20160130_2042.py b/hc/api/migrations/0022_auto_20160130_2042.py
new file mode 100644
index 00000000..f1b940d4
--- /dev/null
+++ b/hc/api/migrations/0022_auto_20160130_2042.py
@@ -0,0 +1,24 @@
+# -*- coding: utf-8 -*-
+# Generated by Django 1.9 on 2016-01-30 20:42
+from __future__ import unicode_literals
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('api', '0021_ping_n'),
+    ]
+
+    operations = [
+        migrations.RemoveField(
+            model_name='notification',
+            name='status',
+        ),
+        migrations.AddField(
+            model_name='notification',
+            name='error',
+            field=models.CharField(blank=True, max_length=200),
+        ),
+    ]
diff --git a/hc/api/models.py b/hc/api/models.py
index bfab573a..a281be46 100644
--- a/hc/api/models.py
+++ b/hc/api/models.py
@@ -1,17 +1,15 @@
 # coding: utf-8

 import hashlib
-import json
 import uuid
 from datetime import timedelta as td

-import requests
 from django.conf import settings
 from django.contrib.auth.models import User
 from django.core.urlresolvers import reverse
 from django.db import models
-from django.template.loader import render_to_string
 from django.utils import timezone
+from hc.api import transports
 from hc.lib import emails

 STATUSES = (
@@ -125,103 +123,37 @@ class Channel(models.Model):
         verify_link = settings.SITE_ROOT + verify_link
         emails.verify_email(self.value, {"verify_link": verify_link})

-    def notify(self, check):
-        n = Notification(owner=check, channel=self)
-        n.check_status = check.status
-
-        if self.kind == "email" and self.email_verified:
-            ctx = {
-                "check": check,
-                "checks": self.user.check_set.order_by("created"),
-                "now": timezone.now()
-            }
-            emails.alert(self.value, ctx)
-            n.save()
-        elif self.kind == "webhook" and check.status == "down":
-            try:
-                headers = {"User-Agent": "healthchecks.io"}
-                r = requests.get(self.value, timeout=5, headers=headers)
-                n.status = r.status_code
-            except requests.exceptions.Timeout:
-                # Well, we tried
-                pass
-
-            n.save()
+    @property
+    def transport(self):
+        if self.kind == "email":
+            return transports.Email(self)
+        elif self.kind == "webhook":
+            return transports.Webhook(self)
         elif self.kind == "slack":
-            tmpl = "integrations/slack_message.json"
-            text = render_to_string(tmpl, {"check": check})
-            payload = json.loads(text)
-            r = requests.post(self.value, json=payload, timeout=5)
-
-            n.status = r.status_code
-            n.save()
+            return transports.Slack(self)
         elif self.kind == "hipchat":
-            tmpl = "integrations/hipchat_message.html"
-            text = render_to_string(tmpl, {"check": check})
-            payload = {
-                "message": text,
-                "color": "green" if check.status == "up" else "red",
-            }
-
-            r = requests.post(self.value, json=payload, timeout=5)
+            return transports.HipChat(self)
+        elif self.kind == "pd":
+            return transports.PagerDuty(self)
+        elif self.kind == "po":
+            return transports.Pushover(self)
+        else:
+            raise NotImplementedError("Unknown channel kind: %s" % self.kind)

-            n.status = r.status_code
-            n.save()
+    def notify(self, check):
+        # Make up to 3 attempts; stop at the first success.
+        for x in range(0, 3):
+            error = self.transport.notify(check) or ""
+            if error == "":
+                break  # Success!

-        elif self.kind == "pd":
-            if check.status == "down":
-                event_type = "trigger"
-                description = "%s is DOWN" % check.name_then_code()
-            else:
-                event_type = "resolve"
-                description = "%s received a ping and is now UP" % \
-                    check.name_then_code()
-
-            payload = {
-                "service_key": self.value,
-                "incident_key": str(check.code),
-                "event_type": event_type,
-                "description": description,
-                "client": "healthchecks.io",
-                "client_url": settings.SITE_ROOT
-            }
-
-            url = "https://events.pagerduty.com/generic/2010-04-15/create_event.json"
-            r = requests.post(url, data=json.dumps(payload), timeout=5)
-
-            n.status = r.status_code
-            n.save()
+        n = Notification(owner=check, channel=self)
+        n.check_status = check.status
+        n.error = error
+        n.save()

-        elif self.kind == "po":
-            tmpl = "integrations/pushover_message.html"
-            ctx = {
-                "check": check,
-                "down_checks": self.user.check_set.filter(status="down").exclude(code=check.code).order_by("created"),
-            }
-            text = render_to_string(tmpl, ctx).strip()
-            if check.status == "down":
-                title = "%s is DOWN" % check.name_then_code()
-            else:
-                title = "%s is now UP" % check.name_then_code()
-
-            user_key, priority, _ = self.po_value
-            payload = {
-                "token": settings.PUSHOVER_API_TOKEN,
-                "user": user_key,
-                "message": text,
-                "title": title,
-                "html": 1,
-                "priority": priority,
-            }
-            if priority == 2:  # Emergency notification
-                payload["retry"] = settings.PUSHOVER_EMERGENCY_RETRY_DELAY
-                payload["expire"] = settings.PUSHOVER_EMERGENCY_EXPIRATION
-
-            url = "https://api.pushover.net/1/messages.json"
-            r = requests.post(url, data=payload, timeout=5)
-
-            n.status = r.status_code
-            n.save()
+    def test(self):
+        return self.transport.test()

     @property
     def po_value(self):
@@ -236,4 +168,4 @@ class Notification(models.Model):
     check_status = models.CharField(max_length=6)
     channel = models.ForeignKey(Channel)
     created = models.DateTimeField(auto_now_add=True)
-    status = models.IntegerField(default=0)
+    error = models.CharField(max_length=200, blank=True)
diff --git a/hc/api/tests/test_notify.py b/hc/api/tests/test_notify.py
index 1b3b0484..00c435af 100644
--- a/hc/api/tests/test_notify.py
+++ b/hc/api/tests/test_notify.py
@@ -20,7 +20,7 @@ class NotifyTestCase(BaseTestCase):
         self.channel.save()
         self.channel.checks.add(self.check)

-    @patch("hc.api.models.requests.get")
+    @patch("hc.api.transports.requests.get")
     def test_webhook(self, mock_get):
         self._setup_data("webhook", "http://example")
         mock_get.return_value.status_code = 200
@@ -30,16 +30,20 @@ class NotifyTestCase(BaseTestCase):
             u"http://example", headers={"User-Agent": "healthchecks.io"},
             timeout=5)

-    @patch("hc.api.models.requests.get", side_effect=ReadTimeout)
+    @patch("hc.api.transports.requests.get", side_effect=ReadTimeout)
     def test_webhooks_handle_timeouts(self, mock_get):
         self._setup_data("webhook", "http://example")
         self.channel.notify(self.check)
-        assert Notification.objects.count() == 1
+
+        n = Notification.objects.get()
+        self.assertEqual(n.error, "Connection timed out")

     def test_email(self):
         self._setup_data("email", "alice@example.org")
         self.channel.notify(self.check)
-        assert Notification.objects.count() == 1
+
+        n = Notification.objects.get()
+        self.assertEqual(n.error, "")

         # And email should have been sent
         self.assertEqual(len(mail.outbox), 1)
@@ -48,21 +52,24 @@ class NotifyTestCase(BaseTestCase):
         self._setup_data("email", "alice@example.org", email_verified=False)
         self.channel.notify(self.check)

-        assert Notification.objects.count() == 0
+        assert Notification.objects.count() == 1
+        n = Notification.objects.first()
+        self.assertEqual(n.error, "Email not verified")
         self.assertEqual(len(mail.outbox), 0)

-    @patch("hc.api.models.requests.post")
+    @patch("hc.api.transports.JsonTransport.post")
     def test_pd(self, mock_post):
         self._setup_data("pd", "123")
-        mock_post.return_value.status_code = 200
+        mock_post.return_value = None

         self.channel.notify(self.check)
         assert Notification.objects.count() == 1

         args, kwargs = mock_post.call_args
-        assert "trigger" in kwargs["data"]
+        payload = args[1]
+        self.assertEqual(payload["event_type"], "trigger")

-    @patch("hc.api.models.requests.post")
+    @patch("hc.api.transports.requests.post")
     def test_slack(self, mock_post):
         self._setup_data("slack", "123")
         mock_post.return_value.status_code = 200
diff --git a/hc/api/transports.py b/hc/api/transports.py
new file mode 100644
index 00000000..43a982f3
--- /dev/null
+++ b/hc/api/transports.py
@@ -0,0 +1,145 @@
+from django.conf import settings
+from django.template.loader import render_to_string
+from django.utils import timezone
+import json
+import requests
+
+from hc.lib import emails
+
+
+def tmpl(template_name, **ctx):
+    template_path = "integrations/%s" % template_name
+    return render_to_string(template_path, ctx).strip()
+
+
+class Transport(object):
+    def __init__(self, channel):
+        self.channel = channel
+
+    def notify(self, check):
+        """ Send notification about current status of the check.
+
+        This method returns None on success, and error message
+        on error.
+
+        """
+
+        raise NotImplementedError()
+
+    def test(self):
+        """ Send test message.
+
+        This method returns None on success, and error message
+        on error.
+
+        """
+
+        raise NotImplementedError()
+
+    def checks(self):
+        return self.channel.user.check_set.order_by("created")
+
+
+class Email(Transport):
+    def notify(self, check):
+        if not self.channel.email_verified:
+            return "Email not verified"
+
+        ctx = {
+            "check": check,
+            "checks": self.checks(),
+            "now": timezone.now()
+        }
+        emails.alert(self.channel.value, ctx)
+
+
+class Webhook(Transport):
+    def notify(self, check):
+        # Webhook integration only fires when check goes down.
+        if check.status != "down":
+            return
+
+        # Webhook transport sends no arguments, so the
+        # notify and test actions are the same
+        return self.test()
+
+    def test(self):
+        headers = {"User-Agent": "healthchecks.io"}
+        try:
+            r = requests.get(self.channel.value, timeout=5, headers=headers)
+            if r.status_code not in (200, 201):
+                return "Received status code %d" % r.status_code
+        except requests.exceptions.Timeout:
+            # Well, we tried
+            return "Connection timed out"
+
+
+class JsonTransport(Transport):
+    def post(self, url, payload):
+        headers = {"User-Agent": "healthchecks.io"}
+        r = requests.post(url, json=payload, timeout=5, headers=headers)
+        if r.status_code not in (200, 201):
+            return "Received status code %d" % r.status_code
+
+
+class Slack(JsonTransport):
+    def notify(self, check):
+        text = tmpl("slack_message.json", check=check)
+        payload = json.loads(text)
+        return self.post(self.channel.value, payload)
+
+
+class HipChat(JsonTransport):
+    def notify(self, check):
+        text = tmpl("hipchat_message.html", check=check)
+        payload = {
+            "message": text,
+            "color": "green" if check.status == "up" else "red",
+        }
+        return self.post(self.channel.value, payload)
+
+
+class PagerDuty(JsonTransport):
+    URL = "https://events.pagerduty.com/generic/2010-04-15/create_event.json"
+
+    def notify(self, check):
+        description = tmpl("pd_description.html", check=check)
+        payload = {
+            "service_key": self.channel.value,
+            "incident_key": str(check.code),
+            "event_type": "trigger" if check.status == "down" else "resolve",
+            "description": description,
+            "client": "healthchecks.io",
+            "client_url": settings.SITE_ROOT
+        }
+
+        return self.post(self.URL, payload)
+
+
+class Pushover(JsonTransport):
+    URL = "https://api.pushover.net/1/messages.json"
+
+    def notify(self, check):
+        others = self.checks().filter(status="down").exclude(code=check.code)
+        ctx = {
+            "check": check,
+            "down_checks": others,
+        }
+        text = tmpl("pushover_message.html", **ctx)
+        title = tmpl("pushover_title.html", **ctx)
+        user_key, prio = self.channel.value.split("|")
+        payload = {
+            "token": settings.PUSHOVER_API_TOKEN,
+            "user": user_key,
+            "message": text,
+            "title": title,
+            "html": 1,
+            "priority": int(prio),
+        }
+
+        # Emergency notification
+        if prio == "2":
+            payload["retry"] = settings.PUSHOVER_EMERGENCY_RETRY_DELAY
+            payload["expire"] = settings.PUSHOVER_EMERGENCY_EXPIRATION
+
+        return self.post(self.URL, payload)
diff --git a/templates/integrations/pd_description.html b/templates/integrations/pd_description.html
new file mode 100644
index 00000000..7326efa7
--- /dev/null
+++ b/templates/integrations/pd_description.html
@@ -0,0 +1,5 @@
+{% if check.status == "down" %}
+    {{ check.name_then_code|safe }} is DOWN
+{% else %}
+    {{ check.name_then_code|safe }} received a ping and is now UP
+{% endif %}
\ No newline at end of file
diff --git a/templates/integrations/pushover_title.html b/templates/integrations/pushover_title.html
new file mode 100644
index 00000000..42a26764
--- /dev/null
+++ b/templates/integrations/pushover_title.html
@@ -0,0 +1,5 @@
+{% if check.status == "down" %}
+    {{ check.name_then_code|safe }} is DOWN
+{% else %}
+    {{ check.name_then_code|safe }} is now UP
+{% endif %}
\ No newline at end of file