diff --git a/CHANGELOG.md b/CHANGELOG.md index f237c46b..1a45b857 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ All notable changes to this project will be documented in this file. ### Improvements - Switch from croniter to cronsim (vendored in hc.lib.cronsim) - Change outgoing webhook timeout to 10s, but cap the total time to 20s +- Implement automatic `api_ping` and `api_notification` pruning (#556) ### Bug Fixes - Fix hc.api.views.ping to handle non-utf8 data in request body (#574) diff --git a/README.md b/README.md index a08836cd..42f89a51 100644 --- a/README.md +++ b/README.md @@ -155,29 +155,17 @@ manager like [supervisor](http://supervisord.org/) or systemd. ## Database Cleanup -With time and use the Healthchecks database will grow in size. You may -decide to prune old data: inactive user accounts, old checks not assigned -to users, records of outgoing email messages and records of received pings. -There are separate Django management commands for each task: +Healthchecks deletes old entries from `api_ping` and `api_notification` +tables automatically. By default, Healthchecks keeps the 100 most recent +pings for every check. You can set the limit higher to keep a longer history: +go to the Administration Panel, look up the user's **Profile** and modify its +"Ping log limit" field. -* Remove old records from `api_ping` table. For each check, keep 100 most - recent pings: +For each check, Healthchecks removes notifications that are older than the +oldest stored ping for the same check. - ``` - $ ./manage.py prunepings - ``` - - Note: 100 is the default value but you can configure a different - limit per-user. To do that, go to the - Administration Panel, look up user's **Profile** and modify its - "Ping log limit" field. - -* Remove old records of sent notifications. For each check, remove - notifications that are older than the oldest stored ping for same check. 
- - ``` - $ ./manage.py prunenotifications - ``` +Healthchecks also provides management commands for cleaning up +`auth_user`, `api_tokenbucket` and `api_flip` tables. * Remove user accounts that match either of these conditions: * Account was created more than 6 months ago, and user has never logged in. diff --git a/hc/api/management/commands/prunepings.py b/hc/api/management/commands/prunepings.py deleted file mode 100644 index 8b85dd99..00000000 --- a/hc/api/management/commands/prunepings.py +++ /dev/null @@ -1,22 +0,0 @@ -from django.db.models import F -from django.contrib.auth.models import User -from django.core.management.base import BaseCommand -from hc.accounts.models import Profile -from hc.api.models import Ping - - -class Command(BaseCommand): - help = "Prune pings based on limits in user profiles" - - def handle(self, *args, **options): - # Create any missing user profiles - for user in User.objects.filter(profile=None): - Profile.objects.get_or_create(user_id=user.id) - - q = Ping.objects - q = q.annotate(limit=F("owner__project__owner__profile__ping_log_limit")) - q = q.filter(n__lte=F("owner__n_pings") - F("limit")) - q = q.filter(n__gt=0) - n_pruned, _ = q.delete() - - return "Done! Pruned %d pings" % n_pruned diff --git a/hc/api/management/commands/prunepingsslow.py b/hc/api/management/commands/prunepingsslow.py deleted file mode 100644 index b73a28a5..00000000 --- a/hc/api/management/commands/prunepingsslow.py +++ /dev/null @@ -1,36 +0,0 @@ -from django.db.models import F -from django.contrib.auth.models import User -from django.core.management.base import BaseCommand -from hc.accounts.models import Profile -from hc.api.models import Check, Ping - - -class Command(BaseCommand): - help = """Prune pings based on limits in user profiles. - - This command prunes each check individually. So it does the work - in small chunks instead of a few big SQL queries like the `prunepings` - command. 
It is appropriate for initial pruning of the potentially - huge api_ping table. - - """ - - def handle(self, *args, **options): - # Create any missing user profiles - for user in User.objects.filter(profile=None): - Profile.objects.get_or_create(user_id=user.id) - - checks = Check.objects - checks = checks.annotate(limit=F("project__owner__profile__ping_log_limit")) - - for check in checks: - q = Ping.objects.filter(owner_id=check.id) - q = q.filter(n__lte=check.n_pings - check.limit) - q = q.filter(n__gt=0) - n_pruned, _ = q.delete() - - self.stdout.write( - "Pruned %d pings for check %s (%s)" % (n_pruned, check.id, check.name) - ) - - return "Done!" diff --git a/hc/api/models.py b/hc/api/models.py index dd308ceb..f305a99f 100644 --- a/hc/api/models.py +++ b/hc/api/models.py @@ -329,6 +329,22 @@ class Check(models.Model): ping.exitstatus = exitstatus ping.save() + # Every 100 received pings, prune old pings and notifications: + if self.n_pings % 100 == 0: + self.prune() + + def prune(self): + """ Remove old pings and notifications. """ + + limit = self.project.owner_profile.ping_log_limit + self.ping_set.filter(n__lte=self.n_pings - limit).delete() + + try: + ping = self.ping_set.earliest("id") + self.notification_set.filter(created__lt=ping.created).delete() + except Ping.DoesNotExist: + pass + def downtimes(self, months): """ Calculate the number of downtimes and downtime minutes per month. 
diff --git a/hc/api/tests/test_check_model.py b/hc/api/tests/test_check_model.py index acbfb21b..f906de70 100644 --- a/hc/api/tests/test_check_model.py +++ b/hc/api/tests/test_check_model.py @@ -1,8 +1,8 @@ -from datetime import datetime, timedelta +from datetime import datetime, timedelta as td from unittest.mock import Mock, patch from django.utils import timezone -from hc.api.models import Check, Flip +from hc.api.models import Channel, Check, Flip, Notification, Ping from hc.test import BaseTestCase CURRENT_TIME = datetime(2020, 1, 15, tzinfo=timezone.utc) @@ -26,14 +26,14 @@ class CheckModelTestCase(BaseTestCase): def test_status_works_with_grace_period(self): check = Check() check.status = "up" - check.last_ping = timezone.now() - timedelta(days=1, minutes=30) + check.last_ping = timezone.now() - td(days=1, minutes=30) self.assertEqual(check.get_status(), "grace") def test_get_status_handles_paused_check(self): check = Check() check.status = "paused" - check.last_ping = timezone.now() - timedelta(days=1, minutes=30) + check.last_ping = timezone.now() - td(days=1, minutes=30) self.assertEqual(check.get_status(), "paused") def test_status_works_with_cron_syntax(self): @@ -47,15 +47,15 @@ class CheckModelTestCase(BaseTestCase): check.last_ping = dt # 23:59pm - now = dt + timedelta(hours=23, minutes=59) + now = dt + td(hours=23, minutes=59) self.assertEqual(check.get_status(now), "up") # 00:00am - now = dt + timedelta(days=1) + now = dt + td(days=1) self.assertEqual(check.get_status(now), "grace") # 1:30am - now = dt + timedelta(days=1, minutes=60) + now = dt + td(days=1, minutes=60) self.assertEqual(check.get_status(now), "down") def test_status_works_with_timezone(self): @@ -70,36 +70,36 @@ class CheckModelTestCase(BaseTestCase): check.tz = "Australia/Brisbane" # UTC+10 # 10:30am - now = dt + timedelta(hours=23, minutes=59) + now = dt + td(hours=23, minutes=59) self.assertEqual(check.get_status(now), "up") # 10:30am - now = dt + timedelta(days=1) + now = dt + 
td(days=1) self.assertEqual(check.get_status(now), "grace") # 11:30am - now = dt + timedelta(days=1, minutes=60) + now = dt + td(days=1, minutes=60) self.assertEqual(check.get_status(now), "down") def test_get_status_handles_past_grace(self): check = Check() check.status = "up" - check.last_ping = timezone.now() - timedelta(days=2) + check.last_ping = timezone.now() - td(days=2) self.assertEqual(check.get_status(), "down") def test_get_status_obeys_down_status(self): check = Check() check.status = "down" - check.last_ping = timezone.now() - timedelta(minutes=1) + check.last_ping = timezone.now() - td(minutes=1) self.assertEqual(check.get_status(), "down") def test_get_status_handles_started(self): check = Check() - check.last_ping = timezone.now() - timedelta(hours=2) + check.last_ping = timezone.now() - td(hours=2) # Last start was 5 minutes ago, display status should be "started" - check.last_start = timezone.now() - timedelta(minutes=5) + check.last_start = timezone.now() - td(minutes=5) for status in ("new", "paused", "up", "down"): check.status = status self.assertEqual(check.get_status(with_started=True), "started") @@ -107,9 +107,9 @@ class CheckModelTestCase(BaseTestCase): def test_get_status_handles_down_then_started_and_expired(self): check = Check(status="down") # Last ping was 2 days ago - check.last_ping = timezone.now() - timedelta(days=2) + check.last_ping = timezone.now() - td(days=2) # Last start was 2 hours ago - the check is past its grace time - check.last_start = timezone.now() - timedelta(hours=2) + check.last_start = timezone.now() - td(hours=2) self.assertEqual(check.get_status(with_started=True), "down") self.assertEqual(check.get_status(), "down") @@ -117,9 +117,9 @@ class CheckModelTestCase(BaseTestCase): def test_get_status_handles_up_then_started(self): check = Check(status="up") # Last ping was 2 hours ago, so is still up - check.last_ping = timezone.now() - timedelta(hours=2) + check.last_ping = timezone.now() - td(hours=2) # Last 
start was 5 minutes ago - check.last_start = timezone.now() - timedelta(minutes=5) + check.last_start = timezone.now() - td(minutes=5) self.assertEqual(check.get_status(with_started=True), "started") # A started check still is considered "up": @@ -128,9 +128,9 @@ class CheckModelTestCase(BaseTestCase): def test_get_status_handles_up_then_started_and_expired(self): check = Check(status="up") # Last ping was 3 hours ago, so is still up - check.last_ping = timezone.now() - timedelta(hours=3) + check.last_ping = timezone.now() - td(hours=3) # Last start was 2 hours ago - the check is past its grace time - check.last_start = timezone.now() - timedelta(hours=2) + check.last_start = timezone.now() - td(hours=2) self.assertEqual(check.get_status(with_started=True), "down") self.assertEqual(check.get_status(), "down") @@ -138,14 +138,14 @@ class CheckModelTestCase(BaseTestCase): def test_get_status_handles_paused_then_started_and_expired(self): check = Check(status="paused") # Last start was 2 hours ago - the check is past its grace time - check.last_start = timezone.now() - timedelta(hours=2) + check.last_start = timezone.now() - td(hours=2) self.assertEqual(check.get_status(with_started=True), "down") self.assertEqual(check.get_status(), "down") def test_get_status_handles_started_and_mia(self): check = Check() - check.last_start = timezone.now() - timedelta(hours=2) + check.last_start = timezone.now() - td(hours=2) self.assertEqual(check.get_status(with_started=True), "down") self.assertEqual(check.get_status(), "down") @@ -174,17 +174,17 @@ class CheckModelTestCase(BaseTestCase): # Nov. 2019 self.assertEqual(nov[0].strftime("%m-%Y"), "11-2019") - self.assertEqual(nov[1], timedelta()) + self.assertEqual(nov[1], td()) self.assertEqual(nov[2], 0) # Dec. 2019 self.assertEqual(dec[0].strftime("%m-%Y"), "12-2019") - self.assertEqual(dec[1], timedelta()) + self.assertEqual(dec[1], td()) self.assertEqual(dec[2], 0) # Jan. 
2020 self.assertEqual(jan[0].strftime("%m-%Y"), "01-2020") - self.assertEqual(jan[1], timedelta()) + self.assertEqual(jan[1], td()) self.assertEqual(jan[2], 0) @patch("hc.api.models.timezone.now", MOCK_NOW) @@ -259,5 +259,24 @@ class CheckModelTestCase(BaseTestCase): self.assertIsNone(dec[2]) # Jan. 2020 - self.assertEqual(jan[1], timedelta()) + self.assertEqual(jan[1], td()) self.assertEqual(jan[2], 0) + + def test_it_prunes(self): + check = Check.objects.create(project=self.project, n_pings=101) + Ping.objects.create(owner=check, n=101) + Ping.objects.create(owner=check, n=1) + + n = Notification(owner=check) + n.channel = Channel.objects.create(project=self.project, kind="email") + n.check_status = "down" + n.save() + n.created = check.created - td(minutes=10) + n.save() + + check.prune() + + self.assertTrue(Ping.objects.filter(n=101).exists()) + self.assertFalse(Ping.objects.filter(n=1).exists()) + + self.assertEqual(Notification.objects.count(), 0) diff --git a/hc/api/tests/test_prunepings.py b/hc/api/tests/test_prunepings.py deleted file mode 100644 index 9c28262e..00000000 --- a/hc/api/tests/test_prunepings.py +++ /dev/null @@ -1,24 +0,0 @@ -from datetime import timedelta - -from django.utils import timezone -from hc.api.management.commands.prunepings import Command -from hc.api.models import Check, Ping -from hc.test import BaseTestCase - - -class PrunePingsTestCase(BaseTestCase): - year_ago = timezone.now() - timedelta(days=365) - - def test_it_removes_old_pings(self): - self.profile.ping_log_limit = 1 - self.profile.save() - - c = Check(project=self.project, n_pings=2) - c.save() - - Ping.objects.create(owner=c, n=1) - Ping.objects.create(owner=c, n=2) - - Command().handle() - - self.assertEqual(Ping.objects.count(), 1) diff --git a/hc/api/tests/test_prunepingsslow.py b/hc/api/tests/test_prunepingsslow.py deleted file mode 100644 index fb0f3a25..00000000 --- a/hc/api/tests/test_prunepingsslow.py +++ /dev/null @@ -1,25 +0,0 @@ -from datetime import 
timedelta -from unittest.mock import Mock - -from django.utils import timezone -from hc.api.management.commands.prunepingsslow import Command -from hc.api.models import Check, Ping -from hc.test import BaseTestCase - - -class PrunePingsSlowTestCase(BaseTestCase): - year_ago = timezone.now() - timedelta(days=365) - - def test_it_removes_old_pings(self): - self.profile.ping_log_limit = 1 - self.profile.save() - - c = Check(project=self.project, n_pings=2) - c.save() - - Ping.objects.create(owner=c, n=1) - Ping.objects.create(owner=c, n=2) - - Command(stdout=Mock()).handle() - - self.assertEqual(Ping.objects.count(), 1)