Browse Source

Implement automatic `api_ping` and `api_notification` pruning

cc: #556
master
Pēteris Caune 3 years ago
parent
commit
e1f51093f1
No known key found for this signature in database GPG Key ID: E28D7679E9A9EDE2
8 changed files with 71 additions and 154 deletions
  1. +1
    -0
      CHANGELOG.md
  2. +9
    -21
      README.md
  3. +0
    -22
      hc/api/management/commands/prunepings.py
  4. +0
    -36
      hc/api/management/commands/prunepingsslow.py
  5. +16
    -0
      hc/api/models.py
  6. +45
    -26
      hc/api/tests/test_check_model.py
  7. +0
    -24
      hc/api/tests/test_prunepings.py
  8. +0
    -25
      hc/api/tests/test_prunepingsslow.py

+ 1
- 0
CHANGELOG.md View File

@ -6,6 +6,7 @@ All notable changes to this project will be documented in this file.
### Improvements
- Switch from croniter to cronsim (vendored in hc.lib.cronsim)
- Change outgoing webhook timeout to 10s, but cap the total time to 20s
- Implement automatic `api_ping` and `api_notification` pruning (#556)
### Bug Fixes
- Fix hc.api.views.ping to handle non-utf8 data in request body (#574)


+ 9
- 21
README.md View File

@ -155,29 +155,17 @@ manager like [supervisor](http://supervisord.org/) or systemd.
## Database Cleanup
With time and use the Healthchecks database will grow in size. You may
decide to prune old data: inactive user accounts, old checks not assigned
to users, records of outgoing email messages and records of received pings.
There are separate Django management commands for each task:
Healthchecks deletes old entries from the `api_ping` and `api_notification`
tables automatically. By default, Healthchecks keeps the 100 most recent
pings for every check. You can set the limit higher to keep a longer history:
go to the Administration Panel, look up the user's **Profile** and modify its
"Ping log limit" field.
* Remove old records from `api_ping` table. For each check, keep 100 most
recent pings:
For each check, Healthchecks removes notifications that are older than the
oldest stored ping for the same check.
```
$ ./manage.py prunepings
```
Note: 100 is the default value but you can configure a different
limit per-user. To do that, go to the
Administration Panel, look up user's **Profile** and modify its
"Ping log limit" field.
* Remove old records of sent notifications. For each check, remove
notifications that are older than the oldest stored ping for same check.
```
$ ./manage.py prunenotifications
```
Healthchecks also provides management commands for cleaning up
`auth_user`, `api_tokenbucket` and `api_flip` tables.
* Remove user accounts that match either of these conditions:
* Account was created more than 6 months ago, and user has never logged in.


+ 0
- 22
hc/api/management/commands/prunepings.py View File

@ -1,22 +0,0 @@
from django.db.models import F
from django.contrib.auth.models import User
from django.core.management.base import BaseCommand
from hc.accounts.models import Profile
from hc.api.models import Ping
class Command(BaseCommand):
    """Management command that prunes pings using per-profile log limits."""

    help = "Prune pings based on limits in user profiles"

    def handle(self, *args, **options):
        """Backfill missing profiles, delete surplus pings, return a summary."""
        # The limit is read through the profile relation, so make sure
        # every user has a profile row before building the delete query.
        users_without_profile = User.objects.filter(profile=None)
        for account in users_without_profile:
            Profile.objects.get_or_create(user_id=account.id)

        limit_field = F("owner__project__owner__profile__ping_log_limit")
        stale = (
            Ping.objects.annotate(limit=limit_field)
            # A ping is stale when its sequence number is at or below
            # (pings received by its check) - (profile's log limit).
            .filter(n__lte=F("owner__n_pings") - F("limit"))
            .filter(n__gt=0)
        )

        # delete() returns (total_deleted, per_model_counts); only the
        # total goes into the summary string.
        deleted_total = stale.delete()[0]
        return "Done! Pruned %d pings" % deleted_total

+ 0
- 36
hc/api/management/commands/prunepingsslow.py View File

@ -1,36 +0,0 @@
from django.db.models import F
from django.contrib.auth.models import User
from django.core.management.base import BaseCommand
from hc.accounts.models import Profile
from hc.api.models import Check, Ping
class Command(BaseCommand):
    help = """Prune pings based on limits in user profiles.
This command prunes each check individually. So it does the work
in small chunks instead of a few big SQL queries like the `prunepings`
command. It is appropriate for initial pruning of the potentially
huge api_ping table.
"""

    def handle(self, *args, **options):
        """Prune pings one check at a time, reporting progress to stdout."""
        # The limit is read through the profile relation, so make sure
        # every user has a profile row first.
        for account in User.objects.filter(profile=None):
            Profile.objects.get_or_create(user_id=account.id)

        annotated_checks = Check.objects.annotate(
            limit=F("project__owner__profile__ping_log_limit")
        )

        for check in annotated_checks:
            # Pings numbered at or below this threshold fall outside the
            # check's allowed history.
            threshold = check.n_pings - check.limit
            stale = Ping.objects.filter(
                owner_id=check.id, n__lte=threshold, n__gt=0
            )
            n_pruned = stale.delete()[0]

            report = "Pruned %d pings for check %s (%s)" % (
                n_pruned,
                check.id,
                check.name,
            )
            self.stdout.write(report)

        return "Done!"

+ 16
- 0
hc/api/models.py View File

@ -329,6 +329,22 @@ class Check(models.Model):
ping.exitstatus = exitstatus
ping.save()
# Every 100 received pings, prune old pings and notifications:
if self.n_pings % 100 == 0:
self.prune()
def prune(self):
    """Delete this check's surplus pings and outdated notifications.

    Pings whose sequence number ``n`` is at or below
    ``n_pings - ping_log_limit`` (limit taken from the project owner's
    profile) are removed. Afterwards, notifications created before the
    earliest remaining ping are removed too. If no pings remain,
    notifications are left untouched.
    """
    keep = self.project.owner_profile.ping_log_limit
    cutoff_n = self.n_pings - keep
    stale_pings = self.ping_set.filter(n__lte=cutoff_n)
    stale_pings.delete()

    try:
        # earliest() raises Ping.DoesNotExist when the check has no pings.
        oldest_kept = self.ping_set.earliest("id")
        outdated = self.notification_set.filter(created__lt=oldest_kept.created)
        outdated.delete()
    except Ping.DoesNotExist:
        pass
def downtimes(self, months):
""" Calculate the number of downtimes and downtime minutes per month.


+ 45
- 26
hc/api/tests/test_check_model.py View File

@ -1,8 +1,8 @@
from datetime import datetime, timedelta
from datetime import datetime, timedelta as td
from unittest.mock import Mock, patch
from django.utils import timezone
from hc.api.models import Check, Flip
from hc.api.models import Channel, Check, Flip, Notification, Ping
from hc.test import BaseTestCase
CURRENT_TIME = datetime(2020, 1, 15, tzinfo=timezone.utc)
@ -26,14 +26,14 @@ class CheckModelTestCase(BaseTestCase):
def test_status_works_with_grace_period(self):
check = Check()
check.status = "up"
check.last_ping = timezone.now() - timedelta(days=1, minutes=30)
check.last_ping = timezone.now() - td(days=1, minutes=30)
self.assertEqual(check.get_status(), "grace")
def test_get_status_handles_paused_check(self):
check = Check()
check.status = "paused"
check.last_ping = timezone.now() - timedelta(days=1, minutes=30)
check.last_ping = timezone.now() - td(days=1, minutes=30)
self.assertEqual(check.get_status(), "paused")
def test_status_works_with_cron_syntax(self):
@ -47,15 +47,15 @@ class CheckModelTestCase(BaseTestCase):
check.last_ping = dt
# 23:59pm
now = dt + timedelta(hours=23, minutes=59)
now = dt + td(hours=23, minutes=59)
self.assertEqual(check.get_status(now), "up")
# 00:00am
now = dt + timedelta(days=1)
now = dt + td(days=1)
self.assertEqual(check.get_status(now), "grace")
# 1:30am
now = dt + timedelta(days=1, minutes=60)
now = dt + td(days=1, minutes=60)
self.assertEqual(check.get_status(now), "down")
def test_status_works_with_timezone(self):
@ -70,36 +70,36 @@ class CheckModelTestCase(BaseTestCase):
check.tz = "Australia/Brisbane" # UTC+10
# 10:30am
now = dt + timedelta(hours=23, minutes=59)
now = dt + td(hours=23, minutes=59)
self.assertEqual(check.get_status(now), "up")
# 10:30am
now = dt + timedelta(days=1)
now = dt + td(days=1)
self.assertEqual(check.get_status(now), "grace")
# 11:30am
now = dt + timedelta(days=1, minutes=60)
now = dt + td(days=1, minutes=60)
self.assertEqual(check.get_status(now), "down")
def test_get_status_handles_past_grace(self):
check = Check()
check.status = "up"
check.last_ping = timezone.now() - timedelta(days=2)
check.last_ping = timezone.now() - td(days=2)
self.assertEqual(check.get_status(), "down")
def test_get_status_obeys_down_status(self):
check = Check()
check.status = "down"
check.last_ping = timezone.now() - timedelta(minutes=1)
check.last_ping = timezone.now() - td(minutes=1)
self.assertEqual(check.get_status(), "down")
def test_get_status_handles_started(self):
check = Check()
check.last_ping = timezone.now() - timedelta(hours=2)
check.last_ping = timezone.now() - td(hours=2)
# Last start was 5 minutes ago, display status should be "started"
check.last_start = timezone.now() - timedelta(minutes=5)
check.last_start = timezone.now() - td(minutes=5)
for status in ("new", "paused", "up", "down"):
check.status = status
self.assertEqual(check.get_status(with_started=True), "started")
@ -107,9 +107,9 @@ class CheckModelTestCase(BaseTestCase):
def test_get_status_handles_down_then_started_and_expired(self):
check = Check(status="down")
# Last ping was 2 days ago
check.last_ping = timezone.now() - timedelta(days=2)
check.last_ping = timezone.now() - td(days=2)
# Last start was 2 hours ago - the check is past its grace time
check.last_start = timezone.now() - timedelta(hours=2)
check.last_start = timezone.now() - td(hours=2)
self.assertEqual(check.get_status(with_started=True), "down")
self.assertEqual(check.get_status(), "down")
@ -117,9 +117,9 @@ class CheckModelTestCase(BaseTestCase):
def test_get_status_handles_up_then_started(self):
check = Check(status="up")
# Last ping was 2 hours ago, so is still up
check.last_ping = timezone.now() - timedelta(hours=2)
check.last_ping = timezone.now() - td(hours=2)
# Last start was 5 minutes ago
check.last_start = timezone.now() - timedelta(minutes=5)
check.last_start = timezone.now() - td(minutes=5)
self.assertEqual(check.get_status(with_started=True), "started")
# A started check still is considered "up":
@ -128,9 +128,9 @@ class CheckModelTestCase(BaseTestCase):
def test_get_status_handles_up_then_started_and_expired(self):
check = Check(status="up")
# Last ping was 3 hours ago, so is still up
check.last_ping = timezone.now() - timedelta(hours=3)
check.last_ping = timezone.now() - td(hours=3)
# Last start was 2 hours ago - the check is past its grace time
check.last_start = timezone.now() - timedelta(hours=2)
check.last_start = timezone.now() - td(hours=2)
self.assertEqual(check.get_status(with_started=True), "down")
self.assertEqual(check.get_status(), "down")
@ -138,14 +138,14 @@ class CheckModelTestCase(BaseTestCase):
def test_get_status_handles_paused_then_started_and_expired(self):
check = Check(status="paused")
# Last start was 2 hours ago - the check is past its grace time
check.last_start = timezone.now() - timedelta(hours=2)
check.last_start = timezone.now() - td(hours=2)
self.assertEqual(check.get_status(with_started=True), "down")
self.assertEqual(check.get_status(), "down")
def test_get_status_handles_started_and_mia(self):
check = Check()
check.last_start = timezone.now() - timedelta(hours=2)
check.last_start = timezone.now() - td(hours=2)
self.assertEqual(check.get_status(with_started=True), "down")
self.assertEqual(check.get_status(), "down")
@ -174,17 +174,17 @@ class CheckModelTestCase(BaseTestCase):
# Nov. 2019
self.assertEqual(nov[0].strftime("%m-%Y"), "11-2019")
self.assertEqual(nov[1], timedelta())
self.assertEqual(nov[1], td())
self.assertEqual(nov[2], 0)
# Dec. 2019
self.assertEqual(dec[0].strftime("%m-%Y"), "12-2019")
self.assertEqual(dec[1], timedelta())
self.assertEqual(dec[1], td())
self.assertEqual(dec[2], 0)
# Jan. 2020
self.assertEqual(jan[0].strftime("%m-%Y"), "01-2020")
self.assertEqual(jan[1], timedelta())
self.assertEqual(jan[1], td())
self.assertEqual(jan[2], 0)
@patch("hc.api.models.timezone.now", MOCK_NOW)
@ -259,5 +259,24 @@ class CheckModelTestCase(BaseTestCase):
self.assertIsNone(dec[2])
# Jan. 2020
self.assertEqual(jan[1], timedelta())
self.assertEqual(jan[1], td())
self.assertEqual(jan[2], 0)
def test_it_prunes(self):
    # One ping inside the retained window (n=101) and one outside it (n=1).
    # NOTE(review): relies on the profile's ping log limit being 100 --
    # confirm against the fixture set up in BaseTestCase.
    check = Check.objects.create(project=self.project, n_pings=101)
    for seq in (101, 1):
        Ping.objects.create(owner=check, n=seq)

    email_channel = Channel.objects.create(project=self.project, kind="email")
    stale_note = Notification(
        owner=check, channel=email_channel, check_status="down"
    )
    stale_note.save()
    # Backdate the notification in a second save so that it predates
    # the check itself.
    stale_note.created = check.created - td(minutes=10)
    stale_note.save()

    check.prune()

    newest_kept = Ping.objects.filter(n=101).exists()
    oldest_kept = Ping.objects.filter(n=1).exists()
    self.assertTrue(newest_kept)
    self.assertFalse(oldest_kept)
    self.assertEqual(Notification.objects.count(), 0)

+ 0
- 24
hc/api/tests/test_prunepings.py View File

@ -1,24 +0,0 @@
from datetime import timedelta
from django.utils import timezone
from hc.api.management.commands.prunepings import Command
from hc.api.models import Check, Ping
from hc.test import BaseTestCase
class PrunePingsTestCase(BaseTestCase):
    # Not referenced by the test below; kept as a class attribute.
    year_ago = timezone.now() - timedelta(days=365)

    def test_it_removes_old_pings(self):
        # With a log limit of 1, only one of the two pings may survive.
        profile = self.profile
        profile.ping_log_limit = 1
        profile.save()

        check = Check.objects.create(project=self.project, n_pings=2)
        for seq in (1, 2):
            Ping.objects.create(owner=check, n=seq)

        Command().handle()

        remaining = Ping.objects.count()
        self.assertEqual(remaining, 1)

+ 0
- 25
hc/api/tests/test_prunepingsslow.py View File

@ -1,25 +0,0 @@
from datetime import timedelta
from unittest.mock import Mock
from django.utils import timezone
from hc.api.management.commands.prunepingsslow import Command
from hc.api.models import Check, Ping
from hc.test import BaseTestCase
class PrunePingsSlowTestCase(BaseTestCase):
    # Not referenced by the test below; kept as a class attribute.
    year_ago = timezone.now() - timedelta(days=365)

    def test_it_removes_old_pings(self):
        # With a log limit of 1, only one of the two pings may survive.
        profile = self.profile
        profile.ping_log_limit = 1
        profile.save()

        check = Check.objects.create(project=self.project, n_pings=2)
        for seq in (1, 2):
            Ping.objects.create(owner=check, n=seq)

        # The command writes per-check progress to stdout; hand it a Mock
        # as the output stream.
        command = Command(stdout=Mock())
        command.handle()

        remaining = Ping.objects.count()
        self.assertEqual(remaining, 1)

Loading…
Cancel
Save