diff options
author | Dan McGee <dan@archlinux.org> | 2010-09-16 23:45:30 +0200 |
---|---|---|
committer | Dan McGee <dan@archlinux.org> | 2010-09-21 16:10:33 +0200 |
commit | 3d8bc07622561028dbca9c709470accf79c95bd6 (patch) | |
tree | b7af51ca4dedd5b7f7f8108d80b87d6420b7c131 /mirrors/management/commands | |
parent | 7c88e3e1a2b5d36281b3981e2144c2dd16c7596c (diff) | |
download | archweb-3d8bc07622561028dbca9c709470accf79c95bd6.tar.gz archweb-3d8bc07622561028dbca9c709470accf79c95bd6.tar.xz |
Add 'mirrorcheck' command
This does the actual work of going out and checking the mirror status. In
short, it polls every active mirror URL for the 'lastsync' file and then
records the appropriate details. These include the contents of that file,
how long the retrieval took in total, and any errors encountered.
In order to finish up a bit faster, we spawn several threads to do the
actual work. This parallelization allows the whole check process to take
around 30 seconds rather than several minutes.
Signed-off-by: Dan McGee <dan@archlinux.org>
Diffstat (limited to 'mirrors/management/commands')
-rw-r--r-- | mirrors/management/commands/__init__.py | 0 | ||||
-rw-r--r-- | mirrors/management/commands/mirrorcheck.py | 149 |
2 files changed, 149 insertions, 0 deletions
# -*- coding: utf-8 -*-
# File: mirrors/management/commands/mirrorcheck.py
# (mirrors/management/commands/__init__.py is added alongside it as an empty file.)
"""
mirrorcheck command

Poll every active mirror URL we have in the database, grab the 'lastsync' file,
and record details about how long it took and how up to date the mirror is. If
we encounter errors, record those as well.

Usage: ./manage.py mirrorcheck
"""

from django.core.management.base import NoArgsCommand
from django.db.models import Q

from datetime import datetime, timedelta
import logging
import re
import socket
import sys
import time
import thread
from threading import Thread
from Queue import Queue, Empty
import urllib2

from logging import ERROR, WARNING, INFO, DEBUG

from mirrors.models import Mirror, MirrorUrl, MirrorLog

logging.basicConfig(
    level=WARNING,
    format='%(asctime)s -> %(levelname)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    stream=sys.stderr)
logger = logging.getLogger()


class Command(NoArgsCommand):
    help = "Runs a check on all known mirror URLs to determine their up-to-date status."

    def handle_noargs(self, **options):
        """Set the log level from the 'verbosity' option and run the check.

        Verbosity 0, 1 and 2 map to ERROR, INFO and DEBUG; any other value
        leaves the level configured at import time untouched.

        Returns whatever check_current_mirrors() returns.
        """
        levels = {0: ERROR, 1: INFO, 2: DEBUG}
        level = levels.get(int(options.get('verbosity', 0)))
        if level is not None:
            # setLevel() is the documented API; assigning .level directly
            # bypasses any bookkeeping the logging module does.
            logger.setLevel(level)

        # On SIGQUIT or SIGUSR1, print the stack of the interrupted frame so
        # a run that appears stuck can be inspected without killing it.
        import signal, traceback
        handler = lambda sig, stack: traceback.print_stack(stack)
        signal.signal(signal.SIGQUIT, handler)
        signal.signal(signal.SIGUSR1, handler)

        return check_current_mirrors()


def parse_rfc3339_datetime(time):
    """Parse a 'YYYY-MM-DD HH:MM:SS+HH:MM' string into a naive UTC datetime.

    The zone offset is subtracted from the parsed wall-clock time, so
    '2010-09-02 11:05:06+02:00' becomes datetime(2010, 9, 2, 9, 5, 6).
    Anything after the offset (e.g. a trailing newline) is ignored.

    Returns None when the string does not start with that format.

    Note: the parameter name shadows the 'time' module inside this function.
    """
    # '2010-09-02 11:05:06+02:00'
    m = re.match(
        r'^(\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2})([-+])(\d{2}):(\d{2})',
        time)
    if m is None:
        return None

    vals = m.groups()
    parsed = datetime(*[int(v) for v in vals[:6]])
    # now account for time zone offset; the sign applies to both the hour
    # and the minute component
    sign = vals[6]
    offset = timedelta(hours=int(sign + vals[7]),
                       minutes=int(sign + vals[8]))
    # subtract the offset, e.g. '-04:00' should be moved up 4 hours
    return parsed - offset


def _socket_error_message(err):
    """Return a short human-readable message for a socket.error instance."""
    if isinstance(err, socket.timeout):
        return "Connection timed out."
    # args is normally (errno, message) but can also be just (message,);
    # the message is the last element in both cases.
    if err.args:
        return str(err.args[-1])
    return str(err)


def _record_failure(log, url, message):
    """Mark the log entry as failed with the given error message."""
    log.is_success = False
    log.error = message
    logger.debug("failed: %s, %s" % (url, log.error))


def check_mirror_url(mirror_url):
    """Fetch '<mirror url>lastsync' and save a MirrorLog row with the outcome.

    On success the log gets the parsed last-sync time and the duration of the
    request in seconds. On an HTTP, URL or socket error the log is marked
    unsuccessful and the error text is stored instead.

    Returns the saved MirrorLog.
    """
    url = mirror_url.url + 'lastsync'
    logger.info("checking URL %s" % url)
    log = MirrorLog(url=mirror_url, check_time=datetime.utcnow())
    try:
        start = time.time()
        result = urllib2.urlopen(url, timeout=10)
        try:
            data = result.read()
        finally:
            # always release the connection, even if read() fails
            result.close()
        end = time.time()
        # lastsync should be an epoch value, but some mirrors
        # are creating their own in RFC-3339 format:
        # '2010-09-02 11:05:06+02:00'
        try:
            parsed_time = datetime.utcfromtimestamp(int(data))
        except ValueError:
            parsed_time = parse_rfc3339_datetime(data)

        # NOTE(review): parsed_time is None when the body matches neither
        # format; the check is still recorded as a success -- confirm this
        # is intended.
        log.last_sync = parsed_time
        log.duration = end - start
        logger.debug("success: %s, %.2f" % (url, log.duration))
    except urllib2.HTTPError as e:
        _record_failure(log, url, str(e))
    except urllib2.URLError as e:
        reason = e.reason
        if isinstance(reason, socket.error):
            reason = _socket_error_message(reason)
        _record_failure(log, url, reason)
    except socket.timeout as e:
        # raised directly (not wrapped in URLError) when read() stalls
        _record_failure(log, url, _socket_error_message(e))
    except socket.error as e:
        _record_failure(log, url, _socket_error_message(e))

    log.save()
    return log


def mirror_url_worker(queue):
    """Thread body: check queued mirror URLs until the queue is empty.

    Every item taken from the queue is acknowledged with task_done(), even
    when the check blows up; otherwise Queue.join() would wait forever.

    Returns 0 once the queue has been drained.
    """
    while True:
        try:
            item = queue.get(block=False)
        except Empty:
            return 0
        try:
            check_mirror_url(item)
        except Exception:
            # Thread boundary: log and carry on with the next URL rather
            # than letting one bad mirror kill the worker.
            logger.exception("unexpected error checking %s",
                             getattr(item, 'url', item))
        finally:
            queue.task_done()


class MirrorCheckPool(object):
    """A fixed set of daemon threads draining a queue of mirror URLs."""

    def __init__(self, work, num_threads=10):
        """Queue every item of 'work' and create (but do not start) workers."""
        self.tasks = Queue()
        for item in work:
            self.tasks.put(item)
        self.threads = []
        for _ in range(num_threads):
            worker = Thread(target=mirror_url_worker, args=(self.tasks,))
            # daemon threads do not keep the process alive on exit
            worker.daemon = True
            self.threads.append(worker)

    def run_and_join(self):
        """Start all workers and block until every queued item is done."""
        logger.debug("starting threads")
        for t in self.threads:
            t.start()
        logger.debug("joining on all threads")
        self.tasks.join()


def check_current_mirrors():
    """Check every HTTP/FTP URL of active, public mirrors. Returns 0."""
    urls = MirrorUrl.objects.filter(
        Q(protocol__protocol__iexact='HTTP') |
        Q(protocol__protocol__iexact='FTP'),
        mirror__active=True, mirror__public=True)

    pool = MirrorCheckPool(urls)
    pool.run_and_join()
    return 0

# For lack of a better place to put it, here is a query to get latest check
# result joined with mirror details:
# SELECT mu.*, m.*, ml.* FROM mirrors_mirrorurl mu JOIN mirrors_mirror m ON mu.mirror_id = m.id JOIN mirrors_mirrorlog ml ON mu.id = ml.url_id LEFT JOIN mirrors_mirrorlog ml2 ON ml.url_id = ml2.url_id AND ml.id < ml2.id WHERE ml2.id IS NULL AND m.active = 1 AND m.public = 1;

# vim: set ts=4 sw=4 et: