diff options
author | Dan McGee <dan@archlinux.org> | 2011-02-23 16:46:54 +0100 |
---|---|---|
committer | Dan McGee <dan@archlinux.org> | 2011-02-23 16:46:54 +0100 |
commit | f6c41b273c8962718b303c6050c2fd8bcea533a8 (patch) | |
tree | a6ec976a0f225852faa74bbb0955035f228ba78e | |
parent | dfc4d919f1b0349d5143764c3f8f62c240e50623 (diff) | |
download | archweb-f6c41b273c8962718b303c6050c2fd8bcea533a8.tar.gz archweb-f6c41b273c8962718b303c6050c2fd8bcea533a8.tar.xz |
reporead performance improvements
When importing over a million files, it makes sense to take the slightly
faster route and call the PackageFile() constructor directly rather than
going through the related manager's create method.
We can also get huge performance improvements, especially with files
databases, by using the 'io' module rather than 'codecs'. The former is
implemented in C as of Python 2.7, and it cuts a no-work import of
extra.files.tar.gz (i.e. one measuring only the DB read speed) from
~30 seconds to ~5 seconds.
Signed-off-by: Dan McGee <dan@archlinux.org>
-rw-r--r-- | devel/management/commands/reporead.py | 21 |
1 files changed, 18 insertions, 3 deletions
diff --git a/devel/management/commands/reporead.py b/devel/management/commands/reporead.py index 72595c6..bda3bd6 100644 --- a/devel/management/commands/reporead.py +++ b/devel/management/commands/reporead.py @@ -27,9 +27,17 @@ import logging from datetime import datetime from optparse import make_option +# New in 2.6, but fast (C implementation) in 2.7. We will use it over codecs if +# available. Eventually remove the codecs import completely. +io = None +try: + import io +except ImportError: + pass + from logging import ERROR, WARNING, INFO, DEBUG -from main.models import Arch, Package, PackageDepend, Repo +from main.models import Arch, Package, PackageDepend, PackageFile, Repo logging.basicConfig( level=WARNING, @@ -241,10 +249,13 @@ def populate_files(dbpkg, repopkg, force=False): dirname, filename = f.rsplit('/', 1) if filename == '': filename = None - dbpkg.packagefile_set.create( + # this is basically like calling dbpkg.packagefile_set.create(), + # but much faster as we can skip a lot of the repeated code paths + pkgfile = PackageFile(pkg=dbpkg, is_directory=(filename is None), directory=dirname + '/', filename=filename) + pkgfile.save() dbpkg.files_last_update = datetime.now() dbpkg.save() @@ -394,7 +405,11 @@ def parse_repo(repopath): if fname not in dbfiles: continue data_file = repodb.extractfile(tarinfo) - data_file = codecs.EncodedFile(data_file, 'utf-8') + if io is None: + data_file = codecs.EncodedFile(data_file, 'utf-8') + else: + data_file = io.TextIOWrapper(io.BytesIO(data_file.read()), + encoding='utf=8') try: data = parse_info(data_file) p = pkgs.setdefault(pkgid, Pkg(reponame)) |