summary | refs | log | tree | commit | diff | stats
path: root/devel
diff options
context:
space:
mode:
author: Dan McGee <dan@archlinux.org> 2013-12-17 15:26:47 +0100
committer: Dan McGee <dan@archlinux.org> 2013-12-18 07:01:59 +0100
commit: ac157895f10abf96ad7d1d8170e57a3cb4ee3df8 (patch)
tree: f55dce4644b807df1e5ccb9ef4597f4c0a2d81ce /devel
parent: 8e8e3d7aa7ac843a5523f971cb2c5426e9f05379 (diff)
download: archweb-ac157895f10abf96ad7d1d8170e57a3cb4ee3df8.tar.gz
download: archweb-ac157895f10abf96ad7d1d8170e57a3cb4ee3df8.tar.xz
reporead: implement delayed parsing of files data
This gives us some large memory savings in python due to the internal storage of Unicode strings vs. byte strings, as well as saving us processing time up front for filelist data we are never going to have to actually use.

Signed-off-by: Dan McGee <dan@archlinux.org>
Diffstat (limited to 'devel')
-rw-r--r-- devel/management/commands/reporead.py 58
1 files changed, 35 insertions, 23 deletions
diff --git a/devel/management/commands/reporead.py b/devel/management/commands/reporead.py
index 248eb62..3244d59 100644
--- a/devel/management/commands/reporead.py
+++ b/devel/management/commands/reporead.py
@@ -82,8 +82,7 @@ class RepoPackage(object):
'md5sum', 'sha256sum', 'url', 'packager' )
number = ( 'csize', 'isize' )
collections = ( 'depends', 'optdepends', 'makedepends', 'checkdepends',
- 'conflicts', 'provides', 'replaces', 'groups', 'license',
- 'files' )
+ 'conflicts', 'provides', 'replaces', 'groups', 'license')
def __init__(self, repo):
self.repo = repo
@@ -98,7 +97,6 @@ class RepoPackage(object):
setattr(self, k, ())
self.builddate = None
self.files = None
- self.has_files = False
def populate(self, values):
for k, v in values.iteritems():
@@ -120,14 +118,22 @@ class RepoPackage(object):
logger.warning(
'Package %s had unparsable build date %s',
self.name, v[0])
- elif k == 'files':
- self.files = tuple(v)
- self.has_files = True
else:
# anything left in collections
setattr(self, k, tuple(v))
@property
+ def files_list(self):
+ data_file = io.TextIOWrapper(io.BytesIO(self.files), encoding='UTF-8')
+ try:
+ info = parse_info(data_file)
+ except UnicodeDecodeError:
+ logger.warn("Could not correctly decode files list for %s",
+ self.name)
+ return None
+ return info['files']
+
+ @property
def full_version(self):
'''Very similar to the main.models.Package method.'''
if self.epoch > 0:
@@ -291,15 +297,18 @@ def populate_files(dbpkg, repopkg, force=False):
return
# only delete files if we are reading a DB that contains them
- if repopkg.has_files:
+ if repopkg.files:
+ files = repopkg.files_list
+ # we had files data, but it couldn't be parsed, so skip
+ if not files:
+ return
delete_pkg_files(dbpkg)
logger.info("adding %d files for package %s",
- len(repopkg.files), dbpkg.pkgname)
+ len(files), dbpkg.pkgname)
pkg_files = []
# sort in normal alpha-order that pacman uses, rather than makepkg's
# default breadth-first, directory-first ordering
- files = sorted(repopkg.files)
- for f in files:
+ for f in sorted(files):
if '/' in f:
dirname, filename = f.rsplit('/', 1)
dirname += '/'
@@ -507,24 +516,27 @@ def parse_repo(repopath):
repodb = tarfile.open(repopath, "r")
logger.debug("Starting package parsing")
- dbfiles = ('desc', 'depends', 'files')
newpkg = lambda: RepoPackage(reponame)
pkgs = defaultdict(newpkg)
for tarinfo in repodb.getmembers():
if tarinfo.isreg():
pkgid, fname = os.path.split(tarinfo.name)
- if fname not in dbfiles:
- continue
- data_file = repodb.extractfile(tarinfo)
- data_file = io.TextIOWrapper(io.BytesIO(data_file.read()),
- encoding='UTF-8')
- try:
- pkgs[pkgid].populate(parse_info(data_file))
- except UnicodeDecodeError:
- logger.warn("Could not correctly decode %s, skipping file",
- tarinfo.name)
- data_file.close()
- del data_file
+ if fname == 'files':
+ # don't parse yet for speed and memory consumption reasons
+ files_data = repodb.extractfile(tarinfo)
+ pkgs[pkgid].files = files_data.read()
+ del files_data
+ elif fname in ('desc', 'depends'):
+ data_file = repodb.extractfile(tarinfo)
+ data_file = io.TextIOWrapper(io.BytesIO(data_file.read()),
+ encoding='UTF-8')
+ try:
+ pkgs[pkgid].populate(parse_info(data_file))
+ except UnicodeDecodeError:
+ logger.warn("Could not correctly decode %s, skipping file",
+ tarinfo.name)
+ data_file.close()
+ del data_file
logger.debug("Done parsing file %s/%s", pkgid, fname)