author     Dan McGee <dan@archlinux.org>    2013-12-17 15:24:41 +0100
committer  Dan McGee <dan@archlinux.org>    2013-12-18 07:01:50 +0100
commit     8e8e3d7aa7ac843a5523f971cb2c5426e9f05379 (patch)
tree       a14e59083e8d1e02fbade04dc50e3511c3943daf
parent     77a45dc7bc6f0badb45ec043e85f1b542c52792e (diff)
reporead: bring back batched_bulk_create()
For packages with filelists of more than 80,000 items, we were starting to see
some serious memory issues in reporead, both on the statement-generation side
in Python and on the database side. Break the updates into chunks of 10,000
when we encounter packages with tons of files to keep things a bit more under
control.

Signed-off-by: Dan McGee <dan@archlinux.org>
-rw-r--r--  devel/management/commands/reporead.py | 20 +++++++++++++++++++-
1 file changed, 19 insertions(+), 1 deletion(-)
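As a rough illustration of the chunking the commit message describes, here is a minimal, standalone sketch; the 10,000 cutoff mirrors the patch below, while the chunks() helper, the pkg_files list and the print() stand-in for Django's bulk_create() are illustrative assumptions, not archweb code.

    # Sketch (not archweb code) of the batching idea: slice a large list into
    # fixed-size chunks so each bulk INSERT stays small on both the Python and
    # database side. print() stands in for model.objects.bulk_create(batch),
    # which the real patch calls once per chunk.
    def chunks(all_objects, cutoff=10000):
        for offset in range(0, len(all_objects), cutoff):
            yield all_objects[offset:offset + cutoff]

    if __name__ == '__main__':
        pkg_files = ['file-%d' % i for i in range(85000)]  # stand-in for PackageFile rows
        for batch in chunks(pkg_files):
            print('would bulk_create %d rows' % len(batch))  # 9 batches: 8 x 10000, 1 x 5000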
diff --git a/devel/management/commands/reporead.py b/devel/management/commands/reporead.py
index 2b565cf..248eb62 100644
--- a/devel/management/commands/reporead.py
+++ b/devel/management/commands/reporead.py
@@ -260,6 +260,24 @@ def delete_pkg_files(dbpkg):
     cursor.execute('DELETE FROM package_files WHERE pkg_id = %s', [dbpkg.id])
 
 
+def batched_bulk_create(model, all_objects):
+    cutoff = 10000
+    length = len(all_objects)
+    if length < cutoff:
+        return model.objects.bulk_create(all_objects)
+
+    def chunks():
+        offset = 0
+        while offset < length:
+            yield all_objects[offset:offset + cutoff]
+            offset += cutoff
+
+    for items in chunks():
+        ret = model.objects.bulk_create(items)
+
+    return ret
+
+
 def populate_files(dbpkg, repopkg, force=False):
     if not force:
         if not pkg_same_version(repopkg, dbpkg):
@@ -294,7 +312,7 @@ def populate_files(dbpkg, repopkg, force=False):
                     directory=dirname,
                     filename=filename)
             pkg_files.append(pkgfile)
-        PackageFile.objects.bulk_create(pkg_files)
+        batched_bulk_create(PackageFile, pkg_files)
         dbpkg.files_last_update = now()
         dbpkg.save()