summaryrefslogtreecommitdiffstats
path: root/schema/gendummydata.py
diff options
context:
space:
mode:
Diffstat (limited to 'schema/gendummydata.py')
-rwxr-xr-xschema/gendummydata.py354
1 files changed, 180 insertions, 174 deletions
diff --git a/schema/gendummydata.py b/schema/gendummydata.py
index 1f3d0476..b3a73ef2 100755
--- a/schema/gendummydata.py
+++ b/schema/gendummydata.py
@@ -10,33 +10,32 @@ usage: gendummydata.py outputfilename.sql
# insert these users/packages into the AUR database.
#
import hashlib
-import random
-import time
+import logging
import os
+import random
import sys
-import io
-import logging
+import time
-LOG_LEVEL = logging.DEBUG # logging level. set to logging.INFO to reduce output
+LOG_LEVEL = logging.DEBUG # logging level. set to logging.INFO to reduce output
SEED_FILE = "/usr/share/dict/words"
-DB_HOST = os.getenv("DB_HOST", "localhost")
-DB_NAME = os.getenv("DB_NAME", "AUR")
-DB_USER = os.getenv("DB_USER", "aur")
-DB_PASS = os.getenv("DB_PASS", "aur")
-USER_ID = 5 # Users.ID of first bogus user
-PKG_ID = 1 # Packages.ID of first package
+DB_HOST = os.getenv("DB_HOST", "localhost")
+DB_NAME = os.getenv("DB_NAME", "AUR")
+DB_USER = os.getenv("DB_USER", "aur")
+DB_PASS = os.getenv("DB_PASS", "aur")
+USER_ID = 5 # Users.ID of first bogus user
+PKG_ID = 1 # Packages.ID of first package
MAX_USERS = 300 # how many users to 'register'
-MAX_DEVS = .1 # what percentage of MAX_USERS are Developers
-MAX_TUS = .2 # what percentage of MAX_USERS are Trusted Users
-MAX_PKGS = 900 # how many packages to load
-PKG_DEPS = (1, 15) # min/max depends a package has
-PKG_RELS = (1, 5) # min/max relations a package has
-PKG_SRC = (1, 3) # min/max sources a package has
+MAX_DEVS = .1 # what percentage of MAX_USERS are Developers
+MAX_TUS = .2 # what percentage of MAX_USERS are Trusted Users
+MAX_PKGS = 900 # how many packages to load
+PKG_DEPS = (1, 15) # min/max depends a package has
+PKG_RELS = (1, 5) # min/max relations a package has
+PKG_SRC = (1, 3) # min/max sources a package has
PKG_CMNTS = (1, 5) # min/max number of comments a package has
CATEGORIES_COUNT = 17 # the number of categories from aur-schema
-VOTING = (0, .30) # percentage range for package voting
-OPEN_PROPOSALS = 5 # number of open trusted user proposals
-CLOSE_PROPOSALS = 15 # number of closed trusted user proposals
+VOTING = (0, .30) # percentage range for package voting
+OPEN_PROPOSALS = 5 # number of open trusted user proposals
+CLOSE_PROPOSALS = 15 # number of closed trusted user proposals
RANDOM_TLDS = ("edu", "com", "org", "net", "tw", "ru", "pl", "de", "es")
RANDOM_URL = ("http://www.", "ftp://ftp.", "http://", "ftp://")
RANDOM_LOCS = ("pub", "release", "files", "downloads", "src")
@@ -48,20 +47,20 @@ logging.basicConfig(format=logformat, level=LOG_LEVEL)
log = logging.getLogger()
if len(sys.argv) != 2:
- log.error("Missing output filename argument")
- raise SystemExit
+ log.error("Missing output filename argument")
+ raise SystemExit
# make sure the seed file exists
#
if not os.path.exists(SEED_FILE):
- log.error("Please install the 'words' Arch package")
- raise SystemExit
+ log.error("Please install the 'words' Arch package")
+ raise SystemExit
# make sure comments can be created
#
if not os.path.exists(FORTUNE_FILE):
- log.error("Please install the 'fortune-mod' Arch package")
- raise SystemExit
+ log.error("Please install the 'fortune-mod' Arch package")
+ raise SystemExit
# track what users/package names have been used
#
@@ -69,21 +68,28 @@ seen_users = {}
seen_pkgs = {}
user_keys = []
+
# some functions to generate random data
#
def genVersion():
- ver = []
- ver.append("%d" % random.randrange(0,10))
- ver.append("%d" % random.randrange(0,20))
- if random.randrange(0,2) == 0:
- ver.append("%d" % random.randrange(0,100))
- return ".".join(ver) + "-%d" % random.randrange(1,11)
+ ver = []
+ ver.append("%d" % random.randrange(0, 10))
+ ver.append("%d" % random.randrange(0, 20))
+ if random.randrange(0, 2) == 0:
+ ver.append("%d" % random.randrange(0, 100))
+ return ".".join(ver) + "-%d" % random.randrange(1, 11)
+
+
def genCategory():
- return random.randrange(1,CATEGORIES_COUNT)
+ return random.randrange(1, CATEGORIES_COUNT)
+
+
def genUID():
- return seen_users[user_keys[random.randrange(0,len(user_keys))]]
+ return seen_users[user_keys[random.randrange(0, len(user_keys))]]
+
+
def genFortune():
- return fortunes[random.randrange(0,len(fortunes))].replace("'", "")
+ return fortunes[random.randrange(0, len(fortunes))].replace("'", "")
# load the words, and make sure there are enough words for users/pkgs
@@ -93,25 +99,25 @@ fp = open(SEED_FILE, "r", encoding="utf-8")
contents = fp.readlines()
fp.close()
if MAX_USERS > len(contents):
- MAX_USERS = len(contents)
+ MAX_USERS = len(contents)
if MAX_PKGS > len(contents):
- MAX_PKGS = len(contents)
+ MAX_PKGS = len(contents)
if len(contents) - MAX_USERS > MAX_PKGS:
- need_dupes = 0
+ need_dupes = 0
else:
- need_dupes = 1
+ need_dupes = 1
# select random usernames
#
log.debug("Generating random user names...")
user_id = USER_ID
while len(seen_users) < MAX_USERS:
- user = random.randrange(0, len(contents))
- word = contents[user].replace("'", "").replace(".","").replace(" ", "_")
- word = word.strip().lower()
- if word not in seen_users:
- seen_users[word] = user_id
- user_id += 1
+ user = random.randrange(0, len(contents))
+ word = contents[user].replace("'", "").replace(".", "").replace(" ", "_")
+ word = word.strip().lower()
+ if word not in seen_users:
+ seen_users[word] = user_id
+ user_id += 1
user_keys = list(seen_users.keys())
# select random package names
@@ -119,17 +125,17 @@ user_keys = list(seen_users.keys())
log.debug("Generating random package names...")
num_pkgs = PKG_ID
while len(seen_pkgs) < MAX_PKGS:
- pkg = random.randrange(0, len(contents))
- word = contents[pkg].replace("'", "").replace(".","").replace(" ", "_")
- word = word.strip().lower()
- if not need_dupes:
- if word not in seen_pkgs and word not in seen_users:
- seen_pkgs[word] = num_pkgs
- num_pkgs += 1
- else:
- if word not in seen_pkgs:
- seen_pkgs[word] = num_pkgs
- num_pkgs += 1
+ pkg = random.randrange(0, len(contents))
+ word = contents[pkg].replace("'", "").replace(".", "").replace(" ", "_")
+ word = word.strip().lower()
+ if not need_dupes:
+ if word not in seen_pkgs and word not in seen_users:
+ seen_pkgs[word] = num_pkgs
+ num_pkgs += 1
+ else:
+ if word not in seen_pkgs:
+ seen_pkgs[word] = num_pkgs
+ num_pkgs += 1
# free up contents memory
#
@@ -151,32 +157,32 @@ out.write("BEGIN;\n")
#
log.debug("Creating SQL statements for users.")
for u in user_keys:
- account_type = 1 # default to normal user
- if not has_devs or not has_tus:
- account_type = random.randrange(1, 4)
- if account_type == 3 and not has_devs:
- # this will be a dev account
- #
- developers.append(seen_users[u])
- if len(developers) >= MAX_DEVS * MAX_USERS:
- has_devs = 1
- elif account_type == 2 and not has_tus:
- # this will be a trusted user account
- #
- trustedusers.append(seen_users[u])
- if len(trustedusers) >= MAX_TUS * MAX_USERS:
- has_tus = 1
- else:
- # a normal user account
- #
- pass
-
- h = hashlib.new('md5')
- h.update(u.encode());
- s = ("INSERT INTO Users (ID, AccountTypeID, Username, Email, Passwd)"
- " VALUES (%d, %d, '%s', '%s@example.com', '%s');\n")
- s = s % (seen_users[u], account_type, u, u, h.hexdigest())
- out.write(s)
+ account_type = 1 # default to normal user
+ if not has_devs or not has_tus:
+ account_type = random.randrange(1, 4)
+ if account_type == 3 and not has_devs:
+ # this will be a dev account
+ #
+ developers.append(seen_users[u])
+ if len(developers) >= MAX_DEVS * MAX_USERS:
+ has_devs = 1
+ elif account_type == 2 and not has_tus:
+ # this will be a trusted user account
+ #
+ trustedusers.append(seen_users[u])
+ if len(trustedusers) >= MAX_TUS * MAX_USERS:
+ has_tus = 1
+ else:
+ # a normal user account
+ #
+ pass
+
+ h = hashlib.new('md5')
+ h.update(u.encode())
+ s = ("INSERT INTO Users (ID, AccountTypeID, Username, Email, Passwd)"
+ " VALUES (%d, %d, '%s', '%s@example.com', '%s');\n")
+ s = s % (seen_users[u], account_type, u, u, h.hexdigest())
+ out.write(s)
log.debug("Number of developers: %d" % len(developers))
log.debug("Number of trusted users: %d" % len(trustedusers))
@@ -193,123 +199,123 @@ fp.close()
log.debug("Creating SQL statements for packages.")
count = 0
for p in list(seen_pkgs.keys()):
- NOW = int(time.time())
- if count % 2 == 0:
- muid = developers[random.randrange(0,len(developers))]
- puid = developers[random.randrange(0,len(developers))]
- else:
- muid = trustedusers[random.randrange(0,len(trustedusers))]
- puid = trustedusers[random.randrange(0,len(trustedusers))]
- if count % 20 == 0: # every so often, there are orphans...
- muid = "NULL"
-
- uuid = genUID() # the submitter/user
-
- s = ("INSERT INTO PackageBases (ID, Name, FlaggerComment, SubmittedTS, ModifiedTS, "
+ NOW = int(time.time())
+ if count % 2 == 0:
+ muid = developers[random.randrange(0, len(developers))]
+ puid = developers[random.randrange(0, len(developers))]
+ else:
+ muid = trustedusers[random.randrange(0, len(trustedusers))]
+ puid = trustedusers[random.randrange(0, len(trustedusers))]
+ if count % 20 == 0: # every so often, there are orphans...
+ muid = "NULL"
+
+ uuid = genUID() # the submitter/user
+
+ s = ("INSERT INTO PackageBases (ID, Name, FlaggerComment, SubmittedTS, ModifiedTS, "
"SubmitterUID, MaintainerUID, PackagerUID) VALUES (%d, '%s', '', %d, %d, %d, %s, %s);\n")
- s = s % (seen_pkgs[p], p, NOW, NOW, uuid, muid, puid)
- out.write(s)
+ s = s % (seen_pkgs[p], p, NOW, NOW, uuid, muid, puid)
+ out.write(s)
- s = ("INSERT INTO Packages (ID, PackageBaseID, Name, Version) VALUES "
+ s = ("INSERT INTO Packages (ID, PackageBaseID, Name, Version) VALUES "
"(%d, %d, '%s', '%s');\n")
- s = s % (seen_pkgs[p], seen_pkgs[p], p, genVersion())
- out.write(s)
+ s = s % (seen_pkgs[p], seen_pkgs[p], p, genVersion())
+ out.write(s)
- count += 1
+ count += 1
- # create random comments for this package
- #
- num_comments = random.randrange(PKG_CMNTS[0], PKG_CMNTS[1])
- for i in range(0, num_comments):
- now = NOW + random.randrange(400, 86400*3)
- s = ("INSERT INTO PackageComments (PackageBaseID, UsersID,"
- " Comments, RenderedComment, CommentTS) VALUES (%d, %d, '%s', '', %d);\n")
- s = s % (seen_pkgs[p], genUID(), genFortune(), now)
- out.write(s)
+ # create random comments for this package
+ #
+ num_comments = random.randrange(PKG_CMNTS[0], PKG_CMNTS[1])
+ for i in range(0, num_comments):
+ now = NOW + random.randrange(400, 86400*3)
+ s = ("INSERT INTO PackageComments (PackageBaseID, UsersID,"
+ " Comments, RenderedComment, CommentTS) VALUES (%d, %d, '%s', '', %d);\n")
+ s = s % (seen_pkgs[p], genUID(), genFortune(), now)
+ out.write(s)
# Cast votes
#
track_votes = {}
log.debug("Casting votes for packages.")
for u in user_keys:
- num_votes = random.randrange(int(len(seen_pkgs)*VOTING[0]),
- int(len(seen_pkgs)*VOTING[1]))
- pkgvote = {}
- for v in range(num_votes):
- pkg = random.randrange(1, len(seen_pkgs) + 1)
- if pkg not in pkgvote:
- s = ("INSERT INTO PackageVotes (UsersID, PackageBaseID)"
- " VALUES (%d, %d);\n")
- s = s % (seen_users[u], pkg)
- pkgvote[pkg] = 1
- if pkg not in track_votes:
- track_votes[pkg] = 0
- track_votes[pkg] += 1
- out.write(s)
+ num_votes = random.randrange(int(len(seen_pkgs)*VOTING[0]),
+ int(len(seen_pkgs)*VOTING[1]))
+ pkgvote = {}
+ for v in range(num_votes):
+ pkg = random.randrange(1, len(seen_pkgs) + 1)
+ if pkg not in pkgvote:
+ s = ("INSERT INTO PackageVotes (UsersID, PackageBaseID)"
+ " VALUES (%d, %d);\n")
+ s = s % (seen_users[u], pkg)
+ pkgvote[pkg] = 1
+ if pkg not in track_votes:
+ track_votes[pkg] = 0
+ track_votes[pkg] += 1
+ out.write(s)
# Update statements for package votes
#
for p in list(track_votes.keys()):
- s = "UPDATE PackageBases SET NumVotes = %d WHERE ID = %d;\n"
- s = s % (track_votes[p], p)
- out.write(s)
+ s = "UPDATE PackageBases SET NumVotes = %d WHERE ID = %d;\n"
+ s = s % (track_votes[p], p)
+ out.write(s)
# Create package dependencies and sources
#
log.debug("Creating statements for package depends/sources.")
for p in list(seen_pkgs.keys()):
- num_deps = random.randrange(PKG_DEPS[0], PKG_DEPS[1])
- for i in range(0, num_deps):
- dep = random.choice([k for k in seen_pkgs])
- deptype = random.randrange(1, 5)
- if deptype == 4:
- dep += ": for " + random.choice([k for k in seen_pkgs])
- s = "INSERT INTO PackageDepends(PackageID, DepTypeID, DepName) VALUES (%d, %d, '%s');\n"
- s = s % (seen_pkgs[p], deptype, dep)
- out.write(s)
-
- num_rels = random.randrange(PKG_RELS[0], PKG_RELS[1])
- for i in range(0, num_deps):
- rel = random.choice([k for k in seen_pkgs])
- reltype = random.randrange(1, 4)
- s = "INSERT INTO PackageRelations(PackageID, RelTypeID, RelName) VALUES (%d, %d, '%s');\n"
- s = s % (seen_pkgs[p], reltype, rel)
- out.write(s)
-
- num_sources = random.randrange(PKG_SRC[0], PKG_SRC[1])
- for i in range(num_sources):
- src_file = user_keys[random.randrange(0, len(user_keys))]
- src = "%s%s.%s/%s/%s-%s.tar.gz" % (
- RANDOM_URL[random.randrange(0,len(RANDOM_URL))],
- p, RANDOM_TLDS[random.randrange(0,len(RANDOM_TLDS))],
- RANDOM_LOCS[random.randrange(0,len(RANDOM_LOCS))],
- src_file, genVersion())
- s = "INSERT INTO PackageSources(PackageID, Source) VALUES (%d, '%s');\n"
- s = s % (seen_pkgs[p], src)
- out.write(s)
+ num_deps = random.randrange(PKG_DEPS[0], PKG_DEPS[1])
+ for i in range(0, num_deps):
+ dep = random.choice([k for k in seen_pkgs])
+ deptype = random.randrange(1, 5)
+ if deptype == 4:
+ dep += ": for " + random.choice([k for k in seen_pkgs])
+ s = "INSERT INTO PackageDepends(PackageID, DepTypeID, DepName) VALUES (%d, %d, '%s');\n"
+ s = s % (seen_pkgs[p], deptype, dep)
+ out.write(s)
+
+ num_rels = random.randrange(PKG_RELS[0], PKG_RELS[1])
+ for i in range(0, num_deps):
+ rel = random.choice([k for k in seen_pkgs])
+ reltype = random.randrange(1, 4)
+ s = "INSERT INTO PackageRelations(PackageID, RelTypeID, RelName) VALUES (%d, %d, '%s');\n"
+ s = s % (seen_pkgs[p], reltype, rel)
+ out.write(s)
+
+ num_sources = random.randrange(PKG_SRC[0], PKG_SRC[1])
+ for i in range(num_sources):
+ src_file = user_keys[random.randrange(0, len(user_keys))]
+ src = "%s%s.%s/%s/%s-%s.tar.gz" % (
+ RANDOM_URL[random.randrange(0, len(RANDOM_URL))],
+ p, RANDOM_TLDS[random.randrange(0, len(RANDOM_TLDS))],
+ RANDOM_LOCS[random.randrange(0, len(RANDOM_LOCS))],
+ src_file, genVersion())
+ s = "INSERT INTO PackageSources(PackageID, Source) VALUES (%d, '%s');\n"
+ s = s % (seen_pkgs[p], src)
+ out.write(s)
# Create trusted user proposals
#
log.debug("Creating SQL statements for trusted user proposals.")
-count=0
+count = 0
for t in range(0, OPEN_PROPOSALS+CLOSE_PROPOSALS):
- now = int(time.time())
- if count < CLOSE_PROPOSALS:
- start = now - random.randrange(3600*24*7, 3600*24*21)
- end = now - random.randrange(0, 3600*24*7)
- else:
- start = now
- end = now + random.randrange(3600*24, 3600*24*7)
- if count % 5 == 0: # Don't make the vote about anyone once in a while
- user = ""
- else:
- user = user_keys[random.randrange(0,len(user_keys))]
- suid = trustedusers[random.randrange(0,len(trustedusers))]
- s = ("INSERT INTO TU_VoteInfo (Agenda, User, Submitted, End,"
- " Quorum, SubmitterID) VALUES ('%s', '%s', %d, %d, 0.0, %d);\n")
- s = s % (genFortune(), user, start, end, suid)
- out.write(s)
- count += 1
+ now = int(time.time())
+ if count < CLOSE_PROPOSALS:
+ start = now - random.randrange(3600*24*7, 3600*24*21)
+ end = now - random.randrange(0, 3600*24*7)
+ else:
+ start = now
+ end = now + random.randrange(3600*24, 3600*24*7)
+ if count % 5 == 0: # Don't make the vote about anyone once in a while
+ user = ""
+ else:
+ user = user_keys[random.randrange(0, len(user_keys))]
+ suid = trustedusers[random.randrange(0, len(trustedusers))]
+ s = ("INSERT INTO TU_VoteInfo (Agenda, User, Submitted, End,"
+ " Quorum, SubmitterID) VALUES ('%s', '%s', %d, %d, 0.0, %d);\n")
+ s = s % (genFortune(), user, start, end, suid)
+ out.write(s)
+ count += 1
# close output file
#