#!/usr/bin/python
"""
usage: gendummydata.py outputfilename.sql
"""
#
# This script seeds the AUR database with dummy data for
# use during development/testing.  It uses random entries
# from /usr/share/dict/words to create user accounts and
# package names.  It generates the SQL statements to
# insert these users/packages into the AUR database.
#

DBUG      = 1
SEED_FILE = "/usr/share/dict/words"
DB_HOST   = "localhost"
DB_NAME   = "AUR"
DB_USER   = "aur"
DB_PASS   = "aur"
USER_ID   = 5          # Users.ID of first bogus user
PKG_ID    = 1          # Packages.ID of first package
MAX_USERS = 500        # how many users to 'register'
MAX_DEVS  = .1         # what percentage of MAX_USERS are Developers
MAX_TUS   = .2         # what percentage of MAX_USERS are Trusted Users
MAX_PKGS  = 2500       # how many packages to load
PKG_FILES = (8, 30)    # min/max number of files in a package
PKG_DEPS  = (1, 5)     # min/max depends a package has
PKG_SRC   = (1, 3)     # min/max sources a package has
PKG_CMNTS = (1, 5)     # min/max number of comments a package has
VOTING    = (0, .30)   # percentage range for package voting
RANDOM_PATHS = (       # random path locations for package files
	"/usr/bin", "/usr/lib", "/etc", "/etc/rc.d", "/usr/share", "/lib",
	"/var/spool", "/var/log", "/usr/sbin", "/opt", "/usr/X11R6/bin",
	"/usr/X11R6/lib", "/usr/libexec", "/usr/man/man1", "/usr/man/man3",
	"/usr/man/man5", "/usr/X11R6/man/man1", "/etc/profile.d"
)
RANDOM_TLDS = ("edu", "com", "org", "net", "tw", "ru", "pl", "de", "es")
RANDOM_URL = ("http://www.", "ftp://ftp.", "http://", "ftp://")
RANDOM_LOCS = ("pub", "release", "files", "downloads", "src")
FORTUNE_FILES = ("computers", "fortunes", "literature", "science",
  "songs-poems", "startrek", "wisdom", "work", "buffy", "calvin",
  "futurama", "jargon", "matrix", "starwars", "tao", "chalkboard",
  "dune", "dune-messiah", "children-of-dune", "god-emperor", "definitions",
  "drugs", "education", "food", "humorists", "kids", "law", "news",
  "people", "pets", "politics", "platitudes", "zippy", "riddles"
)
FORTUNE_CMD = "/usr/bin/fortune -l " + " ".join(FORTUNE_FILES)


import random
import time
import os
import sys
import cStringIO
import commands


if len(sys.argv) != 2:
	sys.stderr.write("Missing output filename argument");
	raise SystemExit

# Just let python throw the errors if any happen
#
out = open(sys.argv[1], "w")

# make sure the seed file exists
#
if not os.path.exists(SEED_FILE):
	sys.stderr.write("Please install the 'words' Arch package\n");

# Make sure database access will be available
#
try:
	import MySQLdb
except:
	sys.stderr.write("Please install the 'mysql-python' Arch package\n");
	raise SystemExit

# try to connect to database
#
try:
	db = MySQLdb.connect(host = DB_HOST, user = DB_USER,
			db = DB_NAME, passwd = DB_PASS)
	dbc = db.cursor()
except:
	sys.stderr.write("Could not connect to database\n");
	raise SystemExit

esc = db.escape_string


# track what users/package names have been used
#
seen_users = {}
seen_pkgs = {}
locations = {}
categories = {}
location_keys = []
category_keys = []
user_keys = []

# some functions to generate random data
#
def genVersion(location_id=0):
	ver = []
	ver.append("%d" % random.randrange(0,10))
	ver.append("%d" % random.randrange(0,20))
	if random.randrange(0,2) == 0:
		ver.append("%d" % random.randrange(0,100))
	if location_id == 2: # the package is in the AUR
		return ".".join(ver) + "-u%d" % random.randrange(1,11)
	else:
		return ".".join(ver) + "%d" % random.randrange(1,11)
def genCategory():
	return categories[category_keys[random.randrange(0,len(category_keys))]]
def genLocation():
	return locations[location_keys[random.randrange(0,len(location_keys))]]
def genUID():
	return seen_users[user_keys[random.randrange(0,len(user_keys))]]


# load the words, and make sure there are enough words for users/pkgs
#
if DBUG: print "Grabbing words from seed file..."
fp = open(SEED_FILE, "r")
contents = fp.readlines()
fp.close()
if MAX_USERS > len(contents):
	MAX_USERS = len(contents)
if MAX_PKGS > len(contents):
	MAX_PKGS = len(contents)
if len(contents) - MAX_USERS > MAX_PKGS:
	need_dupes = 0
else:
	need_dupes = 1

# select random usernames
#
if DBUG: print "Generating random user names..."
user_id = USER_ID
while len(seen_users) < MAX_USERS:
	user = random.randrange(0, len(contents))
	word = contents[user].replace("'", "").replace(".","").replace(" ", "_")
	word = word.strip().lower()
	if not seen_users.has_key(word):
		seen_users[word] = user_id
		user_id += 1
user_keys = seen_users.keys()

# select random package names
#
if DBUG: print "Generating random package names..."
num_pkgs = PKG_ID
while len(seen_pkgs) < MAX_PKGS:
	pkg = random.randrange(0, len(contents))
	word = contents[pkg].replace("'", "").replace(".","").replace(" ", "_")
	word = word.strip().lower()
	if not need_dupes:
		if not seen_pkgs.has_key(word) and not seen_users.has_key(word):
			seen_pkgs[word] = num_pkgs
			num_pkgs += 1
	else:
		if not seen_pkgs.has_key(word):
			seen_pkgs[word] = num_pkgs
			num_pkgs += 1

# free up contents memory
#
contents = None

# Load package categories from database
#
if DBUG: print "Loading package categories/locations..."
q = "SELECT * FROM PackageCategories"
dbc.execute(q)
row = dbc.fetchone()
while row:
	categories[row[1]] = row[0]
	row = dbc.fetchone()
category_keys = categories.keys()

# Load package locations from database
#
q = "SELECT * FROM PackageLocations"
dbc.execute(q)
row = dbc.fetchone()
while row:
	locations[row[1]] = row[0]
	row = dbc.fetchone()
location_keys = locations.keys()

# done with the database
#
dbc.close()
db.close()

# developer/tu IDs
#
developers = []
trustedusers = []
has_devs = 0
has_tus = 0

# Begin by creating the User statements
#
if DBUG: print "Creating SQL statements for users.",
count = 0
for u in user_keys:
	account_type = 1  # default to normal user
	if not has_devs or not has_tus:
		account_type = random.randrange(1, 4)
		if account_type == 3 and not has_devs:
			# this will be a dev account
			#
			developers.append(seen_users[u])
			if len(developers) >= MAX_DEVS * MAX_USERS:
				has_devs = 1
		elif account_type == 2 and not has_tus:
			# this will be a trusted user account
			#
			trustedusers.append(seen_users[u])
			if len(trustedusers) >= MAX_TUS * MAX_USERS:
				has_tus = 1
		else:
			# a normal user account
			#
			pass
	
	s = "INSERT INTO Users (ID, AccountTypeID, Username, Email, Passwd) VALUES (%d, %d, '%s', '%s@example.com', MD5('%s'));\n" % (seen_users[u], account_type, u, u, u)
	out.write(s)
	if count % 10 == 0:
		if DBUG: print ".",
	count += 1
if DBUG: print "."
if DBUG:
	print "Number of developers:", len(developers)
	print "Number of trusted users:", len(trustedusers)
	print "Number of users:", (MAX_USERS-len(developers)-len(trustedusers))
	print "Number of packages:", MAX_PKGS

# Create the package statements
#
if DBUG: print "Creating SQL statements for packages.",
count = 0
for p in seen_pkgs.keys():
	NOW = int(time.time())
	if count % 2 == 0:
		muid = developers[random.randrange(0,len(developers))]
	else:
		muid = trustedusers[random.randrange(0,len(trustedusers))]
	if count % 20 == 0: # every so often, there are orphans...
		muid = 0

	location_id = genLocation()
	if location_id == 1: # unsupported pkgs don't have a maintainer
		muid = 0

	uuid = genUID() # the submitter/user

	s = "INSERT INTO Packages (ID, Name, Version, CategoryID, LocationID, SubmittedTS, SubmitterUID, MaintainerUID, AURMaintainerUID) VALUES (%d, '%s', '%s', %d, %d, %d, %d, %d, %d);\n" % (seen_pkgs[p], p, genVersion(location_id),
			genCategory(), location_id, NOW, uuid, uuid, muid)
	out.write(s)
	if count % 100 == 0:
		if DBUG: print ".",
	count += 1

	# create random comments for this package
	#
	num_comments = random.randrange(PKG_CMNTS[0], PKG_CMNTS[1])
	for i in range(0, num_comments):
		fortune = esc(commands.getoutput(FORTUNE_CMD).replace("'",""))
		now = NOW + random.randrange(400, 86400*3)
		s = "INSERT INTO PackageComments (PackageID, UsersID, Comments, CommentTS) VALUES (%d, %d, '%s', %d);\n" % (seen_pkgs[p], genUID(), fortune, now)
		out.write(s)

	if location_id == 1: # Unsupported - just a PKGBUILD and maybe other stuff
		others = random.randrange(0,3)
		s = "INSERT INTO PackageContents (PackageID, URLPath, FSPath, FileSize) VALUES (%d, '%s', '%s', %d);\n" % (seen_pkgs[p], "PKGBUILD", "/home/aur/incoming/%s/PKGBUILD" % p,
				random.randrange(0,999))
		out.write(s)
		if others == 0:
			s = "INSERT INTO PackageContents (PackageID, URLPath, FSPath, FileSize) VALUES (%d, '%s', '%s', %d);\n" % (seen_pkgs[p], "%s.patch" % p,
					"/home/aur/incoming/%s/%s.patch" % (p,p),
					random.randrange(0,999))
			out.write(s)

		elif others == 1:
			s = "INSERT INTO PackageContents (PackageID, URLPath, FSPath, FileSize) VALUES (%d, '%s', '%s', %d);\n" % (seen_pkgs[p], "%s.patch" % p,
					"/home/aur/incoming/%s/%s.patch" % (p,p),
					random.randrange(0,999))
			out.write(s)
			s = "INSERT INTO PackageContents (PackageID, URLPath, FSPath, FileSize) VALUES (%d, '%s', '%s', %d);\n" % (seen_pkgs[p], "arch.patch",
					"/home/aur/incoming/%s/arch.patch" % p,
					random.randrange(0,999))
			out.write(s)

		elif others == 2:
			s = "INSERT INTO PackageContents (PackageID, URLPath, FSPath, FileSize) VALUES (%d, '%s', '%s', %d);\n" % (seen_pkgs[p], "%s.patch" % p,
					"/home/aur/incoming/%s/%s.patch" % (p,p),
					random.randrange(0,999))
			out.write(s)
			s = "INSERT INTO PackageContents (PackageID, URLPath, FSPath, FileSize) VALUES (%d, '%s', '%s', %d);\n" % (seen_pkgs[p], "arch.patch",
					"/home/aur/incoming/%s/arch.patch" % p,
					random.randrange(0,999))
			out.write(s)
			s = "INSERT INTO PackageContents (PackageID, URLPath, FSPath, FileSize) VALUES (%d, '%s', '%s', %d);\n" % (seen_pkgs[p], "%s.install" % p,
					"/home/aur/incoming/%s/%s.install" % (p,p),
					random.randrange(0,999))
			out.write(s)

	else:
		# Create package contents
		#
		num_files = random.randrange(PKG_FILES[0], PKG_FILES[1])
		files = {}
		for f in range(num_files):
			loc = RANDOM_PATHS[random.randrange(len(RANDOM_PATHS))]
			if "lib" in loc:
				path = loc + "/lib" + p + ".so"
			elif "man" in loc:
				path = loc + "/" + p + "." + loc[-1] + ".gz"
			elif "share" in loc:
				path = loc + "/" + p + "/sounds/" + p + ".wav"
			elif "profile" in loc:
				path = loc + "/" + p + ".sh"
			elif "rc.d" in loc:
				path = loc + "/" + p
			elif "etc" in loc:
				path = loc + "/" + p + ".conf"
			elif "opt" in loc:
				path = loc + "/" + p + "/bin/" + p
			else:
				path = loc + "/" + p
			if not files.has_key(path):
				files[path] = 1
				s = "INSERT INTO PackageContents (PackageID, URLPath, FSPath, FileSize) VALUES (%d, '%s', '%s', %d);\n" % (seen_pkgs[p], os.path.basename(path), path,
						random.randrange(0,99999999))
				out.write(s)
if DBUG: print "."

# Cast votes
#
track_votes = {}
if DBUG: print "Casting votes for packages.",
count = 0
for u in user_keys:
	num_votes = random.randrange(int(len(seen_pkgs)*VOTING[0]),
			int(len(seen_pkgs)*VOTING[1]))
	pkgvote = {}
	for v in range(num_votes):
		pkg = random.randrange(0, len(seen_pkgs))
		if not pkgvote.has_key(pkg):
			s = "INSERT INTO PackageVotes (UsersID, PackageID) VALUES (%d, %d);\n" % (seen_users[u], pkg)
			pkgvote[pkg] = 1
			if not track_votes.has_key(pkg):
				track_votes[pkg] = 0
			track_votes[pkg] += 1
			out.write(s)
			if count % 100 == 0:
				if DBUG: print ".",
			count += 1

# Update statements for package votes
#
for p in track_votes.keys():
	s = "UPDATE Packages SET NumVotes = %d WHERE ID = %d;\n" % (track_votes[p], p)
	out.write(s)

# Create package dependencies and sources
#
if DBUG: print "."; print "Creating statements for package depends/sources.",
count = 0
for p in seen_pkgs.keys():
	num_deps = random.randrange(PKG_DEPS[0], PKG_DEPS[1])
	this_deps = {}
	i = 0
	while i != num_deps:
		dep = random.randrange(0, len(seen_pkgs))
		if not this_deps.has_key(dep):
			s = "INSERT INTO PackageDepends VALUES (%d, %d);\n" % (seen_pkgs[p], dep)
			out.write(s)
			i += 1

	num_sources = random.randrange(PKG_SRC[0], PKG_SRC[1])
	for i in range(num_sources):
		src_file = user_keys[random.randrange(0, len(user_keys))]
		src = "%s%s.%s/%s/%s-%s.tar.gz" % (
				RANDOM_URL[random.randrange(0,len(RANDOM_URL))],
				p, RANDOM_TLDS[random.randrange(0,len(RANDOM_TLDS))],
				RANDOM_LOCS[random.randrange(0,len(RANDOM_LOCS))],
				src_file, genVersion())
		s = "INSERT INTO PackageSources VALUES (%d, '%s');\n" % (
				seen_pkgs[p], src)
		out.write(s)

	if count % 100 == 0:
		if DBUG: print ".",
	count += 1


# close output file
#
out.write("\n")
out.close()

if DBUG: print "."
if DBUG: print "Done."