diff options
Diffstat (limited to 'extensions/SiteMapIndex/lib')
-rw-r--r-- | extensions/SiteMapIndex/lib/Constants.pm | 47 | ||||
-rw-r--r-- | extensions/SiteMapIndex/lib/Util.pm | 205 |
2 files changed, 252 insertions, 0 deletions
diff --git a/extensions/SiteMapIndex/lib/Constants.pm b/extensions/SiteMapIndex/lib/Constants.pm new file mode 100644 index 000000000..fce858121 --- /dev/null +++ b/extensions/SiteMapIndex/lib/Constants.pm @@ -0,0 +1,47 @@ +# -*- Mode: perl; indent-tabs-mode: nil -*- +# +# The contents of this file are subject to the Mozilla Public +# License Version 1.1 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of +# the License at http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS +# IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or +# implied. See the License for the specific language governing +# rights and limitations under the License. +# +# The Original Code is the Sitemap Bugzilla Extension. +# +# The Initial Developer of the Original Code is Everything Solved, Inc. +# Portions created by the Initial Developer are Copyright (C) 2010 the +# Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Max Kanat-Alexander <mkanat@bugzilla.org> + +package Bugzilla::Extension::SiteMapIndex::Constants; +use strict; +use base qw(Exporter); +our @EXPORT = qw( + SITEMAP_AGE + SITEMAP_MAX + SITEMAP_DELAY + SITEMAP_URL +); + +# This is the amount of hours a sitemap index and it's files are considered +# valid before needing to be regenerated. +use constant SITEMAP_AGE => 12; + +# This is the largest number of entries that can be in a single sitemap file, +# per the sitemaps.org standard. +use constant SITEMAP_MAX => 50_000; + +# We only show bugs that are at least 12 hours old, because if somebody +# files a bug that's a security bug but doesn't protect it, we want to give +# them time to fix that. +use constant SITEMAP_DELAY => 12; + +use constant SITEMAP_URL => 'page.cgi?id=sitemap/sitemap.xml'; + +1; diff --git a/extensions/SiteMapIndex/lib/Util.pm b/extensions/SiteMapIndex/lib/Util.pm new file mode 100644 index 000000000..3c322d8c7 --- /dev/null +++ b/extensions/SiteMapIndex/lib/Util.pm @@ -0,0 +1,205 @@ +# -*- Mode: perl; indent-tabs-mode: nil -*- +# +# The contents of this file are subject to the Mozilla Public +# License Version 1.1 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of +# the License at http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS +# IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or +# implied. See the License for the specific language governing +# rights and limitations under the License. +# +# The Original Code is the Sitemap Bugzilla Extension. +# +# The Initial Developer of the Original Code is Everything Solved, Inc. +# Portions created by the Initial Developer are Copyright (C) 2010 the +# Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Max Kanat-Alexander <mkanat@bugzilla.org> +# Dave Lawrence <dkl@mozilla.com> + +package Bugzilla::Extension::SiteMapIndex::Util; +use strict; +use base qw(Exporter); +our @EXPORT = qw( + generate_sitemap + bug_is_ok_to_index +); + +use Bugzilla::Extension::SiteMapIndex::Constants; + +use Bugzilla::Util qw(correct_urlbase datetime_from url_quote); +use Bugzilla::Constants qw(bz_locations); + +use Scalar::Util qw(blessed); +use IO::Compress::Gzip qw(gzip $GzipError); + +sub too_young_date { + my $hours_ago = DateTime->now(time_zone => Bugzilla->local_timezone); + $hours_ago->subtract(hours => SITEMAP_DELAY); + return $hours_ago; +} + +sub bug_is_ok_to_index { + my ($bug) = @_; + return 1 unless blessed($bug) && $bug->isa('Bugzilla::Bug'); + my $creation_ts = datetime_from($bug->creation_ts); + return ($creation_ts lt too_young_date()) ? 1 : 0; +} + +# We put two things in the Sitemap: a list of Browse links for products, +# and links to bugs. +sub generate_sitemap { + my ($extension_name) = @_; + + # If file is less than SITEMAP_AGE hours old, then read in and send to caller. + # If greater, then regenerate and send the new version. + my $index_file = bz_locations->{'datadir'} . "/$extension_name/sitemap_index.xml"; + if (-e $index_file) { + my $index_mtime = (stat($index_file))[9]; + my $index_hours = sprintf("%d", (time() - $index_mtime) / 60 / 60); # in hours + if ($index_hours < SITEMAP_AGE) { + my $index_fh = new IO::File($index_file, 'r'); + $index_fh || die "Could not open current sitemap index: $!"; + my $index_xml; + { local $/; $index_xml = <$index_fh> } + $index_fh->close() || die "Could not close current sitemap index: $!"; + + return $index_xml; + } + } + + # Set the atime and mtime of the index file to the current time + # in case another request is made before we finish. + utime(undef, undef, $index_file); + + # Sitemaps must never contain private data. + Bugzilla->logout_request(); + my $user = Bugzilla->user; + my $products = $user->get_accessible_products; + + my $num_bugs = SITEMAP_MAX - scalar(@$products); + # We do this date math outside of the database because databases + # usually do better with a straight comparison value. + my $hours_ago = too_young_date(); + + # We don't use Bugzilla::Bug objects, because this could be a tremendous + # amount of data, and we only want a little. Also, we only display + # bugs that are not in any group. We show the last $num_bugs + # most-recently-updated bugs. + my $dbh = Bugzilla->dbh; + my $bug_sth = $dbh->prepare( + 'SELECT bugs.bug_id, bugs.delta_ts + FROM bugs + LEFT JOIN bug_group_map ON bugs.bug_id = bug_group_map.bug_id + WHERE bug_group_map.bug_id IS NULL AND creation_ts < ? + ' . $dbh->sql_limit($num_bugs, '?')); + + my $filecount = 1; + my $filelist = []; + my $offset = 0; + + while (1) { + my $bugs = []; + + $bug_sth->execute($hours_ago, $offset); + + while (my ($bug_id, $delta_ts) = $bug_sth->fetchrow_array()) { + push(@$bugs, { bug_id => $bug_id, delta_ts => $delta_ts }); + } + + last if !@$bugs; + + # We only need the product links in the first sitemap file + $products = [] if $filecount > 1; + + push(@$filelist, _generate_sitemap_file($extension_name, $filecount, $products, $bugs)); + + $filecount++; + $offset += $num_bugs; + } + + # Generate index file + return _generate_sitemap_index($extension_name, $filelist); +} + +sub _generate_sitemap_index { + my ($extension_name, $filelist) = @_; + + my $dbh = Bugzilla->dbh; + my $timestamp = $dbh->selectrow_array( + "SELECT " . $dbh->sql_date_format('NOW()', '%Y-%m-%d')); + + my $index_xml = <<END; +<?xml version="1.0" encoding="UTF-8"?> +<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> +END + + foreach my $filename (@$filelist) { + $index_xml .= " + <sitemap> + <loc>" . correct_urlbase() . "data/$extension_name/$filename</loc> + <lastmod>$timestamp</lastmod> + </sitemap> +"; + } + + $index_xml .= <<END; +</sitemapindex> +END + + my $index_file = bz_locations->{'datadir'} . "/$extension_name/sitemap_index.xml"; + my $index_fh = new IO::File($index_file, 'w'); + $index_fh || die "Could not open new sitemap index: $!"; + print $index_fh $index_xml; + $index_fh->close() || die "Could not close new sitemap index: $!"; + + return $index_xml; +} + +sub _generate_sitemap_file { + my ($extension_name, $filecount, $products, $bugs) = @_; + + my $bug_url = correct_urlbase() . 'show_bug.cgi?id='; + my $product_url = correct_urlbase() . 'describecomponents.cgi?product='; + + my $sitemap_xml = <<END; +<?xml version="1.0" encoding="UTF-8"?> +<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> +END + + foreach my $product (@$products) { + $sitemap_xml .= " + <url> + <loc>" . $product_url . url_quote($product->name) . "</loc> + <changefreq>daily</changefreq> + <priority>0.4</priority> + </url> +"; + } + + foreach my $bug (@$bugs) { + $sitemap_xml .= " + <url> + <loc>" . $bug_url . $bug->{bug_id} . "</loc> + <lastmod>" . datetime_from($bug->{delta_ts}, 'UTC')->iso8601 . 'Z' . "</lastmod> + </url> +"; + } + + $sitemap_xml .= <<END; +</urlset> +END + + # Write the compressed sitemap data to a file in the cgi root so that they can + # be accessed by the search engines. + my $filename = "sitemap$filecount.xml.gz"; + gzip \$sitemap_xml => bz_locations->{'datadir'} . "/$extension_name/$filename" + || die "gzip failed: $GzipError\n"; + + return $filename; +} + +1; |