summaryrefslogtreecommitdiffstats
path: root/extensions/SiteMapIndex
diff options
context:
space:
mode:
Diffstat (limited to 'extensions/SiteMapIndex')
-rw-r--r--extensions/SiteMapIndex/Config.pm36
-rw-r--r--extensions/SiteMapIndex/Extension.pm157
-rw-r--r--extensions/SiteMapIndex/lib/Constants.pm47
-rw-r--r--extensions/SiteMapIndex/lib/Util.pm205
-rw-r--r--extensions/SiteMapIndex/robots.txt9
-rw-r--r--extensions/SiteMapIndex/template/en/default/hook/global/header-additional_header.html.tmpl23
-rw-r--r--extensions/SiteMapIndex/template/en/default/hook/global/messages-messages.html.tmpl37
7 files changed, 514 insertions, 0 deletions
diff --git a/extensions/SiteMapIndex/Config.pm b/extensions/SiteMapIndex/Config.pm
new file mode 100644
index 000000000..e10d6ec8b
--- /dev/null
+++ b/extensions/SiteMapIndex/Config.pm
@@ -0,0 +1,36 @@
+# -*- Mode: perl; indent-tabs-mode: nil -*-
+#
+# The contents of this file are subject to the Mozilla Public
+# License Version 1.1 (the "License"); you may not use this file
+# except in compliance with the License. You may obtain a copy of
+# the License at http://www.mozilla.org/MPL/
+#
+# Software distributed under the License is distributed on an "AS
+# IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
+# implied. See the License for the specific language governing
+# rights and limitations under the License.
+#
+# The Original Code is the Sitemap Bugzilla Extension.
+#
+# The Initial Developer of the Original Code is Everything Solved, Inc.
+# Portions created by the Initial Developer are Copyright (C) 2010 the
+# Initial Developer. All Rights Reserved.
+#
+# Contributor(s):
+# Max Kanat-Alexander <mkanat@bugzilla.org>
+# Dave Lawrence <dkl@mozilla.com>
+
+package Bugzilla::Extension::SiteMapIndex;
+use strict;
+
+use constant NAME => 'SiteMapIndex';
+
+use constant REQUIRED_MODULES => [
+ {
+ package => 'IO-Compress-Gzip',
+ module => 'IO::Compress::Gzip',
+ version => 0,
+ }
+];
+
+__PACKAGE__->NAME;
diff --git a/extensions/SiteMapIndex/Extension.pm b/extensions/SiteMapIndex/Extension.pm
new file mode 100644
index 000000000..f36fa8c81
--- /dev/null
+++ b/extensions/SiteMapIndex/Extension.pm
@@ -0,0 +1,157 @@
+# -*- Mode: perl; indent-tabs-mode: nil -*-
+#
+# The contents of this file are subject to the Mozilla Public
+# License Version 1.1 (the "License"); you may not use this file
+# except in compliance with the License. You may obtain a copy of
+# the License at http://www.mozilla.org/MPL/
+#
+# Software distributed under the License is distributed on an "AS
+# IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
+# implied. See the License for the specific language governing
+# rights and limitations under the License.
+#
+# The Original Code is the Sitemap Bugzilla Extension.
+#
+# The Initial Developer of the Original Code is Everything Solved, Inc.
+# Portions created by the Initial Developer are Copyright (C) 2010 the
+# Initial Developer. All Rights Reserved.
+#
+# Contributor(s):
+# Max Kanat-Alexander <mkanat@bugzilla.org>
+# Dave Lawrence <dkl@mozilla.com>
+
+package Bugzilla::Extension::SiteMapIndex;
+use strict;
+use base qw(Bugzilla::Extension);
+
+our $VERSION = '1.0';
+
+use Bugzilla::Constants qw(bz_locations ON_WINDOWS);
+use Bugzilla::Util qw(correct_urlbase get_text);
+use Bugzilla::Install::Filesystem;
+
+use Bugzilla::Extension::SiteMapIndex::Constants;
+use Bugzilla::Extension::SiteMapIndex::Util;
+
+use DateTime;
+use IO::File;
+use POSIX;
+
+#########
+# Pages #
+#########
+
+sub template_before_process {
+ my ($self, $args) = @_;
+ my ($vars, $file) = @$args{qw(vars file)};
+
+ return if !$file eq 'global/header.html.tmpl';
+ return unless (exists $vars->{bug} or exists $vars->{bugs});
+ my $bugs = exists $vars->{bugs} ? $vars->{bugs} : [$vars->{bug}];
+ return if !ref $bugs eq 'ARRAY';
+
+ foreach my $bug (@$bugs) {
+ if (!bug_is_ok_to_index($bug)) {
+ $vars->{sitemap_noindex} = 1;
+ last;
+ }
+ }
+}
+
+sub page_before_template {
+ my ($self, $args) = @_;
+ my $page = $args->{page_id};
+
+ if ($page =~ m{^sitemap/sitemap\.}) {
+ my $map = generate_sitemap(__PACKAGE__->NAME);
+ print Bugzilla->cgi->header('text/xml');
+ print $map;
+ exit;
+ }
+}
+
+################
+# Installation #
+################
+
+sub install_before_final_checks {
+ my ($self) = @_;
+ if (!correct_urlbase()) {
+ print STDERR get_text('sitemap_no_urlbase'), "\n";
+ return;
+ }
+ if (Bugzilla->params->{'requirelogin'}) {
+ print STDERR get_text('sitemap_requirelogin'), "\n";
+ return;
+ }
+
+ $self->_fix_robots_txt();
+}
+
+sub install_filesystem {
+ my ($self, $args) = @_;
+ my $create_dirs = $args->{'create_dirs'};
+ my $recurse_dirs = $args->{'recurse_dirs'};
+ my $htaccess = $args->{'htaccess'};
+
+ # Create the sitemap directory to store the index and sitemap files
+ my $sitemap_path = bz_locations->{'datadir'} . "/" . __PACKAGE__->NAME;
+
+ $create_dirs->{$sitemap_path} = Bugzilla::Install::Filesystem::DIR_CGI_WRITE
+ | Bugzilla::Install::Filesystem::DIR_ALSO_WS_SERVE;
+
+ $recurse_dirs->{$sitemap_path} = {
+ files => Bugzilla::Install::Filesystem::CGI_WRITE
+ | Bugzilla::Install::Filesystem::DIR_ALSO_WS_SERVE,
+ dirs => Bugzilla::Install::Filesystem::DIR_CGI_WRITE
+ | Bugzilla::Install::Filesystem::DIR_ALSO_WS_SERVE
+ };
+
+ # Create a htaccess file that allows the sitemap files to be served out
+ $htaccess->{"$sitemap_path/.htaccess"} = {
+ perms => Bugzilla::Install::Filesystem::WS_SERVE,
+ contents => <<EOT
+# Allow access to sitemap files created by the SiteMapIndex extension
+<FilesMatch ^sitemap.*\\.xml(.gz)?\$>
+ Allow from all
+</FilesMatch>
+Deny from all
+EOT
+ };
+}
+
+sub _fix_robots_txt {
+ my ($self) = @_;
+ my $cgi_path = bz_locations()->{'cgi_path'};
+ my $robots_file = "$cgi_path/robots.txt";
+ my $current_fh = new IO::File("$cgi_path/robots.txt", 'r');
+ if (!$current_fh) {
+ warn "$robots_file: $!";
+ return;
+ }
+
+ my $current_contents;
+ { local $/; $current_contents = <$current_fh> }
+ $current_fh->close();
+
+ return if $current_contents =~ m{^Allow: \/\*show_bug\.cgi}ms;
+ my $backup_name = "$cgi_path/robots.txt.old";
+ print get_text('sitemap_fixing_robots', { current => $robots_file,
+ backup => $backup_name }), "\n";
+ rename $robots_file, $backup_name or die "backup failed: $!";
+
+ my $new_fh = new IO::File($self->package_dir . '/robots.txt', 'r');
+ $new_fh || die "Could not open new robots.txt template file: $!";
+ my $new_contents;
+ { local $/; $new_contents = <$new_fh> }
+ $new_fh->close() || die "Could not close new robots.txt template file: $!";
+
+ my $sitemap_url = correct_urlbase() . SITEMAP_URL;
+ $new_contents =~ s/SITEMAP_URL/$sitemap_url/;
+ $new_fh = new IO::File("$cgi_path/robots.txt", 'w');
+ $new_fh || die "Could not open new robots.txt file: $!";
+ print $new_fh $new_contents;
+ $new_fh->close() || die "Could not close new robots.txt file: $!";
+}
+
+__PACKAGE__->NAME;
diff --git a/extensions/SiteMapIndex/lib/Constants.pm b/extensions/SiteMapIndex/lib/Constants.pm
new file mode 100644
index 000000000..fce858121
--- /dev/null
+++ b/extensions/SiteMapIndex/lib/Constants.pm
@@ -0,0 +1,47 @@
+# -*- Mode: perl; indent-tabs-mode: nil -*-
+#
+# The contents of this file are subject to the Mozilla Public
+# License Version 1.1 (the "License"); you may not use this file
+# except in compliance with the License. You may obtain a copy of
+# the License at http://www.mozilla.org/MPL/
+#
+# Software distributed under the License is distributed on an "AS
+# IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
+# implied. See the License for the specific language governing
+# rights and limitations under the License.
+#
+# The Original Code is the Sitemap Bugzilla Extension.
+#
+# The Initial Developer of the Original Code is Everything Solved, Inc.
+# Portions created by the Initial Developer are Copyright (C) 2010 the
+# Initial Developer. All Rights Reserved.
+#
+# Contributor(s):
+# Max Kanat-Alexander <mkanat@bugzilla.org>
+
+package Bugzilla::Extension::SiteMapIndex::Constants;
+use strict;
+use base qw(Exporter);
+our @EXPORT = qw(
+ SITEMAP_AGE
+ SITEMAP_MAX
+ SITEMAP_DELAY
+ SITEMAP_URL
+);
+
+# This is the amount of hours a sitemap index and it's files are considered
+# valid before needing to be regenerated.
+use constant SITEMAP_AGE => 12;
+
+# This is the largest number of entries that can be in a single sitemap file,
+# per the sitemaps.org standard.
+use constant SITEMAP_MAX => 50_000;
+
+# We only show bugs that are at least 12 hours old, because if somebody
+# files a bug that's a security bug but doesn't protect it, we want to give
+# them time to fix that.
+use constant SITEMAP_DELAY => 12;
+
+use constant SITEMAP_URL => 'page.cgi?id=sitemap/sitemap.xml';
+
+1;
diff --git a/extensions/SiteMapIndex/lib/Util.pm b/extensions/SiteMapIndex/lib/Util.pm
new file mode 100644
index 000000000..b0e4c6eab
--- /dev/null
+++ b/extensions/SiteMapIndex/lib/Util.pm
@@ -0,0 +1,205 @@
+# -*- Mode: perl; indent-tabs-mode: nil -*-
+#
+# The contents of this file are subject to the Mozilla Public
+# License Version 1.1 (the "License"); you may not use this file
+# except in compliance with the License. You may obtain a copy of
+# the License at http://www.mozilla.org/MPL/
+#
+# Software distributed under the License is distributed on an "AS
+# IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
+# implied. See the License for the specific language governing
+# rights and limitations under the License.
+#
+# The Original Code is the Sitemap Bugzilla Extension.
+#
+# The Initial Developer of the Original Code is Everything Solved, Inc.
+# Portions created by the Initial Developer are Copyright (C) 2010 the
+# Initial Developer. All Rights Reserved.
+#
+# Contributor(s):
+# Max Kanat-Alexander <mkanat@bugzilla.org>
+# Dave Lawrence <dkl@mozilla.com>
+
+package Bugzilla::Extension::SiteMapIndex::Util;
+use strict;
+use base qw(Exporter);
+our @EXPORT = qw(
+ generate_sitemap
+ bug_is_ok_to_index
+);
+
+use Bugzilla::Extension::SiteMapIndex::Constants;
+
+use Bugzilla::Util qw(correct_urlbase datetime_from url_quote);
+use Bugzilla::Constants qw(bz_locations);
+
+use Scalar::Util qw(blessed);
+use IO::Compress::Gzip qw(gzip $GzipError);
+
+sub too_young_date {
+ my $hours_ago = DateTime->now(time_zone => Bugzilla->local_timezone);
+ $hours_ago->subtract(hours => SITEMAP_DELAY);
+ return $hours_ago;
+}
+
+sub bug_is_ok_to_index {
+ my ($bug) = @_;
+ return 1 unless blessed($bug) && $bug->isa('Bugzilla::Bug');
+ my $creation_ts = datetime_from($bug->creation_ts);
+ return ($creation_ts && $creation_ts lt too_young_date()) ? 1 : 0;
+}
+
+# We put two things in the Sitemap: a list of Browse links for products,
+# and links to bugs.
+sub generate_sitemap {
+ my ($extension_name) = @_;
+
+ # If file is less than SITEMAP_AGE hours old, then read in and send to caller.
+ # If greater, then regenerate and send the new version.
+ my $index_file = bz_locations->{'datadir'} . "/$extension_name/sitemap_index.xml";
+ if (-e $index_file) {
+ my $index_mtime = (stat($index_file))[9];
+ my $index_hours = sprintf("%d", (time() - $index_mtime) / 60 / 60); # in hours
+ if ($index_hours < SITEMAP_AGE) {
+ my $index_fh = new IO::File($index_file, 'r');
+ $index_fh || die "Could not open current sitemap index: $!";
+ my $index_xml;
+ { local $/; $index_xml = <$index_fh> }
+ $index_fh->close() || die "Could not close current sitemap index: $!";
+
+ return $index_xml;
+ }
+ }
+
+ # Set the atime and mtime of the index file to the current time
+ # in case another request is made before we finish.
+ utime(undef, undef, $index_file);
+
+ # Sitemaps must never contain private data.
+ Bugzilla->logout_request();
+ my $user = Bugzilla->user;
+ my $products = $user->get_accessible_products;
+
+ my $num_bugs = SITEMAP_MAX - scalar(@$products);
+ # We do this date math outside of the database because databases
+ # usually do better with a straight comparison value.
+ my $hours_ago = too_young_date();
+
+ # We don't use Bugzilla::Bug objects, because this could be a tremendous
+ # amount of data, and we only want a little. Also, we only display
+ # bugs that are not in any group. We show the last $num_bugs
+ # most-recently-updated bugs.
+ my $dbh = Bugzilla->dbh;
+ my $bug_sth = $dbh->prepare(
+ 'SELECT bugs.bug_id, bugs.delta_ts
+ FROM bugs
+ LEFT JOIN bug_group_map ON bugs.bug_id = bug_group_map.bug_id
+ WHERE bug_group_map.bug_id IS NULL AND creation_ts < ?
+ ' . $dbh->sql_limit($num_bugs, '?'));
+
+ my $filecount = 1;
+ my $filelist = [];
+ my $offset = 0;
+
+ while (1) {
+ my $bugs = [];
+
+ $bug_sth->execute($hours_ago, $offset);
+
+ while (my ($bug_id, $delta_ts) = $bug_sth->fetchrow_array()) {
+ push(@$bugs, { bug_id => $bug_id, delta_ts => $delta_ts });
+ }
+
+ last if !@$bugs;
+
+ # We only need the product links in the first sitemap file
+ $products = [] if $filecount > 1;
+
+ push(@$filelist, _generate_sitemap_file($extension_name, $filecount, $products, $bugs));
+
+ $filecount++;
+ $offset += $num_bugs;
+ }
+
+ # Generate index file
+ return _generate_sitemap_index($extension_name, $filelist);
+}
+
+sub _generate_sitemap_index {
+ my ($extension_name, $filelist) = @_;
+
+ my $dbh = Bugzilla->dbh;
+ my $timestamp = $dbh->selectrow_array(
+ "SELECT " . $dbh->sql_date_format('NOW()', '%Y-%m-%d'));
+
+ my $index_xml = <<END;
+<?xml version="1.0" encoding="UTF-8"?>
+<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+END
+
+ foreach my $filename (@$filelist) {
+ $index_xml .= "
+ <sitemap>
+ <loc>" . correct_urlbase() . "data/$extension_name/$filename</loc>
+ <lastmod>$timestamp</lastmod>
+ </sitemap>
+";
+ }
+
+ $index_xml .= <<END;
+</sitemapindex>
+END
+
+ my $index_file = bz_locations->{'datadir'} . "/$extension_name/sitemap_index.xml";
+ my $index_fh = new IO::File($index_file, 'w');
+ $index_fh || die "Could not open new sitemap index: $!";
+ print $index_fh $index_xml;
+ $index_fh->close() || die "Could not close new sitemap index: $!";
+
+ return $index_xml;
+}
+
+sub _generate_sitemap_file {
+ my ($extension_name, $filecount, $products, $bugs) = @_;
+
+ my $bug_url = correct_urlbase() . 'show_bug.cgi?id=';
+ my $product_url = correct_urlbase() . 'describecomponents.cgi?product=';
+
+ my $sitemap_xml = <<END;
+<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+END
+
+ foreach my $product (@$products) {
+ $sitemap_xml .= "
+ <url>
+ <loc>" . $product_url . url_quote($product->name) . "</loc>
+ <changefreq>daily</changefreq>
+ <priority>0.4</priority>
+ </url>
+";
+ }
+
+ foreach my $bug (@$bugs) {
+ $sitemap_xml .= "
+ <url>
+ <loc>" . $bug_url . $bug->{bug_id} . "</loc>
+ <lastmod>" . datetime_from($bug->{delta_ts}, 'UTC')->iso8601 . 'Z' . "</lastmod>
+ </url>
+";
+ }
+
+ $sitemap_xml .= <<END;
+</urlset>
+END
+
+ # Write the compressed sitemap data to a file in the cgi root so that they can
+ # be accessed by the search engines.
+ my $filename = "sitemap$filecount.xml.gz";
+ gzip \$sitemap_xml => bz_locations->{'datadir'} . "/$extension_name/$filename"
+ || die "gzip failed: $GzipError\n";
+
+ return $filename;
+}
+
+1;
diff --git a/extensions/SiteMapIndex/robots.txt b/extensions/SiteMapIndex/robots.txt
new file mode 100644
index 000000000..139edbf93
--- /dev/null
+++ b/extensions/SiteMapIndex/robots.txt
@@ -0,0 +1,9 @@
+User-agent: *
+Disallow: /*.cgi
+Disallow: /*show_bug.cgi*ctype=*
+Allow: /
+Allow: /*index.cgi
+Allow: /*page.cgi
+Allow: /*show_bug.cgi
+Allow: /*describecomponents.cgi
+Sitemap: SITEMAP_URL
diff --git a/extensions/SiteMapIndex/template/en/default/hook/global/header-additional_header.html.tmpl b/extensions/SiteMapIndex/template/en/default/hook/global/header-additional_header.html.tmpl
new file mode 100644
index 000000000..682f6093f
--- /dev/null
+++ b/extensions/SiteMapIndex/template/en/default/hook/global/header-additional_header.html.tmpl
@@ -0,0 +1,23 @@
+[%# The contents of this file are subject to the Mozilla Public
+ # License Version 1.1 (the "License"); you may not use this file
+ # except in compliance with the License. You may obtain a copy of
+ # the License at http://www.mozilla.org/MPL/
+ #
+ # Software distributed under the License is distributed on an "AS
+ # IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
+ # implied. See the License for the specific language governing
+ # rights and limitations under the License.
+ #
+ # The Initial Developer of the Original Code is Everything Solved, Inc.
+ # Portions created by Everything Solved are Copyright (C) 2010
+ # Everything Solved. All Rights Reserved.
+ #
+ # The Original Code is the Bugzilla Sitemap Extension.
+ #
+ # Contributor(s):
+ # Max Kanat-Alexander <mkanat@bugzilla.org>
+ #%]
+
+[% SET meta_robots = ['noarchive'] %]
+[% meta_robots.push('noindex') IF sitemap_noindex %]
+<meta name="robots" content="[% meta_robots.join(',') FILTER html %]">
diff --git a/extensions/SiteMapIndex/template/en/default/hook/global/messages-messages.html.tmpl b/extensions/SiteMapIndex/template/en/default/hook/global/messages-messages.html.tmpl
new file mode 100644
index 000000000..0d0e9fd74
--- /dev/null
+++ b/extensions/SiteMapIndex/template/en/default/hook/global/messages-messages.html.tmpl
@@ -0,0 +1,37 @@
+[%# The contents of this file are subject to the Mozilla Public
+ # License Version 1.1 (the "License"); you may not use this file
+ # except in compliance with the License. You may obtain a copy of
+ # the License at http://www.mozilla.org/MPL/
+ #
+ # Software distributed under the License is distributed on an "AS
+ # IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
+ # implied. See the License for the specific language governing
+ # rights and limitations under the License.
+ #
+ # The Initial Developer of the Original Code is Everything Solved, Inc.
+ # Portions created by Everything Solved are Copyright (C) 2010
+ # Everything Solved. All Rights Reserved.
+ #
+ # The Original Code is the Bugzilla Sitemap Extension.
+ #
+ # Contributor(s):
+ # Max Kanat-Alexander <mkanat@bugzilla.org>
+ #%]
+
+[% IF message_tag == "sitemap_fixing_robots" %]
+ Replacing [% current FILTER html %]. (The old version will be saved
+ as "[% backup FILTER html %]". You can delete the old version if you
+ do not need its contents.)
+
+[% ELSIF message_tag == "sitemap_requirelogin" %]
+ Not updating search engines with your sitemap, because you have the
+ "requirelogin" parameter turned on, and so search engines will not be
+ able to access your sitemap.
+
+[% ELSIF message_tag == "sitemap_no_urlbase" %]
+ You have not yet set the "urlbase" parameter. We cannot update
+ search engines and inform them about your sitemap without a
+ urlbase. Please set the "urlbase" parameter and re-run
+ checksetup.pl.
+
+[% END %]