diff options
Diffstat (limited to 'extensions/SiteMapIndex')
-rw-r--r-- | extensions/SiteMapIndex/Config.pm | 36 | ||||
-rw-r--r-- | extensions/SiteMapIndex/Extension.pm | 157 | ||||
-rw-r--r-- | extensions/SiteMapIndex/lib/Constants.pm | 47 | ||||
-rw-r--r-- | extensions/SiteMapIndex/lib/Util.pm | 205 | ||||
-rw-r--r-- | extensions/SiteMapIndex/robots.txt | 9 | ||||
-rw-r--r-- | extensions/SiteMapIndex/template/en/default/hook/global/header-additional_header.html.tmpl | 23 | ||||
-rw-r--r-- | extensions/SiteMapIndex/template/en/default/hook/global/messages-messages.html.tmpl | 37 |
7 files changed, 514 insertions, 0 deletions
package Bugzilla::Extension::SiteMapIndex;
use strict;

# Extension configuration: declares the extension's name and the Perl
# modules it depends on. Both values are plain compile-time constants.
use constant {
    # Name of this extension.
    NAME => 'SiteMapIndex',

    # Modules required by this extension. IO::Compress::Gzip is listed
    # with version 0, i.e. no minimum version is demanded.
    REQUIRED_MODULES => [
        {
            package => 'IO-Compress-Gzip',
            module  => 'IO::Compress::Gzip',
            version => 0,
        },
    ],
};

# The file's final (true) value is the extension name.
__PACKAGE__->NAME;
package Bugzilla::Extension::SiteMapIndex;
use strict;
use base qw(Bugzilla::Extension);

our $VERSION = '1.0';

use Bugzilla::Constants qw(bz_locations ON_WINDOWS);
use Bugzilla::Util qw(correct_urlbase get_text);
use Bugzilla::Install::Filesystem;

use Bugzilla::Extension::SiteMapIndex::Constants;
use Bugzilla::Extension::SiteMapIndex::Util;

use DateTime;
use IO::File;
use POSIX;

#########
# Pages #
#########

# Sets $vars->{sitemap_noindex} when the global header is rendered for a
# page showing at least one bug that bug_is_ok_to_index() rejects. The
# header-additional_header hook template turns that flag into a
# "noindex" robots meta value.
#
# $args is a hash ref; only its 'vars' and 'file' entries are used.
sub template_before_process {
    my ($self, $args) = @_;
    my ($vars, $file) = @$args{qw(vars file)};

    # Only act on the global header template. This must be a plain "ne":
    # "!$file eq '...'" parses as "(!$file) eq '...'", which is never
    # true, so the guard would never fire.
    return if $file ne 'global/header.html.tmpl';
    return unless (exists $vars->{bug} or exists $vars->{bugs});

    # Normalize to a list: pages expose either one 'bug' or many 'bugs'.
    my $bugs = exists $vars->{bugs} ? $vars->{bugs} : [$vars->{bug}];

    # Same precedence trap as above: "!ref $bugs eq 'ARRAY'" compares the
    # negation with 'ARRAY' and is always false.
    return if ref $bugs ne 'ARRAY';

    foreach my $bug (@$bugs) {
        if (!bug_is_ok_to_index($bug)) {
            $vars->{sitemap_noindex} = 1;
            last;
        }
    }
}

# Serves the sitemap index for page ids starting with "sitemap/sitemap.":
# prints a text/xml header followed by the XML from generate_sitemap(),
# then exits. Any other page id is left alone.
#
# $args is a hash ref; only its 'page_id' entry is used.
sub page_before_template {
    my ($self, $args) = @_;
    my $page = $args->{page_id};

    if ($page =~ m{^sitemap/sitemap\.}) {
        my $map = generate_sitemap(__PACKAGE__->NAME);
        print Bugzilla->cgi->header('text/xml');
        print $map;
        exit;
    }
}

################
# Installation #
################

# Rewrites robots.txt via _fix_robots_txt(), unless the installation
# cannot usefully advertise a sitemap: no urlbase is set, or the
# 'requirelogin' parameter is on. In both cases a message is printed to
# STDERR and nothing is changed.
sub install_before_final_checks {
    my ($self) = @_;
    if (!correct_urlbase()) {
        print STDERR get_text('sitemap_no_urlbase'), "\n";
        return;
    }
    if (Bugzilla->params->{'requirelogin'}) {
        print STDERR get_text('sitemap_requirelogin'), "\n";
        return;
    }

    $self->_fix_robots_txt();
}

# Registers the sitemap storage directory (<datadir>/<extension name>),
# its permissions, and an .htaccess file for it, by adding entries to the
# 'create_dirs', 'recurse_dirs' and 'htaccess' hash refs found in $args.
sub install_filesystem {
    my ($self, $args) = @_;
    my $create_dirs  = $args->{'create_dirs'};
    my $recurse_dirs = $args->{'recurse_dirs'};
    my $htaccess     = $args->{'htaccess'};

    # Create the sitemap directory to store the index and sitemap files
    my $sitemap_path = bz_locations->{'datadir'} . "/" . __PACKAGE__->NAME;

    $create_dirs->{$sitemap_path} = Bugzilla::Install::Filesystem::DIR_CGI_WRITE
                                    | Bugzilla::Install::Filesystem::DIR_ALSO_WS_SERVE;

    $recurse_dirs->{$sitemap_path} = {
        files => Bugzilla::Install::Filesystem::CGI_WRITE
                 | Bugzilla::Install::Filesystem::DIR_ALSO_WS_SERVE,
        dirs  => Bugzilla::Install::Filesystem::DIR_CGI_WRITE
                 | Bugzilla::Install::Filesystem::DIR_ALSO_WS_SERVE
    };

    # Create a htaccess file that allows the sitemap files to be served out.
    # The heredoc interpolates, so "\\." emits a literal "\." and "\$"
    # emits "$". The dot before "gz" is escaped too, so that only a real
    # ".gz" suffix matches rather than any character followed by "gz".
    $htaccess->{"$sitemap_path/.htaccess"} = {
        perms    => Bugzilla::Install::Filesystem::WS_SERVE,
        contents => <<EOT
# Allow access to sitemap files created by the SiteMapIndex extension
<FilesMatch ^sitemap.*\\.xml(\\.gz)?\$>
  Allow from all
</FilesMatch>
Deny from all
EOT
    };
}

# Replaces <cgi_path>/robots.txt with this extension's robots.txt
# template, substituting the SITEMAP_URL placeholder with the absolute
# sitemap URL. The previous file is kept as robots.txt.old.
#
# Does nothing when the current robots.txt cannot be read (a warning is
# emitted) or when it already contains an "Allow: /*show_bug.cgi" line.
# Dies if the template cannot be read, the backup rename fails, or the
# new file cannot be written.
sub _fix_robots_txt {
    my ($self) = @_;
    my $cgi_path = bz_locations()->{'cgi_path'};
    my $robots_file = "$cgi_path/robots.txt";
    my $current_fh = IO::File->new($robots_file, 'r');
    if (!$current_fh) {
        warn "$robots_file: $!";
        return;
    }

    my $current_contents;
    { local $/; $current_contents = <$current_fh> }
    $current_fh->close();

    # Already updated: nothing to do.
    return if $current_contents =~ m{^Allow: \/\*show_bug\.cgi}ms;

    # Load and fill in the replacement BEFORE moving the live file out of
    # the way, so a missing or unreadable template cannot leave the
    # installation without any robots.txt.
    my $template_fh = IO::File->new($self->package_dir . '/robots.txt', 'r');
    $template_fh || die "Could not open new robots.txt template file: $!";
    my $new_contents;
    { local $/; $new_contents = <$template_fh> }
    $template_fh->close() || die "Could not close new robots.txt template file: $!";

    my $sitemap_url = correct_urlbase() . SITEMAP_URL;
    $new_contents =~ s/SITEMAP_URL/$sitemap_url/;

    my $backup_name = "$cgi_path/robots.txt.old";
    print get_text('sitemap_fixing_robots', { current => $robots_file,
                                              backup  => $backup_name }), "\n";
    rename $robots_file, $backup_name or die "backup failed: $!";

    my $new_fh = IO::File->new($robots_file, 'w');
    $new_fh || die "Could not open new robots.txt file: $!";
    print $new_fh $new_contents;
    $new_fh->close() || die "Could not close new robots.txt file: $!";
}

__PACKAGE__->NAME;

# ---------------------------------------------------------------------
# extensions/SiteMapIndex/lib/Constants.pm
# ---------------------------------------------------------------------
# -*- Mode: perl; indent-tabs-mode: nil -*-
#
# The contents of this file are subject to the Mozilla Public
# License Version 1.1 (the "License"); you may not use this file
# except in compliance with the License. You may obtain a copy of
# the License at http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS
# IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
# implied. See the License for the specific language governing
# rights and limitations under the License.
#
# The Original Code is the Sitemap Bugzilla Extension.
#
# The Initial Developer of the Original Code is Everything Solved, Inc.
# Portions created by the Initial Developer are Copyright (C) 2010 the
# Initial Developer. All Rights Reserved.
#
# Contributor(s):
#   Max Kanat-Alexander <mkanat@bugzilla.org>

package Bugzilla::Extension::SiteMapIndex::Constants;
use strict;
use base qw(Exporter);
our @EXPORT = qw(
    SITEMAP_AGE
    SITEMAP_MAX
    SITEMAP_DELAY
    SITEMAP_URL
);

# This is the number of hours a sitemap index and its files are considered
# valid before needing to be regenerated.
use constant SITEMAP_AGE => 12;

# This is the largest number of entries that can be in a single sitemap file,
# per the sitemaps.org standard.
use constant SITEMAP_MAX => 50_000;

# We only show bugs that are at least 12 hours old, because if somebody
# files a bug that's a security bug but doesn't protect it, we want to give
# them time to fix that.
use constant SITEMAP_DELAY => 12;

# Location of the sitemap index, relative to the urlbase.
use constant SITEMAP_URL => 'page.cgi?id=sitemap/sitemap.xml';

1;
package Bugzilla::Extension::SiteMapIndex::Util;
use strict;
use base qw(Exporter);
our @EXPORT = qw(
    generate_sitemap
    bug_is_ok_to_index
);

use Bugzilla::Extension::SiteMapIndex::Constants;

use Bugzilla::Util qw(correct_urlbase datetime_from url_quote);
use Bugzilla::Constants qw(bz_locations);

# DateTime and IO::File are used directly below, so load them here rather
# than relying on some other module having loaded them first.
use DateTime;
use IO::File;
use Scalar::Util qw(blessed);
use IO::Compress::Gzip qw(gzip $GzipError);

# Returns a DateTime (in Bugzilla's local time zone) set SITEMAP_DELAY
# hours before now. Bugs created at or after this moment are considered
# too young to be indexed.
sub too_young_date {
    my $hours_ago = DateTime->now(time_zone => Bugzilla->local_timezone);
    $hours_ago->subtract(hours => SITEMAP_DELAY);
    return $hours_ago;
}

# Returns 1 if search engines may index $bug, 0 otherwise.
#
# Anything that is not a Bugzilla::Bug object is reported as indexable
# (returns 1). A bug is indexable only when its creation timestamp can be
# parsed and is older than too_young_date().
sub bug_is_ok_to_index {
    my ($bug) = @_;
    return 1 unless blessed($bug) && $bug->isa('Bugzilla::Bug');
    my $creation_ts = datetime_from($bug->creation_ts);
    return ($creation_ts && $creation_ts lt too_young_date()) ? 1 : 0;
}

# Returns the sitemap index XML as a string, regenerating the index and
# the compressed sitemap files under <datadir>/$extension_name when the
# cached index is missing or at least SITEMAP_AGE hours old.
#
# We put two things in the Sitemap: a list of Browse links for products,
# and links to bugs.
#
# Side effects: logs out the current request before collecting data,
# and writes files. Dies if a file cannot be read or written.
sub generate_sitemap {
    my ($extension_name) = @_;

    # If file is less than SITEMAP_AGE hours old, then read in and send to caller.
    # If greater, then regenerate and send the new version.
    my $index_file = bz_locations->{'datadir'} . "/$extension_name/sitemap_index.xml";
    if (-e $index_file) {
        my $index_mtime = (stat($index_file))[9];
        my $index_hours = sprintf("%d", (time() - $index_mtime) / 60 / 60); # in hours
        if ($index_hours < SITEMAP_AGE) {
            my $index_fh = IO::File->new($index_file, 'r');
            $index_fh || die "Could not open current sitemap index: $!";
            my $index_xml;
            { local $/; $index_xml = <$index_fh> }
            $index_fh->close() || die "Could not close current sitemap index: $!";

            return $index_xml;
        }
    }

    # Set the atime and mtime of the index file to the current time
    # in case another request is made before we finish.
    utime(undef, undef, $index_file);

    # Sitemaps must never contain private data.
    Bugzilla->logout_request();
    my $user = Bugzilla->user;
    my $products = $user->get_accessible_products;

    # Leave room for the product links so a file never exceeds SITEMAP_MAX.
    my $num_bugs = SITEMAP_MAX - scalar(@$products);
    # We do this date math outside of the database because databases
    # usually do better with a straight comparison value.
    my $hours_ago = too_young_date();

    # We don't use Bugzilla::Bug objects, because this could be a tremendous
    # amount of data, and we only want a little. Also, we only display
    # bugs that are not in any group. Bugs are fetched $num_bugs at a time;
    # the explicit ORDER BY keeps the LIMIT/OFFSET pages stable, so no bug
    # is repeated or skipped between sitemap files.
    my $dbh = Bugzilla->dbh;
    my $bug_sth = $dbh->prepare(
        'SELECT bugs.bug_id, bugs.delta_ts
           FROM bugs
                LEFT JOIN bug_group_map ON bugs.bug_id = bug_group_map.bug_id
          WHERE bug_group_map.bug_id IS NULL AND creation_ts < ?
          ORDER BY bugs.bug_id
          ' . $dbh->sql_limit($num_bugs, '?'));

    my $filecount = 1;
    my $filelist = [];
    my $offset = 0;

    while (1) {
        my $bugs = [];

        $bug_sth->execute($hours_ago, $offset);

        while (my ($bug_id, $delta_ts) = $bug_sth->fetchrow_array()) {
            push(@$bugs, { bug_id => $bug_id, delta_ts => $delta_ts });
        }

        # We only need the product links in the first sitemap file
        $products = [] if $filecount > 1;

        # Stop once a page has nothing to write. The first file is still
        # produced when there are product links but no indexable bugs, so
        # those links are not silently dropped.
        last if !@$bugs && !@$products;

        push(@$filelist, _generate_sitemap_file($extension_name, $filecount, $products, $bugs));

        $filecount++;
        $offset += $num_bugs;
    }

    # Generate index file
    return _generate_sitemap_index($extension_name, $filelist);
}

# Builds the sitemap index XML listing every file name in $filelist (an
# array ref), writes it to <datadir>/$extension_name/sitemap_index.xml,
# and returns the XML string. Each entry's <lastmod> is the current date
# as reported by the database. Dies if the index cannot be written.
sub _generate_sitemap_index {
    my ($extension_name, $filelist) = @_;

    my $dbh = Bugzilla->dbh;
    my $timestamp = $dbh->selectrow_array(
        "SELECT " . $dbh->sql_date_format('NOW()', '%Y-%m-%d'));

    my $index_xml = <<END;
<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
END

    foreach my $filename (@$filelist) {
        $index_xml .= "
  <sitemap>
    <loc>" . correct_urlbase() . "data/$extension_name/$filename</loc>
    <lastmod>$timestamp</lastmod>
  </sitemap>
";
    }

    $index_xml .= <<END;
</sitemapindex>
END

    my $index_file = bz_locations->{'datadir'} . "/$extension_name/sitemap_index.xml";
    my $index_fh = IO::File->new($index_file, 'w');
    $index_fh || die "Could not open new sitemap index: $!";
    print $index_fh $index_xml;
    $index_fh->close() || die "Could not close new sitemap index: $!";

    return $index_xml;
}

# Writes one gzip-compressed sitemap file, sitemap<$filecount>.xml.gz,
# into <datadir>/$extension_name and returns its file name.
#
#   $products - array ref of objects with a name() method; each becomes a
#               describecomponents.cgi link
#   $bugs     - array ref of { bug_id => ..., delta_ts => ... } hashes;
#               each becomes a show_bug.cgi link
#
# Dies if compression or writing fails.
sub _generate_sitemap_file {
    my ($extension_name, $filecount, $products, $bugs) = @_;

    my $bug_url = correct_urlbase() . 'show_bug.cgi?id=';
    my $product_url = correct_urlbase() . 'describecomponents.cgi?product=';

    my $sitemap_xml = <<END;
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
END

    foreach my $product (@$products) {
        $sitemap_xml .= "
  <url>
    <loc>" . $product_url . url_quote($product->name) . "</loc>
    <changefreq>daily</changefreq>
    <priority>0.4</priority>
  </url>
";
    }

    foreach my $bug (@$bugs) {
        $sitemap_xml .= "
  <url>
    <loc>" . $bug_url . $bug->{bug_id} . "</loc>
    <lastmod>" . datetime_from($bug->{delta_ts}, 'UTC')->iso8601 . 'Z' . "</lastmod>
  </url>
";
    }

    $sitemap_xml .= <<END;
</urlset>
END

    # Write the compressed sitemap data to a file in the extension's data
    # directory so that it can be accessed by the search engines.
    my $filename = "sitemap$filecount.xml.gz";
    my $output_path = bz_locations->{'datadir'} . "/$extension_name/$filename";

    # The call is parenthesized and checked with low-precedence "or":
    # with "||" the check binds to the (always true) path string and a
    # gzip failure would go unnoticed.
    gzip(\$sitemap_xml => $output_path)
        or die "gzip failed: $GzipError\n";

    return $filename;
}

1;
+ # + # Contributor(s): + # Max Kanat-Alexander <mkanat@bugzilla.org> + #%] + +[% SET meta_robots = ['noarchive'] %] +[% meta_robots.push('noindex') IF sitemap_noindex %] +<meta name="robots" content="[% meta_robots.join(',') FILTER html %]"> diff --git a/extensions/SiteMapIndex/template/en/default/hook/global/messages-messages.html.tmpl b/extensions/SiteMapIndex/template/en/default/hook/global/messages-messages.html.tmpl new file mode 100644 index 000000000..0d0e9fd74 --- /dev/null +++ b/extensions/SiteMapIndex/template/en/default/hook/global/messages-messages.html.tmpl @@ -0,0 +1,37 @@ +[%# The contents of this file are subject to the Mozilla Public + # License Version 1.1 (the "License"); you may not use this file + # except in compliance with the License. You may obtain a copy of + # the License at http://www.mozilla.org/MPL/ + # + # Software distributed under the License is distributed on an "AS + # IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + # implied. See the License for the specific language governing + # rights and limitations under the License. + # + # The Initial Developer of the Original Code is Everything Solved, Inc. + # Portions created by Everything Solved are Copyright (C) 2010 + # Everything Solved. All Rights Reserved. + # + # The Original Code is the Bugzilla Sitemap Extension. + # + # Contributor(s): + # Max Kanat-Alexander <mkanat@bugzilla.org> + #%] + +[% IF message_tag == "sitemap_fixing_robots" %] + Replacing [% current FILTER html %]. (The old version will be saved + as "[% backup FILTER html %]". You can delete the old version if you + do not need its contents.) + +[% ELSIF message_tag == "sitemap_requirelogin" %] + Not updating search engines with your sitemap, because you have the + "requirelogin" parameter turned on, and so search engines will not be + able to access your sitemap. + +[% ELSIF message_tag == "sitemap_no_urlbase" %] + You have not yet set the "urlbase" parameter. 
We cannot update + search engines and inform them about your sitemap without a + urlbase. Please set the "urlbase" parameter and re-run + checksetup.pl. + +[% END %] |