From ab504b85c56cf10b6c840ae85080d4adc117a81f Mon Sep 17 00:00:00 2001
From: Derek Jones
Date: Thu, 11 Sep 2008 17:04:30 +0000
Subject: replaced format_characters()'s replacement logic with a more
straightforward and accurate regex replacement. The former handled some odd
improper punctuation in a more predictable manner, but as a result failed to
render many proper punctuation cases correctly. The new method is a good
tradeoff, getting all "correct" punctuation rendered properly with better
performance.
---
system/libraries/Typography.php | 258 ++++++----------------------------------
1 file changed, 38 insertions(+), 220 deletions(-)
(limited to 'system/libraries/Typography.php')
diff --git a/system/libraries/Typography.php b/system/libraries/Typography.php
index dabf7a625..bc3baeb5d 100644
--- a/system/libraries/Typography.php
+++ b/system/libraries/Typography.php
@@ -235,231 +235,49 @@ class CI_Typography {
* Format Characters
*
* This function mainly converts double and single quotes
- * to entities, but since these are directional, it does
- * it based on some rules. It also converts em-dashes
- * and a couple other things.
+ * to curly entities, but it also converts em-dashes,
+ * ellipses, double spaces, and ampersands.
*/
function format_characters($str)
- {
- $table = array(
- ' "' => " “",
- '" ' => "” ",
- " '" => " ‘",
- "' " => "’ ",
-
- '>"' => ">“",
- '"<' => "”<",
- ">'" => ">‘",
- "'<" => "’<",
-
- "\"." => "”.",
- "\"," => "”,",
- "\";" => "”;",
- "\":" => "”:",
- "\"!" => "”!",
- "\"?" => "”?",
-
- ". " => ". ",
- "? " => "? ",
- "! " => "! ",
- ": " => ": ",
- );
-
- // These deal with quotes within quotes, like: "'hi here'"
- $start = 0;
- $space = array("\n", "\t", " ");
-
- while(TRUE)
- {
- $current = strpos(substr($str, $start), "\"'");
-
- if ($current === FALSE) break;
-
- $one_before = substr($str, $start+$current-1, 1);
- $one_after = substr($str, $start+$current+2, 1);
-
- if ( ! in_array($one_after, $space, TRUE) && $one_after != "<")
- {
- $str = str_replace( $one_before."\"'".$one_after,
- $one_before."“‘".$one_after,
- $str);
- }
- elseif ( ! in_array($one_before, $space, TRUE) && (in_array($one_after, $space, TRUE) OR $one_after == '<'))
- {
- $str = str_replace( $one_before."\"'".$one_after,
- $one_before."”’".$one_after,
- $str);
- }
-
- $start = $start+$current+2;
- }
-
- $start = 0;
-
- while(TRUE)
- {
- $current = strpos(substr($str, $start), "'\"");
-
- if ($current === FALSE) break;
-
- $one_before = substr($str, $start+$current-1, 1);
- $one_after = substr($str, $start+$current+2, 1);
-
- if ( in_array($one_before, $space, TRUE) && ! in_array($one_after, $space, TRUE) && $one_after != "<")
- {
- $str = str_replace( $one_before."'\"".$one_after,
- $one_before."‘“".$one_after,
- $str);
- }
- elseif ( ! in_array($one_before, $space, TRUE) && $one_before != ">")
- {
- $str = str_replace( $one_before."'\"".$one_after,
- $one_before."’”".$one_after,
- $str);
- }
-
- $start = $start+$current+2;
- }
-
- // Are there quotes within a word, as in: ("something")
- if (preg_match_all("/(.)\"(\S+?)\"(.)/", $str, $matches))
- {
- for ($i=0, $s=sizeof($matches['0']); $i < $s; ++$i)
- {
- if ( ! in_array($matches['1'][$i], $space, TRUE) && ! in_array($matches['3'][$i], $space, TRUE))
- {
- $str = str_replace( $matches['0'][$i],
- $matches['1'][$i]."“".$matches['2'][$i]."”".$matches['3'][$i],
- $str);
- }
- }
- }
-
- if (preg_match_all("/(.)\'(\S+?)\'(.)/", $str, $matches))
- {
- for ($i=0, $s=sizeof($matches['0']); $i < $s; ++$i)
- {
- if ( ! in_array($matches['1'][$i], $space, TRUE) && ! in_array($matches['3'][$i], $space, TRUE))
- {
- $str = str_replace( $matches['0'][$i],
- $matches['1'][$i]."‘".$matches['2'][$i]."’".$matches['3'][$i],
- $str);
- }
- }
- }
-
- // How about one apostrophe, as in Rick's
- $start = 0;
+ {
+ static $table;
- while(TRUE)
+ if ( ! isset($table))
{
- $current = strpos(substr($str, $start), "'");
-
- if ($current === FALSE) break;
-
- $one_before = substr($str, $start+$current-1, 1);
- $one_after = substr($str, $start+$current+1, 1);
-
- if ( ! in_array($one_before, $space, TRUE) && ! in_array($one_after, $space, TRUE))
- {
- $str = str_replace( $one_before."'".$one_after,
- $one_before."’".$one_after,
- $str);
- }
-
- $start = $start+$current+2;
- }
+ $table = array(
+ // nested smart quotes, opening and closing
+ // note that rules for grammar (English) allow only for two levels deep
+ // and that single quotes are _supposed_ to always be on the outside
+ // but we'll accommodate both
+ '/(^|\W|\s)\'"/' => '$1‘“',
+ '/\'"(\s|\W|$)/' => '’”$1',
+ '/(^|\W|\s)"\'/' => '$1“‘',
+ '/"\'(\s|\W|$)/' => '”’$1',
- // Em-dashes
- $start = 0;
- while(TRUE)
- {
- $current = strpos(substr($str, $start), "--");
-
- if ($current === FALSE) break;
-
- $one_before = substr($str, $start+$current-1, 1);
- $one_after = substr($str, $start+$current+2, 1);
- $two_before = substr($str, $start+$current-2, 1);
- $two_after = substr($str, $start+$current+3, 1);
-
- if (( ! in_array($one_before, $space, TRUE) && ! in_array($one_after, $space, TRUE))
- OR
- ( ! in_array($two_before, $space, TRUE) && ! in_array($two_after, $space, TRUE) && $one_before == ' ' && $one_after == ' ')
- )
- {
- $str = str_replace( $two_before.$one_before."--".$one_after.$two_after,
- $two_before.trim($one_before)."—".trim($one_after).$two_after,
- $str);
- }
-
- $start = $start+$current+2;
- }
-
- // Ellipsis
- $str = preg_replace("#(\w)\.\.\.(\s|<br />|</p>)#", "\\1…\\2", $str);
- $str = preg_replace("#(\s|<br />|<p>)\.\.\.(\w)#", "\\1…\\2", $str);
-
- // Run the translation array we defined above
- $str = str_replace(array_keys($table), array_values($table), $str);
-
- // If there are any stray double quotes we'll catch them here
-
- $start = 0;
-
- while(TRUE)
- {
- $current = strpos(substr($str, $start), '"');
-
- if ($current === FALSE) break;
-
- $one_before = substr($str, $start+$current-1, 1);
- $one_after = substr($str, $start+$current+1, 1);
-
- if ( ! in_array($one_after, $space, TRUE))
- {
- $str = str_replace( $one_before.'"'.$one_after,
- $one_before."“".$one_after,
- $str);
- }
- elseif( ! in_array($one_before, $space, TRUE))
- {
- $str = str_replace( $one_before."'".$one_after,
- $one_before."”".$one_after,
- $str);
- }
-
- $start = $start+$current+2;
- }
-
- $start = 0;
-
- while(TRUE)
- {
- $current = strpos(substr($str, $start), "'");
-
- if ($current === FALSE) break;
-
- $one_before = substr($str, $start+$current-1, 1);
- $one_after = substr($str, $start+$current+1, 1);
-
- if ( ! in_array($one_after, $space, TRUE))
- {
- $str = str_replace( $one_before."'".$one_after,
- $one_before."‘".$one_after,
- $str);
- }
- elseif( ! in_array($one_before, $space, TRUE))
- {
- $str = str_replace( $one_before."'".$one_after,
- $one_before."’".$one_after,
- $str);
- }
-
- $start = $start+$current+2;
- }
-
- return $str;
+ // single quote smart quotes
+ '/\'(\s|\W|$)/' => '’$1',
+ '/(^|\W|\s)\'/' => '$1‘',
+
+ // double quote smart quotes
+ '/"(\s|\W|$)/' => '”$1',
+ '/(^|\W|\s)"/' => '$1“',
+
+ // apostrophes
+ "/(\w)'(\w)/" => '$1’$2',
+
+ // Em dash and ellipses dots
+ '/\s?\-\-\s?/' => '—',
+ '/\w\.{3}/' => '…',
+
+ // double space after sentences
+ '/(\W)\s{2}/' => '$1 ',
+
+ // ampersands, if not a character entity
+ '/&(?!#?[a-zA-Z0-9]{2,};)/' => '&'
+ );
+ }
+
+ return preg_replace(array_keys($table), $table, $str);
}
// --------------------------------------------------------------------
--
cgit v1.2.3-24-g4f1b