/**
 * Format Characters
 *
 * Converts straight double and single quotes to curly-quote entities,
 * and also converts apostrophes, em-dashes, ellipses, double spaces
 * after sentences, and bare ampersands.
 *
 * History: this table of ordered regex rules replaced an earlier
 * character-scanning implementation (commit ab504b8, Derek Jones,
 * 2008-09-11). The scanner handled some improper punctuation more
 * predictably but failed to render many proper punctuation cases; the
 * regex table renders all "correct" punctuation and performs better.
 *
 * The rules are applied in the order they are listed, each one to the
 * output of the previous one, so the order is significant.
 *
 * @param	string	$str	text to format
 * @return	string	the text with typographic entities substituted
 */
function format_characters($str)
{
	// Built once per request and reused on every later call
	static $table;

	if ( ! isset($table))
	{
		$table = array(
			// nested smart quotes, opening and closing
			// note that rules for grammar (English) allow only for two levels deep
			// and that single quotes are _supposed_ to always be on the outside
			// but we'll accommodate both
			'/(^|\W|\s)\'"/'				=> '$1&#8216;&#8220;',
			'/\'"(\s|\W|$)/'				=> '&#8217;&#8221;$1',
			'/(^|\W|\s)"\'/'				=> '$1&#8220;&#8216;',
			'/"\'(\s|\W|$)/'				=> '&#8221;&#8217;$1',

			// single quote smart quotes (closing first, then opening)
			'/\'(\s|\W|$)/'					=> '&#8217;$1',
			'/(^|\W|\s)\'/'					=> '$1&#8216;',

			// double quote smart quotes (closing first, then opening)
			'/"(\s|\W|$)/'					=> '&#8221;$1',
			'/(^|\W|\s)"/'					=> '$1&#8220;',

			// apostrophes: lookarounds are used so the neighbouring word
			// characters are not consumed, which lets back-to-back
			// apostrophes (rock'n'roll) all be converted
			'/(?<=\w)\'(?=\w)/'				=> '&#8217;',

			// Em dash and ellipses dots; the word character in front of
			// the dots is captured and put back so it is not lost
			'/\s?\-\-\s?/'					=> '&#8212;',
			'/(\w)\.{3}/'					=> '$1&#8230;',

			// double space after sentences
			'/(\W)\s{2}/'					=> '$1&nbsp; ',

			// ampersands, if not a character entity; the lookahead also
			// leaves the entities generated by the rules above untouched
			'/&(?!#?[a-zA-Z0-9]{2,};)/'		=> '&amp;'
		);
	}

	return preg_replace(array_keys($table), array_values($table), $str);
}