From 303c9cb958a52ef2d13e985da397cb49590113b0 Mon Sep 17 00:00:00 2001 From: Derek Jones Date: Thu, 12 Jul 2007 19:12:37 +0000 Subject: added attribute and html entity decode callbacks to xss_clean() --- system/libraries/Input.php | 80 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 69 insertions(+), 11 deletions(-) diff --git a/system/libraries/Input.php b/system/libraries/Input.php index fcca722b7..ba94d854f 100644 --- a/system/libraries/Input.php +++ b/system/libraries/Input.php @@ -519,7 +519,7 @@ class CI_Input { * @param string * @return string */ - function xss_clean($str, $charset = 'ISO-8859-1') + function xss_clean($str) { /* * Remove Null Characters @@ -564,23 +564,46 @@ class CI_Input { $str = str_replace('9u3iovBnRThju941s89rKozm', "%20", $str); /* - * Convert character entities to ASCII + * Convert character entities to ASCII * * This permits our tests below to work reliably. * We only convert entities that are within tags since * these are the ones that will pose security problems. 
* */ - if (preg_match_all("/<(.+?)>/si", $str, $matches)) - { - for ($i = 0; $i < count($matches['0']); $i++) + + $str = preg_replace_callback("/[a-z]+=([\'\"]).*?\\1/si", array($this, '_attribute_conversion'), $str); + + $str = preg_replace_callback("/<([\w]+)[^>]*>/si", array($this, '_html_entity_decode_callback'), $str); + + /* + + Old Code that when modified to use preg_replace()'s above became more efficient memory-wise + + if (preg_match_all("/[a-z]+=([\'\"]).*?\\1/si", $str, $matches)) + { + for ($i = 0; $i < count($matches[0]); $i++) + { + if (stristr($matches[0][$i], '>')) + { + $str = str_replace( $matches['0'][$i], + str_replace('>', '&lt;', $matches[0][$i]), + $str); + } + } + } + + if (preg_match_all("/<([\w]+)[^>]*>/si", $str, $matches)) + { + for ($i = 0; $i < count($matches[0]); $i++) { - $str = str_replace($matches['1'][$i], - $this->_html_entity_decode($matches['1'][$i], $charset), + $str = str_replace($matches[0][$i], + $this->_html_entity_decode($matches[0][$i], $charset), $str); } } - + */ + /* * Convert all tabs to spaces * @@ -801,7 +824,42 @@ class CI_Input { } // -------------------------------------------------------------------- - + + /** + * Attribute Conversion + * + * Used as a callback for XSS Clean + * + * @access public + * @param array + * @return string + */ + function _attribute_conversion($match) + { + return str_replace('>', '&lt;', $match[0]); + } + + // -------------------------------------------------------------------- + + /** + * HTML Entity Decode Callback + * + * Used as a callback for XSS Clean + * + * @access public + * @param array + * @return string + */ + function _html_entity_decode_callback($match) + { + $CI =& get_instance(); + $charset = $CI->config->item('charset'); + + return $this->_html_entity_decode($match[0], strtoupper($charset)); + } + + // -------------------------------------------------------------------- + /** * HTML Entities Decode * @@ -826,10 +884,10 @@ class CI_Input { character set, and the PHP 
developers said they were not back porting the fix to versions other than PHP 5.x. */ - function _html_entity_decode($str, $charset='ISO-8859-1') + function _html_entity_decode($str, $charset='UTF-8') { if (stristr($str, '&') === FALSE) return $str; - + // The reason we are not using html_entity_decode() by itself is because // while it is not technically correct to leave out the semicolon // at the end of an entity most browsers will still interpret the entity -- cgit v1.2.3-24-g4f1b