Further improvements to xss_clean()
diff --git a/system/libraries/Input.php b/system/libraries/Input.php
index 0b05c54..783446a 100644
--- a/system/libraries/Input.php
+++ b/system/libraries/Input.php
@@ -511,7 +511,7 @@
 	 * @param	string

 	 * @return	string

 	 */

-	function xss_clean($str, $is_image = FALSE, $loops = 0, $looped_convert = '')

+	function xss_clean($str, $is_image = FALSE)

 	{

 		/*

 		 * Is the string an array?

@@ -528,22 +528,9 @@
 		}

 

 		/*

-		 * Runaway loop prevention.  If the text has had to be examined this many times

-		 * I think it's safe to say that it is best to simply ignore it.

+		 * Remove Invisible Characters

 		 */

-		if ($loops > 9)

-		{

-			return '';

-		}

-

-		/*

-		 * Remove Null Characters

-		 *

-		 * This prevents sandwiching null characters

-		 * between ascii characters, like Java\0script.

-		 *

-		 */

-		$str = preg_replace(array('/\0+/', '/(\\\\0)+/'), '', $str);

+		$str = $this->_remove_invisible_characters($str);

 

 		/*

 		 * Protect GET variables in URLs

@@ -601,6 +588,11 @@
 		$str = preg_replace_callback("/<\w+.*?(?=>|<|$)/si", array($this, '_html_entity_decode_callback'), $str);

 

 		/*

+		 * Remove Invisible Characters Again!

+		 */

+		$str = $this->_remove_invisible_characters($str);

+		

+		/*

 		 * Convert all tabs to spaces

 		 *

 		 * This prevents strings like this: ja	vascript

@@ -613,22 +605,7 @@
  		if (strpos($str, "\t") !== FALSE)

 		{

 			$str = str_replace("\t", ' ', $str);

-		}		

-

-		/*

-		 * Check and set converted string

-		 */

-		if ($looped_convert != '' && $looped_convert == $str)

-		{

-			// if we are in a loop, and the converted string is the same as the last pass,

-			// then this is going to repeat until we hit the runaway loop prevention,

-			// so we might as well stop now.

-			return '';

 		}

-		else

-		{

-			$converted_string = $str;			

-		}		

 		

 		/*

 		 * Not Allowed Under Any Conditions

@@ -677,7 +654,7 @@
 			// That way valid stuff like "dealer to" does not become "dealerto"

 			$str = preg_replace_callback('#('.substr($temp, 0, -3).')(\W)#is', array($this, '_compact_exploded_words'), $str);

 		}

-

+		

 		/*

 		 * Remove disallowed Javascript in links or img tags

 		 * We used to do some version comparisons and use of stripos for PHP5, but it is dog slow compared

@@ -689,17 +666,17 @@
 	

 			if (preg_match("/<a/i", $str))

 			{

-				$str = preg_replace_callback("#<a.*(>|<|$)#si", array($this, '_js_link_removal'), $str);

+				$str = preg_replace_callback("#<a\s*([^>]*?)(>|$)#si", array($this, '_js_link_removal'), $str);

 			}

 	

 			if (preg_match("/<img/i", $str))

 			{

-				$str = preg_replace_callback("#<img.*(>|<|$)#si", array($this, '_js_img_removal'), $str);

+				$str = preg_replace_callback("#<img\s*([^>]*?)(>|$)#si", array($this, '_js_img_removal'), $str);

 			}

 	

 			if (preg_match("/script/i", $str) OR preg_match("/xss/i", $str))

 			{

-				$str = preg_replace("#</*(script|xss).*?\>#si", "", $str);

+				$str = preg_replace("#<(/*)(script|xss)(.*?)\>#si", '[removed]', $str);

 			}

 		}

 		while($original != $str);

@@ -726,6 +703,7 @@
 		}

 		

 		$str = preg_replace("#<([^><]+)(".implode('|', $event_handlers).")(\s*=\s*[^><]*)([><]*)#i", "<\\1\\4", $str);

+		

 		/*

 		 * Sanitize naughty HTML elements

 		 *

@@ -790,16 +768,6 @@
 				return FALSE;

 			}

 		}

-

-		/*

-		 * If something changed after character conversion, we can be fairly confident that something

-		 * malicious was removed, so let's take no chances that the attacker is counting on specific

-		 * mutations taking place to allow a new attack to reveal itself.  So say we all.

-		 */

-		if ($converted_string != $str)

-		{

-			$str = $this->xss_clean($str, $is_image, ++$loops, $converted_string);

-		}

 		

 		log_message('debug', "XSS Filtering completed");

 		return $str;

@@ -831,6 +799,44 @@
 	// --------------------------------------------------------------------

 	

 	/**

+	 * Remove Invisible Characters

+	 *

+	 * This prevents sandwiching null characters

+	 * between ascii characters, like Java\0script.

+	 *

+	 * @access	public

+	 * @param	string

+	 * @return	string

+	 */

+	function _remove_invisible_characters($str)

+	{

+		static $non_displayables;

+		

+		if ( ! isset($non_displayables))

+		{

+			// every control character except newline (10), carriage return (13), and horizontal tab (09),

+			// both as a URL encoded character (::shakes fist at IE and WebKit::), and the actual character

+			$non_displayables = array(

+										'/%0[0-8]/', '/[\x00-\x08]/',			// 00-08

+										'/%11/', '/\x0b/', '/%12/', '/\x0c/',	// 11, 12

+										'/%1[4-9]/', '/%2[0-9]/', '/%3[0-1]/',	// url encoded 14-31

+										'/[\x0e-\x1f]/');						// 14-31

+			

+		}

+

+		do

+		{

+			$cleaned = $str;

+			$str = preg_replace($non_displayables, '', $str);

+		}

+		while ($cleaned != $str);

+

+		return $str;

+	}

+

+	// --------------------------------------------------------------------

+	

+	/**

 	 * Compact Exploded Words

 	 *

 	 * Callback function for xss_clean() to remove whitespace from

@@ -883,7 +889,8 @@
 	 */

 	function _js_link_removal($match)

 	{

-		return preg_replace("#href=.*?(alert\(|alert&\#40;|javascript\:|charset\=|window\.|document\.|\.cookie|<script|<xss|base64\s*,)#si", "", $match[0]);

+		$attributes = $this->_filter_attributes(str_replace(array('<', '>'), '', $match[1]));

+		return str_replace($match[1], preg_replace("#href=.*?(alert\(|alert&\#40;|javascript\:|charset\=|window\.|document\.|\.cookie|<script|<xss|base64\s*,)#si", "", $attributes), $match[0]);

 	}

 

 	/**

@@ -900,7 +907,8 @@
 	 */

 	function _js_img_removal($match)

 	{

-		return preg_replace("#src=.*?(alert\(|alert&\#40;|javascript\:|charset\=|window\.|document\.|\.cookie|<script|<xss|base64\s*,)#si", "", $match[0]);

+		$attributes = $this->_filter_attributes(str_replace(array('<', '>'), '', $match[1]));

+		return str_replace($match[1], preg_replace("#src=.*?(alert\(|alert&\#40;|javascript\:|charset\=|window\.|document\.|\.cookie|<script|<xss|base64\s*,)#si", "", $attributes), $match[0]);

 	}

 

 	// --------------------------------------------------------------------

@@ -994,6 +1002,34 @@
 		return $str;

 	}

 

+	// --------------------------------------------------------------------

+	

+	/**

+	 * Filter Attributes

+	 *

+	 * Filters tag attributes for consistency and safety

+	 *

+	 * @access	public

+	 * @param	string

+	 * @return	string

+	 */

+	function _filter_attributes($str)

+	{

+		$out = '';

+

+		if (preg_match_all('#\s*[a-z\-]+\s*=\s*(\042|\047)([^\\1]*?)\\1#is', $str, $matches))

+		{

+			foreach ($matches[0] as $match)

+			{

+				$out .= "{$match}";

+			}			

+		}

+

+		return $out;

+	}

+

+	// --------------------------------------------------------------------

+

 }

 // END Input class