Andrey Andreev | c5536aa | 2012-11-01 17:33:58 +0200 | [diff] [blame] | 1 | <?php |
Derek Jones | 98badc1 | 2010-03-02 13:08:02 -0600 | [diff] [blame] | 2 | /** |
| 3 | * CodeIgniter |
| 4 | * |
Andrey Andreev | fe9309d | 2015-01-09 17:48:58 +0200 | [diff] [blame] | 5 | * An open source application development framework for PHP |
Derek Jones | 98badc1 | 2010-03-02 13:08:02 -0600 | [diff] [blame] | 6 | * |
Andrey Andreev | bdb96ca | 2014-10-28 00:13:31 +0200 | [diff] [blame] | 7 | * This content is released under the MIT License (MIT) |
Andrey Andreev | c123e11 | 2012-01-08 00:17:34 +0200 | [diff] [blame] | 8 | * |
Andrey Andreev | 125ef47 | 2016-01-11 12:33:00 +0200 | [diff] [blame] | 9 | * Copyright (c) 2014 - 2016, British Columbia Institute of Technology |
Andrey Andreev | c123e11 | 2012-01-08 00:17:34 +0200 | [diff] [blame] | 10 | * |
Andrey Andreev | bdb96ca | 2014-10-28 00:13:31 +0200 | [diff] [blame] | 11 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 12 | * of this software and associated documentation files (the "Software"), to deal |
| 13 | * in the Software without restriction, including without limitation the rights |
| 14 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| 15 | * copies of the Software, and to permit persons to whom the Software is |
| 16 | * furnished to do so, subject to the following conditions: |
Derek Jones | f4a4bd8 | 2011-10-20 12:18:42 -0500 | [diff] [blame] | 17 | * |
Andrey Andreev | bdb96ca | 2014-10-28 00:13:31 +0200 | [diff] [blame] | 18 | * The above copyright notice and this permission notice shall be included in |
| 19 | * all copies or substantial portions of the Software. |
| 20 | * |
| 21 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 22 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 23 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 24 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 25 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 26 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
| 27 | * THE SOFTWARE. |
| 28 | * |
| 29 | * @package CodeIgniter |
| 30 | * @author EllisLab Dev Team |
Andrey Andreev | 1924e87 | 2016-01-11 12:55:34 +0200 | [diff] [blame^] | 31 | * @copyright Copyright (c) 2008 - 2014, EllisLab, Inc. (https://ellislab.com/) |
Andrey Andreev | 125ef47 | 2016-01-11 12:33:00 +0200 | [diff] [blame] | 32 | * @copyright Copyright (c) 2014 - 2016, British Columbia Institute of Technology (http://bcit.ca/) |
Andrey Andreev | bdb96ca | 2014-10-28 00:13:31 +0200 | [diff] [blame] | 33 | * @license http://opensource.org/licenses/MIT MIT License |
Andrey Andreev | bd202c9 | 2016-01-11 12:50:18 +0200 | [diff] [blame] | 34 | * @link https://codeigniter.com |
Andrey Andreev | bdb96ca | 2014-10-28 00:13:31 +0200 | [diff] [blame] | 35 | * @since Version 2.0.0 |
Derek Jones | 98badc1 | 2010-03-02 13:08:02 -0600 | [diff] [blame] | 36 | * @filesource |
| 37 | */ |
// Refuse to run when this file is requested directly instead of being
// loaded by the framework (BASEPATH is expected to be defined by the
// front controller before any core file is included).
if ( ! defined('BASEPATH'))
{
	exit('No direct script access allowed');
}
Derek Jones | 98badc1 | 2010-03-02 13:08:02 -0600 | [diff] [blame] | 39 | |
/**
 * Utf8 Class
 *
 * Provides support for UTF-8 environments
 *
 * @package		CodeIgniter
 * @subpackage	Libraries
 * @category	UTF-8
 * @author		EllisLab Dev Team
 * @link		https://codeigniter.com/user_guide/libraries/utf8.html
 */
class CI_Utf8 {

	/**
	 * Class constructor
	 *
	 * Determines if UTF-8 support is to be enabled and publishes the
	 * result through the UTF8_ENABLED constant.
	 *
	 * The constant is only defined when it does not exist yet, so that
	 * creating a second instance (e.g. in tests) does not trigger PHP's
	 * "constant already defined" notice/warning.
	 *
	 * @return	void
	 */
	public function __construct()
	{
		if ( ! defined('UTF8_ENABLED'))
		{
			define(
				'UTF8_ENABLED',
				// PCRE must support UTF-8
				defined('PREG_BAD_UTF8_ERROR')
				// iconv or mbstring must be installed
				&& (ICONV_ENABLED === TRUE OR MB_ENABLED === TRUE)
				// Application charset must be UTF-8; the cast keeps strtoupper()
				// from being handed a non-string if the config item is missing
				&& strtoupper((string) config_item('charset')) === 'UTF-8'
			);
		}

		log_message('debug', UTF8_ENABLED ? 'UTF-8 Support Enabled' : 'UTF-8 Support Disabled');
		log_message('info', 'Utf8 Class Initialized');
	}

	// --------------------------------------------------------------------

	/**
	 * Clean UTF-8 strings
	 *
	 * Ensures strings contain only valid UTF-8 characters.
	 *
	 * Strings that are pure 7-bit ASCII are returned untouched. Anything
	 * else is re-encoded from UTF-8 to UTF-8 with mbstring (preferred) or
	 * iconv, whichever is enabled. If neither is enabled, the input is
	 * returned unchanged.
	 *
	 * @param	string	$str	String to clean
	 * @return	string|false	Cleaned string; the iconv fallback yields
	 *				FALSE when iconv() itself fails
	 */
	public function clean_string($str)
	{
		if ($this->is_ascii($str) === FALSE)
		{
			if (MB_ENABLED)
			{
				$str = mb_convert_encoding($str, 'UTF-8', 'UTF-8');
			}
			elseif (ICONV_ENABLED)
			{
				// //IGNORE asks iconv to discard characters it cannot
				// represent; errors are silenced, so a failure only shows
				// up as a FALSE return value
				$str = @iconv('UTF-8', 'UTF-8//IGNORE', $str);
			}
		}

		return $str;
	}

	// --------------------------------------------------------------------

	/**
	 * Remove ASCII control characters
	 *
	 * Removes all ASCII control characters except horizontal tabs,
	 * line feeds, and carriage returns, as all others can cause
	 * problems in XML.
	 *
	 * The work is delegated to remove_invisible_characters(), called
	 * with its second argument set to FALSE.
	 *
	 * @param	string	$str	String to clean
	 * @return	string
	 */
	public function safe_ascii_for_xml($str)
	{
		return remove_invisible_characters($str, FALSE);
	}

	// --------------------------------------------------------------------

	/**
	 * Convert to UTF-8
	 *
	 * Attempts to convert a string to UTF-8, using mbstring when it is
	 * enabled and iconv otherwise.
	 *
	 * @param	string	$str		Input string
	 * @param	string	$encoding	Input encoding
	 * @return	string|false	$str encoded in UTF-8, or FALSE on failure
	 *				(including when neither extension is enabled)
	 */
	public function convert_to_utf8($str, $encoding)
	{
		if (MB_ENABLED)
		{
			return mb_convert_encoding($str, 'UTF-8', $encoding);
		}
		elseif (ICONV_ENABLED)
		{
			// Errors are silenced; iconv() reports failure by returning FALSE
			return @iconv($encoding, 'UTF-8', $str);
		}

		return FALSE;
	}

	// --------------------------------------------------------------------

	/**
	 * Is ASCII?
	 *
	 * Tests if a string is standard 7-bit ASCII or not.
	 *
	 * @param	string	$str	String to check
	 * @return	bool	TRUE when no byte outside 0x00-0x7F is found
	 */
	public function is_ascii($str)
	{
		// preg_match() returns 0 only when no non-ASCII byte matched;
		// a PCRE error (FALSE) is therefore reported as "not ASCII"
		return (preg_match('/[^\x00-\x7F]/S', $str) === 0);
	}

}