<?php
/**
 * CodeIgniter
 *
 * An open source application development framework for PHP 5.2.4 or newer
 *
 * NOTICE OF LICENSE
 *
 * Licensed under the Open Software License version 3.0
 *
 * This source file is subject to the Open Software License (OSL 3.0) that is
 * bundled with this package in the files license.txt / license.rst. It is
 * also available through the world wide web at this URL:
 * http://opensource.org/licenses/OSL-3.0
 * If you did not receive a copy of the license and are unable to obtain it
 * through the world wide web, please send an email to
 * licensing@ellislab.com so we can send you a copy immediately.
 *
 * @package	CodeIgniter
 * @author	EllisLab Dev Team
 * @copyright	Copyright (c) 2008 - 2014, EllisLab, Inc. (http://ellislab.com/)
 * @license	http://opensource.org/licenses/OSL-3.0 Open Software License (OSL 3.0)
 * @link	http://codeigniter.com
 * @since	Version 2.0
 * @filesource
 */
// Refuse to run when the file is requested directly instead of being
// loaded by code that has already defined BASEPATH.
if ( ! defined('BASEPATH'))
{
	exit('No direct script access allowed');
}

/**
 * Utf8 Class
 *
 * Provides support for UTF-8 environments: detects which multibyte
 * extensions are loaded, publishes that as constants, and offers helpers
 * for cleaning strings and converting them to UTF-8.
 *
 * @package	CodeIgniter
 * @subpackage	Libraries
 * @category	UTF-8
 * @author	EllisLab Dev Team
 * @link	http://codeigniter.com/user_guide/libraries/utf8.html
 */
class CI_Utf8 {

	/**
	 * Class constructor
	 *
	 * Determines if UTF-8 support is to be enabled and defines the
	 * MB_ENABLED, ICONV_ENABLED and UTF8_ENABLED constants. Constants that
	 * already exist are left untouched, so creating a second instance does
	 * not raise "constant already defined" errors.
	 *
	 * @return	void
	 */
	public function __construct()
	{
		log_message('debug', 'Utf8 Class Initialized');

		$charset = strtoupper(config_item('charset'));

		// Set the internal encoding for multibyte string functions if
		// necessary and set a flag so we don't have to repeatedly use
		// extension_loaded() or function_exists()
		if (extension_loaded('mbstring'))
		{
			defined('MB_ENABLED') OR define('MB_ENABLED', TRUE);
			mb_internal_encoding($charset);
			// This is required for mb_convert_encoding() to strip invalid characters
			ini_set('mbstring.substitute_character', 'none');
		}
		else
		{
			defined('MB_ENABLED') OR define('MB_ENABLED', FALSE);
		}

		// Do the same for iconv, which actually has more easy to remember
		// predefined constants (such as ICONV_IMPL), but the iconv PHP
		// manual page says that using them is "strongly discouraged".
		if (extension_loaded('iconv'))
		{
			defined('ICONV_ENABLED') OR define('ICONV_ENABLED', TRUE);
			// Same effect as iconv_set_encoding('internal_encoding', ...),
			// but that function emits E_DEPRECATED on PHP 5.6+. The ini
			// directive is deprecated there as well, hence the suppression.
			@ini_set('iconv.internal_encoding', $charset);
		}
		else
		{
			defined('ICONV_ENABLED') OR define('ICONV_ENABLED', FALSE);
		}

		if ( ! defined('UTF8_ENABLED'))
		{
			define(
				'UTF8_ENABLED',
				defined('PREG_BAD_UTF8_ERROR')				// PCRE must support UTF-8
				&& (ICONV_ENABLED === TRUE OR MB_ENABLED === TRUE)	// iconv or mbstring must be installed
				&& $charset === 'UTF-8'					// Application charset must be UTF-8
			);
		}

		log_message('debug', UTF8_ENABLED ? 'UTF-8 Support Enabled' : 'UTF-8 Support Disabled');
	}

	// --------------------------------------------------------------------

	/**
	 * Clean UTF-8 strings
	 *
	 * Ensures strings contain only valid UTF-8 characters. Pure ASCII
	 * input is returned as-is. Otherwise mbstring is used when available;
	 * iconv is only the fallback because iconv() can return FALSE instead
	 * of a cleaned string. When neither extension is loaded the input is
	 * returned unchanged.
	 *
	 * @uses	CI_Utf8::_is_ascii()	Decide whether a conversion is needed
	 *
	 * @param	string	$str	String to clean
	 * @return	string	Cleaned string; an empty string if iconv was the
	 *			only available converter and it failed
	 */
	public function clean_string($str)
	{
		if ($this->_is_ascii($str) === TRUE)
		{
			return $str;
		}

		if (MB_ENABLED)
		{
			// Invalid sequences are dropped because the constructor sets
			// mbstring.substitute_character to 'none'.
			return mb_convert_encoding($str, 'UTF-8', 'UTF-8');
		}

		if (ICONV_ENABLED)
		{
			$cleaned = @iconv('UTF-8', 'UTF-8//IGNORE', $str);

			// Keep the documented string return type: never hand FALSE
			// (or the unvalidated input) back to the caller.
			return ($cleaned === FALSE) ? '' : $cleaned;
		}

		return $str;
	}

	// --------------------------------------------------------------------

	/**
	 * Remove ASCII control characters
	 *
	 * Removes all ASCII control characters except horizontal tabs,
	 * line feeds, and carriage returns, as all others can cause
	 * problems in XML. The work is delegated to
	 * remove_invisible_characters(), called with FALSE as second argument.
	 *
	 * @param	string	$str	String to clean
	 * @return	string
	 */
	public function safe_ascii_for_xml($str)
	{
		return remove_invisible_characters($str, FALSE);
	}

	// --------------------------------------------------------------------

	/**
	 * Convert to UTF-8
	 *
	 * Attempts to convert a string to UTF-8, using iconv when it is
	 * loaded and mbstring otherwise.
	 *
	 * @param	string	$str		Input string
	 * @param	string	$encoding	Input encoding
	 * @return	string	$str encoded in UTF-8 or FALSE on failure
	 *			(including when neither extension is available)
	 */
	public function convert_to_utf8($str, $encoding)
	{
		if (ICONV_ENABLED)
		{
			return @iconv($encoding, 'UTF-8', $str);
		}
		elseif (MB_ENABLED === TRUE)
		{
			return @mb_convert_encoding($str, 'UTF-8', $encoding);
		}

		return FALSE;
	}

	// --------------------------------------------------------------------

	/**
	 * Is ASCII?
	 *
	 * Tests if a string is standard 7-bit ASCII or not.
	 *
	 * @param	string	$str	String to check
	 * @return	bool	TRUE when no byte outside \x00-\x7F is present
	 */
	protected function _is_ascii($str)
	{
		return (preg_match('/[^\x00-\x7F]/S', $str) === 0);
	}

}

/* End of file Utf8.php */
/* Location: ./system/core/Utf8.php */