| [ Index ] |
|
Code source de Typo3 4.1.3 |
1 <?php 2 /*************************************************************** 3 * Copyright notice 4 * 5 * (c) 2003-2007 Kasper Skaarhoj (kasperYYYY@typo3.com) 6 * All rights reserved 7 * 8 * This script is part of the Typo3 project. The Typo3 project is 9 * free software; you can redistribute it and/or modify 10 * it under the terms of the GNU General Public License as published by 11 * the Free Software Foundation; either version 2 of the License, or 12 * (at your option) any later version. 13 * 14 * The GNU General Public License can be found at 15 * http://www.gnu.org/copyleft/gpl.html. 16 * 17 * This script is distributed in the hope that it will be useful, 18 * but WITHOUT ANY WARRANTY; without even the implied warranty of 19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 * GNU General Public License for more details. 21 * 22 * This copyright notice MUST APPEAR in all copies of the script! 23 ***************************************************************/ 24 /** 25 * Class for conversion between charsets. 
26 * 27 * $Id: class.t3lib_cs.php 2531 2007-10-06 19:26:53Z masi $ 28 * 29 * @author Kasper Skaarhoj <kasperYYYY@typo3.com> 30 * @author Martin Kutschker <martin.t.kutschker@blackbox.net> 31 */ 32 /** 33 * [CLASS/FUNCTION INDEX of SCRIPT] 34 * 35 * 36 * 37 * 136: class t3lib_cs 38 * 488: function parse_charset($charset) 39 * 507: function get_locale_charset($locale) 40 * 41 * SECTION: Charset Conversion functions 42 * 560: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) 43 * 600: function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0) 44 * 617: function utf8_encode($str,$charset) 45 * 663: function utf8_decode($str,$charset,$useEntityForNoChar=0) 46 * 706: function utf8_to_entities($str) 47 * 739: function entities_to_utf8($str,$alsoStdHtmlEnt=0) 48 * 773: function utf8_to_numberarray($str,$convEntities=0,$retChar=0) 49 * 823: function UnumberToChar($cbyte) 50 * 868: function utf8CharToUnumber($str,$hex=0) 51 * 52 * SECTION: Init functions 53 * 911: function initCharset($charset) 54 * 973: function initUnicodeData($mode=null) 55 * 1198: function initCaseFolding($charset) 56 * 1260: function initToASCII($charset) 57 * 58 * SECTION: String operation functions 59 * 1331: function substr($charset,$string,$start,$len=null) 60 * 1384: function strlen($charset,$string) 61 * 1414: function crop($charset,$string,$len,$crop='') 62 * 1467: function strtrunc($charset,$string,$len) 63 * 1501: function conv_case($charset,$string,$case) 64 * 1527: function specCharsToASCII($charset,$string) 65 * 66 * SECTION: Internal string operation functions 67 * 1567: function sb_char_mapping($str,$charset,$mode,$opt='') 68 * 69 * SECTION: Internal UTF-8 string operation functions 70 * 1622: function utf8_substr($str,$start,$len=null) 71 * 1655: function utf8_strlen($str) 72 * 1676: function utf8_strtrunc($str,$len) 73 * 1698: function utf8_strpos($haystack,$needle,$offset=0) 74 * 1723: function utf8_strrpos($haystack,$needle) 75 * 1745: function utf8_char2byte_pos($str,$pos) 
76 * 1786: function utf8_byte2char_pos($str,$pos) 77 * 1809: function utf8_char_mapping($str,$mode,$opt='') 78 * 79 * SECTION: Internal EUC string operation functions 80 * 1885: function euc_strtrunc($str,$len,$charset) 81 * 1914: function euc_substr($str,$start,$charset,$len=null) 82 * 1939: function euc_strlen($str,$charset) 83 * 1966: function euc_char2byte_pos($str,$pos,$charset) 84 * 2007: function euc_char_mapping($str,$charset,$mode,$opt='') 85 * 86 * TOTAL FUNCTIONS: 35 87 * (This index is automatically created/updated by the extension "extdeveval") 88 * 89 */ 90 91 92 93 94 95 96 97 98 /** 99 * Notes on UTF-8 100 * 101 * Functions working on UTF-8 strings: 102 * 103 * - strchr/strstr 104 * - strrchr 105 * - substr_count 106 * - implode/explode/join 107 * 108 * Functions nearly working on UTF-8 strings: 109 * 110 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen 111 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII 112 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos 113 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0 114 * 115 * Functions NOT working on UTF-8 strings: 116 * 117 * - str*cmp 118 * - stristr 119 * - stripos 120 * - substr 121 * - strrev 122 * - ereg/eregi 123 * - split/spliti 124 * - preg_* 125 * - ... 126 * 127 */ 128 /** 129 * Class for conversion between charsets 130 * 131 * @author Kasper Skaarhoj <kasperYYYY@typo3.com> 132 * @author Martin Kutschker <martin.t.kutschker@blackbox.net> 133 * @package TYPO3 134 * @subpackage t3lib 135 */ 136 class t3lib_cs { 137 var $noCharByteVal=63; // ASCII Value for chars with no equivalent. 
// This is the array where parsed conversion tables are stored (cached)
var $parsedCharsets=array();

// An array where case folding data will be stored (cached)
var $caseFolding=array();

// An array where charset-to-ASCII mappings are stored (cached)
var $toASCII=array();

// This tells the converter which charsets have two bytes per char:
var $twoByteSets=array(
	'ucs-2'=>1,	// 2-byte Unicode
);

// This tells the converter which charsets have four bytes per char:
var $fourByteSets=array(
	'ucs-4'=>1,	// 4-byte Unicode
	'utf-32'=>1,	// 4-byte Unicode (limited to the 21-bits of UTF-16)
);

// This tells the converter which charsets use a scheme like the Extended Unix Code:
var $eucBasedSets=array(
	'gb2312'=>1,		// Chinese, simplified.
	'big5'=>1,		// Chinese, traditional.
	'euc-kr'=>1,		// Korean
	'shift_jis'=>1,		// Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
);

// Alternative charset names mapped to the name used internally (looked up by parse_charset()).
// see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
// http://czyborra.com/charsets/iso8859.html
var $synonyms=array(
	'us' => 'ascii',
	'us-ascii'=> 'ascii',
	'cp819' => 'iso-8859-1',
	'ibm819' => 'iso-8859-1',
	'iso-ir-100' => 'iso-8859-1',
	'iso-ir-109' => 'iso-8859-2',
	'iso-ir-148' => 'iso-8859-9',
	'iso-ir-199' => 'iso-8859-14',
	'iso-ir-203' => 'iso-8859-15',
	'csisolatin1' => 'iso-8859-1',
	'csisolatin2' => 'iso-8859-2',
	'csisolatin3' => 'iso-8859-3',
	'csisolatin5' => 'iso-8859-9',
	'csisolatin8' => 'iso-8859-14',
	'csisolatin9' => 'iso-8859-15',
	'csisolatingreek' => 'iso-8859-7',
	'iso-celtic' => 'iso-8859-14',
	'latin1' => 'iso-8859-1',
	'latin2' => 'iso-8859-2',
	'latin3' => 'iso-8859-3',
	'latin5' => 'iso-8859-9',
	'latin6' => 'iso-8859-10',
	'latin8' => 'iso-8859-14',
	'latin9' => 'iso-8859-15',
	'l1' => 'iso-8859-1',
	'l2' => 'iso-8859-2',
	'l3' => 'iso-8859-3',
	'l5' => 'iso-8859-9',
	'l6' => 'iso-8859-10',
	'l8' => 'iso-8859-14',
	'l9' => 'iso-8859-15',
	'cyrillic' => 'iso-8859-5',
	'arabic' => 'iso-8859-6',
	'tis-620' => 'iso-8859-11',
	'win874' => 'windows-874',
	'win1250' => 'windows-1250',
	'win1251' => 'windows-1251',
	'win1252' => 'windows-1252',
	'win1253' => 'windows-1253',
	'win1254' => 'windows-1254',
	'win1255' => 'windows-1255',
	'win1256' => 'windows-1256',
	'win1257' => 'windows-1257',
	'win1258' => 'windows-1258',
	'cp1250' => 'windows-1250',
	'cp1251' => 'windows-1251',
	'cp1252' => 'windows-1252',
	'ms-ee' => 'windows-1250',
	'ms-ansi' => 'windows-1252',
	'ms-greek' => 'windows-1253',
	'ms-turk' => 'windows-1254',
	'winbaltrim' => 'windows-1257',
	'koi-8ru' => 'koi-8r',
	'koi8r' => 'koi-8r',
	'cp878' => 'koi-8r',
	'mac' => 'macroman',
	'macintosh' => 'macroman',
	'euc-cn' => 'gb2312',
	'x-euc-cn' => 'gb2312',
	'euccn' => 'gb2312',
	'cp936' => 'gb2312',
	'big-5' => 'big5',
	'cp950' => 'big5',
	'eucjp' => 'euc-jp',
	'sjis' => 'shift_jis',
	'shift-jis' => 'shift_jis',
	'cp932' => 'shift_jis',
	'cp949' => 'euc-kr',
	'utf7' => 'utf-7',
	'utf8' => 'utf-8',
	'utf16' => 'utf-16',
	'utf32' => 'utf-32',
	'ucs2' => 'ucs-2',
	'ucs4' => 'ucs-4',
);

// mapping of iso-639:2 language codes to script names
var $lang_to_script=array(
		// iso-639:2 language codes, see:
		// http://www.w3.org/WAI/ER/IG/ert/iso639.htm
		// http://www.loc.gov/standards/iso639-2/langcodes.html
		// http://www.unicode.org/onlinedat/languages.html
	'ar' => 'arabic',
	'bg' => 'cyrillic',		// Bulgarian
	'bs' => 'east_european',	// Bosnian
	'cs' => 'east_european',	// Czech
	'da' => 'west_european',	// Danish
	'de' => 'west_european',	// German
	'el' => 'greek',		// Greek (ISO 639-1 code; 'gr' below is kept for compatibility)
	'es' => 'west_european',	// Spanish
	'et' => 'estonian',
	'eo' => 'unicode',		// Esperanto
	'eu' => 'west_european',	// Basque
	'fa' => 'arabic',		// Persian
	'fi' => 'west_european',	// Finnish
	'fo' => 'west_european',	// Faroese
	'fr' => 'west_european',	// French
	'gr' => 'greek',
	'he' => 'hebrew',		// Hebrew (since 1998)
	'hi' => 'unicode',		// Hindi
	'hr' => 'east_european',	// Croatian
	'hu' => 'east_european',	// Hungarian
	'iw' => 'hebrew',		// Hebrew (til 1998)
	'is' => 'west_european',	// Icelandic
	'it' => 'west_european',	// Italian
	'ja' => 'japanese',
	'kl' => 'west_european',	// Greenlandic
	'ko' => 'korean',
	'lt' => 'lithuanian',
	'lv' => 'west_european',	// Latvian/Lettish
	'nl' => 'west_european',	// Dutch
	'no' => 'west_european',	// Norwegian
	'pl' => 'east_european',	// Polish
	'pt' => 'west_european',	// Portuguese
	'ro' => 'east_european',	// Romanian
	'ru' => 'cyrillic',		// Russian
	'sk' => 'east_european',	// Slovak
	'sl' => 'east_european',	// Slovenian
	'sr' => 'cyrillic',		// Serbian
	'sv' => 'west_european',	// Swedish
	'th' => 'thai',
	'tr' => 'turkish',		// Turkish (was only reachable through 'trk' / 'turkish' before)
	'uk' => 'cyrillic',		// Ukrainian
	'vi' => 'vietnamese',
	'zh' => 'chinese',
		// MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
		// http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
	'ara' => 'arabic',
	'bgr' => 'cyrillic',		// Bulgarian
	'cat' => 'west_european',	// Catalan
	'chs' => 'simpl_chinese',
	'cht' => 'trad_chinese',
	'csy' => 'east_european',	// Czech
	'dan' => 'west_european',	// Danish
	'deu' => 'west_european',	// German
	'dea' => 'west_european',	// German (Austrian)
	'des' => 'west_european',	// German (Swiss)
	'ena' => 'west_european',	// English (Australian)
	'enc' => 'west_european',	// English (Canadian)
	'eng' => 'west_european',	// English
	'enz' => 'west_european',	// English (New Zealand)
	'enu' => 'west_european',	// English (United States)
	'euq' => 'west_european',	// Basque
	'fos' => 'west_european',	// Faroese
	'far' => 'arabic',		// Persian
	'fin' => 'west_european',	// Finnish
	'fra' => 'west_european',	// French
	'frb' => 'west_european',	// French (Belgian)
	'frc' => 'west_european',	// French (Canadian)
	'frs' => 'west_european',	// French (Swiss)
	'ell' => 'greek',
	'heb' => 'hebrew',
	'hin' => 'unicode',		// Hindi
	'hun' => 'east_european',	// Hungarian
	'isl' => 'west_european',	// Icelandic (value was misspelled 'west_euorpean')
	'ita' => 'west_european',	// Italian
	'its' => 'west_european',	// Italian (Swiss)
	'jpn' => 'japanese',
	'kor' => 'korean',
	'lth' => 'lithuanian',
	'lvi' => 'west_european',	// Latvian/Lettish
	'msl' => 'west_european',	// Malay
	'nlb' => 'west_european',	// Dutch (Belgian)
	'nld' => 'west_european',	// Dutch
	'nor' => 'west_european',	// Norwegian (bokmal)
	'non' => 'west_european',	// Norwegian (nynorsk)
	'plk' => 'east_european',	// Polish
	'ptg' => 'west_european',	// Portuguese
	'ptb' => 'west_european',	// Portuguese (Brazil)
	'rom' => 'east_european',	// Romanian
	'rus' => 'cyrillic',		// Russian
	'slv' => 'east_european',	// Slovenian
	'sky' => 'east_european',	// Slovak
	'srl' => 'east_european',	// Serbian (Latin)
	'srb' => 'cyrillic',		// Serbian (Cyrillic)
	'esp' => 'west_european',	// Spanish (trad. sort)
	'esm' => 'west_european',	// Spanish (Mexican)
	'esn' => 'west_european',	// Spanish (internat. sort)
	'sve' => 'west_european',	// Swedish
	'tha' => 'thai',
	'trk' => 'turkish',
	'ukr' => 'cyrillic',		// Ukrainian
		// English language names
	'arabic' => 'arabic',
	'basque' => 'west_european',
	'bosnian' => 'east_european',
	'bulgarian' => 'cyrillic',	// was 'east_european'; made consistent with 'bg' and 'bgr' above
	'catalan' => 'west_european',
	'croatian' => 'east_european',
	'czech' => 'east_european',
	'danish' => 'west_european',
	'dutch' => 'west_european',
	'english' => 'west_european',
	'esperanto' => 'unicode',
	'estonian' => 'estonian',
	'faroese' => 'west_european',
	'farsi' => 'arabic',
	'finnish' => 'west_european',
	'french' => 'west_european',
	'galician' => 'west_european',
	'german' => 'west_european',
	'greek' => 'greek',
	'greenlandic' => 'west_european',
	'hebrew' => 'hebrew',
	'hindi' => 'unicode',
	'hungarian' => 'east_european',
	'icelandic' => 'west_european',
	'italian' => 'west_european',
	'latvian' => 'west_european',
	'lettish' => 'west_european',
	'lithuanian' => 'lithuanian',
	'malay' => 'west_european',
	'norwegian' => 'west_european',
	'persian' => 'arabic',
	'polish' => 'east_european',
	'portuguese' => 'west_european',
	'russian' => 'cyrillic',
	'romanian' => 'east_european',
	'serbian' => 'cyrillic',
	'slovak' => 'east_european',
	'slovenian' => 'east_european',
	'spanish' => 'west_european',
	'swedish' => 'west_european',
	'svedish' => 'west_european',	// legacy misspelling of 'swedish', kept for backward compatibility
	'thai' => 'thai',
	'that' => 'thai',		// legacy misspelling of 'thai', kept for backward compatibility
	'turkish' => 'turkish',
	'ukrainian' => 'cyrillic',
);

// mapping of language (family) names to charsets on Unix
var $script_to_charset_unix=array(
	'west_european' => 'iso-8859-1',
	'estonian' => 'iso-8859-1',
	'east_european' => 'iso-8859-2',
	'baltic' => 'iso-8859-4',
	'cyrillic' => 'iso-8859-5',
	'arabic' => 'iso-8859-6',
	'greek' => 'iso-8859-7',
	'hebrew' => 'iso-8859-8',
	'turkish' => 'iso-8859-9',
	'thai' => 'iso-8859-11', // = TIS-620
	'lithuanian' => 'iso-8859-13',
	'chinese' => 'gb2312', // = euc-cn
	'japanese' => 'euc-jp',
	'korean' => 'euc-kr',
	'simpl_chinese' => 'gb2312',
	'trad_chinese' => 'big5',
	'vietnamese' => '',
	'unicode' => 'utf-8',
);

// mapping of language (family) names to charsets on Windows
var $script_to_charset_windows=array(
	'east_european' => 'windows-1250',
	'cyrillic' => 'windows-1251',
	'west_european' => 'windows-1252',
	'greek' => 'windows-1253',
	'turkish' => 'windows-1254',
	'hebrew' => 'windows-1255',
	'arabic' => 'windows-1256',
	'baltic' => 'windows-1257',
	'estonian' => 'windows-1257',
	'lithuanian' => 'windows-1257',
	'vietnamese' => 'windows-1258',
	'thai' => 'cp874',
	'korean' => 'cp949',
	'chinese' => 'gb2312',
	'japanese' => 'shift_jis',
	'simpl_chinese' => 'gb2312',
	'trad_chinese' => 'big5',
);

// mapping of locale names to charsets
// Keys must be lowercase: get_locale_charset() lowercases the locale before looking it up here.
var $locale_to_charset=array(
	'japanese.euc' => 'euc-jp',
	'ja_jp.ujis' => 'euc-jp',
	'korean.euc' => 'euc-kr',
	'sr@latn' => 'iso-8859-2',	// was 'sr@Latn', which could never match a lowercased locale
	'zh_cn' => 'gb2312',
	'zh_hk' => 'big5',
	'zh_tw' => 'big5',
);

// TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
// Empty values means "iso-8859-1"
var $charSetArray = array(
	'dk' => '',
	'de' => '',
	'no' => '',
	'it' => '',
	'fr' => '',
	'es' => '',
	'nl' => '',
	'cz' => 'windows-1250',
	'pl' => 'iso-8859-2',
	'si' => 'windows-1250',
	'fi' => '',
	'tr' => 'iso-8859-9',
	'se' => '',
	'pt' => '',
	'ru' => 'windows-1251',
	'ro' => 'iso-8859-2',
	'ch' => 'gb2312',
	'sk' => 'windows-1250',
	'lt' => 'windows-1257',
	'is' => 'utf-8',
	'hr' => 'windows-1250',
	'hu' => 'iso-8859-2',
	'gl' => '',
	'th' => 'iso-8859-11',
	'gr' => 'iso-8859-7',
	'hk' => 'big5',
	'eu' => '',
	'bg' => 'windows-1251',
	'br' => '',
	'et' => 'iso-8859-4',
	'ar' => 'iso-8859-6',
	'he' => 'utf-8',
	'ua' => 'windows-1251',
	'jp' => 'shift_jis',
	'lv' => 'utf-8',
	'vn' => 'utf-8',
	'ca' => 'iso-8859-15',
	'ba' => 'iso-8859-2',
	'kr' => 'euc-kr',
	'eo' => 'utf-8',
	'my' => '',
	'hi' => 'utf-8',
	'fo' => 'utf-8',
	'fa' => 'utf-8',
	'sr' => 'utf-8'
);

// TYPO3 specific: Array with the iso names used for each system language in TYPO3:
// Missing keys means: same as Typo3
var $isoArray = array(
	'ba' => 'bs',
	'br' => 'pt_BR',
	'ch' => 'zh_CN',
	'cz' => 'cs',
	'dk' => 'da',
	'si' => 'sl',
	'se' => 'sv',
	'gl' => 'kl',
	'gr' => 'el',
	'hk' => 'zh_HK',
	'kr' => 'ko',
	'ua' => 'uk',
	'jp' => 'ja',
	'vn' => 'vi',
);

/**
 * Normalizes a charset name: trims it, lowercases it and replaces a known
 * synonym (see $this->synonyms) with the name used internally.
 * Names without a synonym entry are returned trimmed and lowercased.
 *
 * @param	string		Input charset
 * @return	string		Normalized charset
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function parse_charset($charset) {
	$normalized = trim(strtolower($charset));

	return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}

/**
 * Get the charset of a locale.
*
 * Recognized locale string shapes:
 *
 * ln			language
 * ln_CN		language / country
 * ln_CN.cs		language / country / charset
 * ln_CN.cs@mod		language / country / charset / modifier
 *
 * Resolution order:
 * 1. an exact (case-insensitive) match in $this->locale_to_charset
 * 2. an explicit charset part (".cs"), normalized through parse_charset()
 * 3. the "euro" modifier, which yields iso-8859-15
 * 4. the script of the language part, mapped through the Windows or Unix
 *    script table depending on TYPO3_OS; unknown languages and empty
 *    table entries fall back to windows-1252 / iso-8859-1.
 *
 * @param	string		Locale string
 * @return	string		Charset resolved for locale string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function get_locale_charset($locale) {
	$locale = strtolower($locale);

		// exact locale specific charset?
		// The table keys are lowercased as well, otherwise a mixed-case key could never match the lowercased locale.
	$exactMatches = array_change_key_case($this->locale_to_charset, CASE_LOWER);
	if (isset($exactMatches[$locale])) return $exactMatches[$locale];

		// get modifier (explode() may return a single element, so the second one is checked explicitly)
	$parts = explode('@',$locale);
	$locale = $parts[0];
	$modifier = isset($parts[1]) ? $parts[1] : '';

		// locale contains charset: use it
	$parts = explode('.',$locale);
	$locale = $parts[0];
	if (!empty($parts[1])) return $this->parse_charset($parts[1]);

		// modifier is 'euro' (after charset check, because of xx.utf-8@euro)
	if ($modifier == 'euro') return 'iso-8859-15';

		// get language (the country part is not needed for the lookup)
	$parts = explode('_',$locale);
	$language = $parts[0];
	$script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';

	if (TYPO3_OS == 'WIN') {
		$scriptTable = $this->script_to_charset_windows;
		$fallback = 'windows-1252';
	} else {
		$scriptTable = $this->script_to_charset_unix;
		$fallback = 'iso-8859-1';
	}

		// Unknown scripts and empty table values both use the platform fallback
	return !empty($scriptTable[$script]) ? $scriptTable[$script] : $fallback;
}









/********************************************
 *
 * Charset Conversion functions
 *
 ********************************************/

/**
 * Convert from one charset to another charset.
*
 * If both charsets are equal the string is returned untouched. Otherwise the
 * conversion method configured in
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'] ("mbstring",
 * "iconv" or "recode") is tried first, but only when the target is UTF-8 or
 * no entity fallback is requested. If that method is not configured, its
 * function is not available, or it returns FALSE, the conversion is done
 * with the built-in tables by going through UTF-8.
 *
 * @param	string		Input string
 * @param	string		From charset (the current charset of the string)
 * @param	string		To charset (the output charset wanted)
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	string		Converted string
 * @see convArray()
 */
function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
	if ($fromCS==$toCS) return $str;

		// PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
	if ($toCS=='utf-8' || !$useEntityForNoChar) {
			// The setting may be missing entirely; treat that as "no external method"
		$convMethod = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'] : '';

			// Each method is only used if its function exists, so a configured but
			// unloaded extension falls through to the TYPO3 conversion instead of a fatal error.
		switch($convMethod) {
			case 'mbstring':
				if (function_exists('mb_convert_encoding')) {
					$conv_str = mb_convert_encoding($str,$toCS,$fromCS);
					if (false !== $conv_str) return $conv_str; // returns false for unsupported charsets
				}
			break;

			case 'iconv':
				if (function_exists('iconv')) {
					$conv_str = iconv($fromCS,$toCS.'//TRANSLIT',$str);
					if (false !== $conv_str) return $conv_str;
				}
			break;

			case 'recode':
				if (function_exists('recode_string')) {
					$conv_str = recode_string($fromCS.'..'.$toCS,$str);
					if (false !== $conv_str) return $conv_str;
				}
			break;
		}
		// fallback to TYPO3 conversion
	}

	if ($fromCS!='utf-8') $str=$this->utf8_encode($str,$fromCS);
	if ($toCS!='utf-8') $str=$this->utf8_decode($str,$toCS,$useEntityForNoChar);
	return $str;
}

/**
 * Convert all elements in ARRAY from one charset to another charset.
 * NOTICE: Array is passed by reference!
*
 * Nested arrays are converted recursively; every other value is passed to conv().
 *
 * @param	string		Input array, possibly multidimensional
 * @param	string		From charset (the current charset of the string)
 * @param	string		To charset (the output charset wanted)
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	void
 * @see conv()
 */
function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0) {
	foreach($array as $key => $value) {
		if (is_array($array[$key])) {
			$this->convArray($array[$key],$fromCS,$toCS,$useEntityForNoChar);
		} else {
			$array[$key] = $this->conv($array[$key],$fromCS,$toCS,$useEntityForNoChar);
		}
	}
}

/**
 * Converts $str from $charset to UTF-8
 *
 * Nothing is returned (NULL) if the conversion table for $charset cannot be
 * initialized by initCharset().
 *
 * @param	string		String in local charset to convert to UTF-8
 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return	string		Output string, converted to UTF-8
 */
function utf8_encode($str,$charset) {

	if ($charset === 'utf-8') return $str;

		// Charset is case-insensitive.
	if ($this->initCharset($charset)) {	// Parse conv. table if not already...
		$strLen = strlen($str);
		$outStr = '';
		$noChar = chr($this->noCharByteVal);
		$isTwoByteSet = isset($this->twoByteSets[$charset]);
		$isEucBasedSet = isset($this->eucBasedSets[$charset]);

		for ($a=0;$a<$strLen;$a++) {	// Traverse each char in string.
			$chr = substr($str,$a,1);
			$ord = ord($chr);
			if ($isTwoByteSet) {	// If the charset has two bytes per char
					// substr() instead of the removed $str{..} syntax; a missing second byte counts as 0
				$ord2 = ord(substr($str,$a+1,1));
				$ord = $ord<<8 | $ord2;	// assume big endian

				if (isset($this->parsedCharsets[$charset]['local'][$ord])) {	// If the local char-number was found in parsed conv. table then we use that, otherwise the "no char" byte
					$outStr .= $this->parsedCharsets[$charset]['local'][$ord];
				} else $outStr .= $noChar;	// No char exists
				$a++;
			} elseif ($ord>127) {	// Bytes over 127 need a lookup in the conversion table
				if ($isEucBasedSet) {	// EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
					if ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF)) {	// Shift-JIS: chars between 160 and 223 are single byte
						$a++;
						$ord2 = ord(substr($str,$a,1));
						$ord = $ord*256+$ord2;
					}
				}

				if (isset($this->parsedCharsets[$charset]['local'][$ord])) {	// If the local char-number was found in parsed conv. table then we use that, otherwise the "no char" byte
					$outStr .= $this->parsedCharsets[$charset]['local'][$ord];
				} else $outStr .= $noChar;	// No char exists
			} else $outStr .= $chr;	// ... otherwise it's just ASCII 0-127 and one byte. Transparent
		}
		return $outStr;
	}
}

/**
 * Converts $str from UTF-8 to $charset
 *
 * Nothing is returned (NULL) if the conversion table for $charset cannot be
 * initialized by initCharset().
 *
 * @param	string		String in UTF-8 to convert to local charset
 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	string		Output string, converted to local charset
 */
function utf8_decode($str,$charset,$useEntityForNoChar=0) {

		// Charset is case-insensitive.
	if ($this->initCharset($charset)) {	// Parse conv. table if not already...
		$strLen = strlen($str);
		$outStr = '';
		for ($a=0;$a<$strLen;$a++) {	// Traverse each char in UTF-8 string.
			$chr = substr($str,$a,1);
			$ord = ord($chr);
			if ($ord>127) {	// This means multibyte! (first byte!)
				if ($ord & 64) {	// Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.

					$buf = $chr;	// Add first byte
					for ($b=0;$b<8;$b++) {	// for each byte in multibyte string...
						$ord = $ord << 1;	// Shift it left and ...
						if ($ord & 128) {	// ... and with 8th bit - if that is set, then there are still bytes in sequence.
							$a++;	// Increase pointer...
							$buf .= substr($str,$a,1);	// ... and add the next char.
						} else break;
					}

					if (isset($this->parsedCharsets[$charset]['utf8'][$buf])) {	// If the UTF-8 char-sequence is found then...
						$mByte = $this->parsedCharsets[$charset]['utf8'][$buf];	// The local number
						if ($mByte>255) {	// If the local number is greater than 255 we will need to split the byte (16bit word assumed) in two chars.
							$outStr .= chr(($mByte >> 8) & 255).chr($mByte & 255);
						} else $outStr .= chr($mByte);
					} elseif ($useEntityForNoChar) {	// Create num entity:
						$outStr .= '&#'.$this->utf8CharToUnumber($buf,1).';';
					} else $outStr .= chr($this->noCharByteVal);	// No char exists
				} else $outStr .= chr($this->noCharByteVal);	// No char exists (MIDDLE of MB sequence!)
			} else $outStr .= $chr;	// ... otherwise it's just ASCII 0-127 and one byte. Transparent
		}
		return $outStr;
	}
}

/**
 * Converts all chars > 127 to numeric entities.
 *
 * @param	string		Input string
 * @return	string		Output string
 */
function utf8_to_entities($str) {
	$strLen = strlen($str);
	$outStr = '';
	for ($a=0;$a<$strLen;$a++) {	// Traverse each char in UTF-8 string.
		$chr = substr($str,$a,1);
		$ord = ord($chr);
		if ($ord>127) {	// This means multibyte! (first byte!)
			if ($ord & 64) {	// Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
				$buf = $chr;	// Add first byte
				for ($b=0;$b<8;$b++) {	// for each byte in multibyte string...
					$ord = $ord << 1;	// Shift it left and ...
					if ($ord & 128) {	// ... and with 8th bit - if that is set, then there are still bytes in sequence.
						$a++;	// Increase pointer...
						$buf .= substr($str,$a,1);	// ... and add the next char.
					} else break;
				}

				$outStr .= '&#'.$this->utf8CharToUnumber($buf,1).';';
			} else $outStr .= chr($this->noCharByteVal);	// No char exists (MIDDLE of MB sequence!)
		} else $outStr .= $chr;	// ... otherwise it's just ASCII 0-127 and one byte. Transparent
	}

	return $outStr;
}

/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 *
 * Anything of the form "&...;" that is neither a numeric entity nor (when
 * requested) a known named entity is left in the string unchanged.
 *
 * @param	string		Input string, UTF-8
 * @param	boolean		If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 * @return	string		Output string
 */
function entities_to_utf8($str,$alsoStdHtmlEnt=0) {
	$trans_tbl = array();
	if ($alsoStdHtmlEnt) {
			// The table is needed in iso-8859-1 because the values are run through utf8_encode() below.
			// From PHP 5.4 the default encoding of the table is UTF-8, so it is requested explicitly
			// where the third parameter exists (PHP >= 5.3.4).
		if (version_compare(PHP_VERSION,'5.3.4','>=')) {
			$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES,ENT_COMPAT,'ISO-8859-1'));
		} else {
			$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES));
		}
	}

		// Split on entities and keep the entity name: odd indexes hold the names, even indexes the text in between.
	$parts = preg_split('/&([#[:alnum:]]*);/',$str,-1,PREG_SPLIT_DELIM_CAPTURE);
	foreach($parts as $k => $v) {
		if ($k%2) {
			if (substr($v,0,1)=='#') {	// Dec or hex entities:
				if (strtolower(substr($v,1,1))=='x') {	// "&#x..;" and "&#X..;"
					$parts[$k] = $this->UnumberToChar(hexdec(substr($v,2)));
				} else {
					$parts[$k] = $this->UnumberToChar(intval(substr($v,1)));
				}
			} elseif ($alsoStdHtmlEnt && isset($trans_tbl['&'.$v.';'])) {	// Other entities:
				$parts[$k] = $this->utf8_encode($trans_tbl['&'.$v.';'],'iso-8859-1');
			} else {	// No conversion:
				$parts[$k] = '&'.$v.';';
			}
		}
	}

	return implode('',$parts);
}

/**
 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
 *
 * @param	string		Input string, UTF-8
 * @param	boolean		If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * @param	boolean		If set, then instead of integer numbers the real UTF-8 char is returned.
* @return	array		Output array with the char numbers
 */
function utf8_to_numberarray($str,$convEntities=0,$retChar=0) {
		// Entities are turned into real UTF-8 chars first, if requested:
	if ($convEntities) {
		$str = $this->entities_to_utf8($str,1);
	}

	$numbers = array();
	$byteCount = strlen($str);
	$pos = 0;
	while ($pos < $byteCount) {
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code <= 127) {
				// Plain ASCII, one byte
			$numbers[] = $retChar ? chr($code) : $code;
		} elseif (!($code & 64)) {
				// A continuation byte without a lead byte (middle of a sequence): no char exists
			$numbers[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
		} else {
				// Lead byte: every further high bit that is set announces one more byte of the sequence
			$sequence = $byte;
			for ($n=0;$n<8;$n++) {
				$code = $code << 1;
				if (!($code & 128)) break;
				$pos++;
				$sequence .= substr($str,$pos,1);
			}
			$numbers[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
		}
		$pos++;
	}

	return $numbers;
}

/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 * Unit-tested by Kasper
 *
 * The binary representation of the character's integer value is thus simply spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
 *
 *  bytes | bits | representation
 *      1 |    7 | 0vvvvvvv
 *      2 |   11 | 110vvvvv 10vvvvvv
 *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
 *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * Numbers that do not fit into 31 bits yield the "no char" byte ($this->noCharByteVal).
 *
 * @param	integer		UNICODE integer
 * @return	string		UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
function UnumberToChar($cbyte) {
	if ($cbyte < 0x80) {
		return chr($cbyte);
	}
	if ($cbyte < 0x800) {
		return chr(0xC0 | ($cbyte >> 6))
			.chr(0x80 | ($cbyte & 0x3F));
	}
	if ($cbyte < 0x10000) {
		return chr(0xE0 | ($cbyte >> 12))
			.chr(0x80 | (($cbyte >> 6) & 0x3F))
			.chr(0x80 | ($cbyte & 0x3F));
	}
	if ($cbyte < 0x200000) {
		return chr(0xF0 | ($cbyte >> 18))
			.chr(0x80 | (($cbyte >> 12) & 0x3F))
			.chr(0x80 | (($cbyte >> 6) & 0x3F))
			.chr(0x80 | ($cbyte & 0x3F));
	}
	if ($cbyte < 0x4000000) {
		return chr(0xF8 | ($cbyte >> 24))
			.chr(0x80 | (($cbyte >> 18) & 0x3F))
			.chr(0x80 | (($cbyte >> 12) & 0x3F))
			.chr(0x80 | (($cbyte >> 6) & 0x3F))
			.chr(0x80 | ($cbyte & 0x3F));
	}
	if ($cbyte < 0x80000000) {
		return chr(0xFC | ($cbyte >> 30))
			.chr(0x80 | (($cbyte >> 24) & 0x3F))
			.chr(0x80 | (($cbyte >> 18) & 0x3F))
			.chr(0x80 | (($cbyte >> 12) & 0x3F))
			.chr(0x80 | (($cbyte >> 6) & 0x3F))
			.chr(0x80 | ($cbyte & 0x3F));
	}

		// Cannot express a 32-bit character in UTF-8
	return chr($this->noCharByteVal);
}

/**
 * Converts a UTF-8 Multibyte character to a UNICODE number
 * Unit-tested by Kasper
 *
 * If the first byte is not a multibyte lead byte, its byte value is used as the number.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	boolean		If set, then a hex. number is returned (as a string prefixed with "x").
 * @return	integer		UNICODE integer
 * @see UnumberToChar()
 */
function utf8CharToUnumber($str,$hex=0) {
	$lead = ord(substr($str,0,1));	// First char
	$number = $lead;

	if (($lead & 192) == 192) {	// The two highest bits are set: this IS a multi byte string
		$shifted = $lead;
		$payloadBits = '';
		for ($b=0;$b<8;$b++) {	// for each byte in multibyte string...
			$shifted = $shifted << 1;	// Shift it left and ...
			if (!($shifted & 128)) break;	// ... stop as soon as the 8th bit is clear: no more bytes in sequence.
				// Every following byte contributes its lowest six bits
			$payloadBits .= substr('00000000'.decbin(ord(substr($str,$b+1,1))),-6);
		}
			// The lead byte contributes the bits that are left after its length marker
		$payloadBits = substr('00000000'.decbin($lead),-(6-$b)).$payloadBits;

		$number = bindec($payloadBits);
	}

	if ($hex) {
		return 'x'.dechex($number);
	}
	return $number;
}









/********************************************
 *
 * Init functions
 *
 ********************************************/

/**
 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
 * This function is automatically called by the conversion functions
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param	string		The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return	integer		Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
* @access private
	 */
	function initCharset($charset) {
			// Nothing to do if the table for this charset is already in memory:
		if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
			return 1;
		}

			// The charset name is used as a file name, so it must be non-empty and pass the path check:
		$charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';
		if (!$charset || !t3lib_div::validPathStr($charsetConvTableFile) || !@is_file($charsetConvTableFile)) {
			return false;
		}

			// Cache file for charsets:
			// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
		$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl');
		if ($cacheFile && @is_file($cacheFile)) {
				// NOTE(review): unserialize() trusts the content of the cache file - confirm typo3temp/ is not writable by untrusted parties.
			$cached = unserialize(t3lib_div::getUrl($cacheFile));
			if (is_array($cached)) {
				$this->parsedCharsets[$charset] = $cached;
				return 2;
			}
				// Corrupt or unreadable cache: fall through and rebuild it from the conversion table.
		}

			// The "whitespaced" type is on the syntax "0x0A	0x000A	#LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
		$whitespacedSyntax = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
		$table = array('local'=>array(),'utf8'=>array());
		$detectedType = '';

		$lines = t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);
		foreach ($lines as $value) {
				// Comment lines and blanks are ignored:
			if (!trim($value) || substr($value,0,1)=='#') continue;

				// The file type is detected once, on the first real line:
			if (!$detectedType) {
				$detectedType = preg_match($whitespacedSyntax,$value) ? 'whitespaced' : 'ms-token';
			}

			if ($detectedType=='ms-token') {
				$parts = preg_split('/=|:/',$value,3);
				if (count($parts) < 2) continue;	// no "byte = U+xxxx" pair on this line
				$hexbyte = $parts[0];
				$utf8 = $parts[1];
			} else {
				$regA = array();
				if (!preg_match($whitespacedSyntax,$value,$regA)) continue;	// line does not follow the detected syntax
				$hexbyte = $regA[1];
				$utf8 = 'U+'.$regA[2];
			}

				// Only values above 127 are stored (both directions of the mapping):
			$decval = hexdec(trim($hexbyte));
			if ($decval>127) {
				$utf8decval = hexdec(substr(trim($utf8),2));	// strip the "U+" prefix
				$utf8Char = $this->UnumberToChar($utf8decval);
				$table['local'][$decval] = $utf8Char;
				$table['utf8'][$utf8Char] = $decval;
			}
		}
		$this->parsedCharsets[$charset] = $table;

		if ($cacheFile) {
			t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->parsedCharsets[$charset]));
		}
		return 2;
	}

	/**
	 * This function initializes all UTF-8 character data tables.
	 *
	 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
	 *
	 * @param	string		Mode ("case", "ascii", ...)
	 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
* @access private
	 */
	function initUnicodeData($mode=null) {
			// cache files
		$cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
		$cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

			// Only process if the requested table is not yet loaded.
			// A cache file that does not unserialize to an array is ignored and rebuilt below.
			// NOTE(review): unserialize() trusts the cache files - confirm typo3temp/ is not writable by untrusted parties.
		switch($mode) {
			case 'case':
				if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) return 1;

				if ($cacheFileCase && @is_file($cacheFileCase)) {
					$cached = unserialize(t3lib_div::getUrl($cacheFileCase));
					if (is_array($cached)) {
						$this->caseFolding['utf-8'] = $cached;
						return 2;
					}
				}
			break;

			case 'ascii':
				if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) return 1;

				if ($cacheFileASCII && @is_file($cacheFileASCII)) {
					$cached = unserialize(t3lib_div::getUrl($cacheFileASCII));
					if (is_array($cached)) {
						$this->toASCII['utf-8'] = $cached;
						return 2;
					}
				}
			break;
		}

			// process main Unicode data file
		$unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
		if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) return false;

		$fh = fopen($unicodeDataFile,'rb');
		if (!$fh) return false;

			// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
			// note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
		$this->caseFolding['utf-8'] = array();
		$utf8CaseFolding =& $this->caseFolding['utf-8'];	// a shorthand
		$utf8CaseFolding['toUpper'] = array();
		$utf8CaseFolding['toLower'] = array();
		$utf8CaseFolding['toTitle'] = array();

		$decomposition = array();	// array of temp. decompositions
		$mark = array();			// array of chars that are marks (eg. composing accents)
		$number = array();			// array of chars that are numbers (eg. digits)
		$omit = array();			// array of chars to be omitted (eg. Russian hard sign)

		while (($line = fgets($fh,4096)) !== false) {
			$line = rtrim($line);
			if ($line === '') continue;

				// has a lot of info; pad so that short lines never leave a field undefined
			$fields = array_pad(explode(';',$line),15,'');
			$char = $fields[0];
			$name = $fields[1];
			$cat = $fields[2];
			$decomp = $fields[5];
			$num = $fields[8];
			$upper = $fields[12];
			$lower = $fields[13];
			$title = $fields[14];

			$ord = hexdec($char);
			if ($ord > 0xFFFF) break;	// only process the BMP

			$utf8_char = $this->UnumberToChar($ord);

			if ($upper) $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
			if ($lower) $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
				// store "title" only when different from "upper" (only a few)
			if ($title && $title !== $upper) $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));

			switch (substr($cat,0,1)) {
				case 'M':	// mark (accent, umlaut, ...)
					$mark["U+$char"] = 1;
				break;

				case 'N':	// numeric value
					if ($ord > 0x80 && $num != '') $number["U+$char"] = $num;
				break;
			}

				// accented Latin letters without "official" decomposition
			$match = array();
			if (!$decomp && preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/',$name,$match)) {
				$c = ord($match[2]);
				if ($match[1] == 'SMALL') $c += 32;

				$decomposition["U+$char"] = array(dechex($c));
				continue;
			}

			$match = array();
			if (preg_match('/^(<.*>)? *(.+)/',$decomp,$match)) {
				switch($match[1]) {
					case '<circle>':	// add parenthesis as circle replacement, eg (1)
						$match[2] = '0028 '.$match[2].' 0029';
					break;

					case '<square>':	// add square brackets as square replacement, eg [1]
						$match[2] = '005B '.$match[2].' 005D';
					break;

					case '<compat>':	// ignore multi char decompositions that start with a space
						if (strncmp($match[2],'0020 ',5) === 0) continue 2;
					break;

						// ignore Arabic and vertical layout presentation decomposition
					case '<initial>':
					case '<medial>':
					case '<final>':
					case '<isolated>':
					case '<vertical>':
						continue 2;
				}
				$decomposition["U+$char"] = explode(' ',$match[2]);
			}
		}
		fclose($fh);

			// process additional Unicode data for casing (allow folded characters to expand into a sequence)
		$specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
		if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
			$fh = fopen($specialCasingFile,'rb');
			if ($fh) {
				while (($line = fgets($fh,4096)) !== false) {
					if (substr($line,0,1) === '#' || trim($line) == '') continue;

					$fields = array_pad(t3lib_div::trimExplode(';',$line),5,'');
					list($char,$lower,$title,$upper,$cond) = $fields;

						// Only unconditional mappings are used (the 5th field is empty or already the trailing comment)
					if ($cond != '' && substr($cond,0,1) !== '#') continue;

					$utf8_char = $this->UnumberToChar(hexdec($char));
					$targets = array('toLower' => $lower, 'toTitle' => $title, 'toUpper' => $upper);
					foreach ($targets as $tableName => $sequence) {
						if ($sequence === $char) continue;	// maps to itself
						if ($tableName == 'toTitle' && $title === $upper) continue;	// "title" is stored only when different from "upper"

						$converted = '';
						foreach (explode(' ',$sequence) as $hexCode) {
							$converted .= $this->UnumberToChar(hexdec($hexCode));
						}
						$utf8CaseFolding[$tableName][$utf8_char] = $converted;
					}
				}
				fclose($fh);
			}
		}

			// process custom decompositions
		$customTranslitFile = PATH_t3lib.'unidata/Translit.txt';
		if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
			$fh = fopen($customTranslitFile,'rb');
			if ($fh) {
				while (($line = fgets($fh,4096)) !== false) {
					if (substr($line,0,1) === '#' || trim($line) == '') continue;

					$fields = array_pad(t3lib_div::trimExplode(';',$line),2,'');
					$char = $fields[0];
					$translit = $fields[1];

					if (!$translit) $omit["U+$char"] = 1;
					$decomposition["U+$char"] = explode(' ',$translit);
				}
				fclose($fh);
			}
		}

			// decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
		foreach($decomposition as $from => $to) {
			$code_decomp = array();

				// The loop deliberately stops at the first empty element; this is what turns
				// the array('') of an omitted character into an empty decomposition.
			while ($code_value = array_shift($to)) {
				if (isset($decomposition["U+$code_value"])) {	// do recursive decomposition
					foreach(array_reverse($decomposition["U+$code_value"]) as $cv) {
						array_unshift($to, $cv);
					}
				} elseif (!isset($mark["U+$code_value"])) {	// remove mark
					array_push($code_decomp, $code_value);
				}
			}
			if (count($code_decomp) || isset($omit[$from])) {
				$decomposition[$from] = $code_decomp;
			} else {
				unset($decomposition[$from]);
			}
		}

			// create ascii only mapping
		$this->toASCII['utf-8'] = array();
		$ascii =& $this->toASCII['utf-8'];

		foreach($decomposition as $from => $to) {
			$code_decomp = array();
			while ($code_value = array_shift($to)) {
				$ord = hexdec($code_value);
				if ($ord > 127) continue 2;	// skip decompositions containing non-ASCII chars
				array_push($code_decomp,chr($ord));
			}
				// keys look like "U+00C0": strip the prefix before converting
			$ascii[$this->UnumberToChar(hexdec(substr($from,2)))] = join('',$code_decomp);
		}

			// add numeric decompositions
		foreach($number as $from => $to) {
			$utf8_char = $this->UnumberToChar(hexdec(substr($from,2)));
			if (!isset($ascii[$utf8_char])) {
				$ascii[$utf8_char] = $to;
			}
		}

		if ($cacheFileCase) {
			t3lib_div::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
		}

		if ($cacheFileASCII) {
			t3lib_div::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
		}

		return 3;
	}

	/**
	 * This function initializes the folding table for a charset other than UTF-8.
	 * This function is automatically called by the case folding functions.
	 *
	 * The table is derived from the UTF-8 case folding data: every character of the charset is
	 * looked up in the UTF-8 tables and the result is converted back to the charset.
	 *
	 * @param	string		Charset for which to initialize case folding.
	 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
	 * @access private
	 */
	function initCaseFolding($charset) {
			// Only process if the case table is not yet loaded:
		if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) return 1;

			// Use cached version if possible (a cache file that does not hold an array is rebuilt)
		$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
		if ($cacheFile && @is_file($cacheFile)) {
			$cached = unserialize(t3lib_div::getUrl($cacheFile));
			if (is_array($cached)) {
				$this->caseFolding[$charset] = $cached;
				return 2;
			}
		}

			// init UTF-8 conversion for this charset
		if (!$this->initCharset($charset)) {
			return false;
		}

			// UTF-8 case folding is used as the base conversion table
		if (!$this->initUnicodeData('case')) {
			return false;
		}

		$this->caseFolding[$charset] = array('toUpper' => array(), 'toLower' => array(), 'toTitle' => array());
		$nochar = chr($this->noCharByteVal);
		foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
				// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
			$c = $this->utf8_decode($utf8, $charset);

			foreach (array('toUpper','toLower','toTitle') as $tableName) {
				if (!isset($this->caseFolding['utf-8'][$tableName][$utf8])) continue;

					// keep the mapping only if the folded character exists in the charset
				$cc = $this->utf8_decode($this->caseFolding['utf-8'][$tableName][$utf8], $charset);
				if ($cc != '' && $cc != $nochar) $this->caseFolding[$charset][$tableName][$c] = $cc;
			}
		}

			// add the ASCII case table
		for ($i=ord('a'); $i<=ord('z'); $i++) {
			$this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i-32);
		}
		for ($i=ord('A'); $i<=ord('Z'); $i++) {
			$this->caseFolding[$charset]['toLower'][chr($i)] = chr($i+32);
		}

		if ($cacheFile) {
			t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->caseFolding[$charset]));
		}

		return 3;
	}

	/**
	 * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
	 * This function is automatically called by the ASCII transliteration functions.
	 *
	 * @param	string		Charset for which to initialize conversion.
	 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
* @access private
	 */
	function initToASCII($charset) {
			// Only process if the conversion table is not yet loaded:
		if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) return 1;

			// Use cached version if possible (a cache file that does not hold an array is rebuilt)
			// NOTE(review): unserialize() trusts the cache file - confirm typo3temp/ is not writable by untrusted parties.
		$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
		if ($cacheFile && @is_file($cacheFile)) {
			$cached = unserialize(t3lib_div::getUrl($cacheFile));
			if (is_array($cached)) {
				$this->toASCII[$charset] = $cached;
				return 2;
			}
		}

			// init UTF-8 conversion for this charset
		if (!$this->initCharset($charset)) {
			return false;
		}

			// UTF-8/ASCII transliteration is used as the base conversion table
		if (!$this->initUnicodeData('ascii')) {
			return false;
		}

			// Always end up with an array, even if no character of the charset has a transliteration;
			// otherwise the "already loaded" check above would never succeed.
		$this->toASCII[$charset] = array();
		foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
			if (!isset($this->toASCII['utf-8'][$utf8])) continue;

				// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
			$c = $this->utf8_decode($utf8, $charset);
			$this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
		}

		if ($cacheFile) {
			t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->toASCII[$charset]));
		}

		return 3;
	}

	/********************************************
	 *
	 * String operation functions
	 *
	 ********************************************/

	/**
	 * Returns a part of a string.
* Unit-tested by Kasper (single byte charsets only)
	 *
	 * @param	string		The character set
	 * @param	string		Character string
	 * @param	integer		Start position (character position)
	 * @param	integer		Length (in characters)
	 * @return	string		The substring
	 * @see substr(), mb_substr()
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function substr($charset,$string,$start,$len=null) {
			// A zero length always yields an empty string (also when given as the string "0"):
		if ($len !== null && !strcmp($len,'0')) return '';

			// After the zero check any remaining "empty" value (null, '', false) means "up to the end of the string".
		$toEnd = ($len == null);

		if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
				// The length cannot be omitted when a charset is given, so use the full
				// character count instead of temporarily switching the internal encoding.
			if ($toEnd) $len = mb_strlen($string,$charset);
			return mb_substr($string,$start,$len,$charset);
		} elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
				// same approach as for mbstring
			if ($toEnd) $len = iconv_strlen($string,$charset);
			return iconv_substr($string,$start,$len,$charset);
		} elseif ($charset == 'utf-8') {
			return $this->utf8_substr($string,$start,$toEnd ? null : $len);
		} elseif (!empty($this->eucBasedSets[$charset])) {
			return $this->euc_substr($string,$start,$charset,$toEnd ? null : $len);
		}

			// Fixed-width charsets; everything unknown is treated as single-byte encoding
		$bytesPerChar = 1;
		if (!empty($this->twoByteSets[$charset])) {
			$bytesPerChar = 2;
		} elseif (!empty($this->fourByteSets[$charset])) {
			$bytesPerChar = 4;
		}

			// An omitted length must not be multiplied (null*2 is 0 and would return an empty string)
		if ($toEnd) {
			return substr($string,$start*$bytesPerChar);
		}
		return substr($string,$start*$bytesPerChar,$len*$bytesPerChar);
	}

	/**
	 * Counts the number of characters.
	 * Unit-tested by Kasper (single byte charsets only)
	 *
	 * @param	string		The character set
	 * @param	string		Character string
	 * @return	integer		The number of characters (an incomplete trailing character of a two/four byte charset is not counted)
	 * @see strlen()
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function strlen($charset,$string) {
		if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			return mb_strlen($string,$charset);
		} elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
			return iconv_strlen($string,$charset);
		} elseif ($charset == 'utf-8') {
			return $this->utf8_strlen($string);
		} elseif (!empty($this->eucBasedSets[$charset])) {
			return $this->euc_strlen($string,$charset);
		} elseif (!empty($this->twoByteSets[$charset])) {
			return (int)(strlen($string)/2);	// cast: an odd byte count must not produce a float
		} elseif (!empty($this->fourByteSets[$charset])) {
			return (int)(strlen($string)/4);
		}
			// treat everything else as single-byte encoding
		return strlen($string);
	}

	/**
	 * Truncates a string and pre-/appends a string.
* Unit tested by Kasper
	 *
	 * @param	string		The character set
	 * @param	string		Character string
	 * @param	integer		Length (in characters)
	 * @param	string		Crop signifier
	 * @return	string		The shortened string
	 * @see substr(), mb_strimwidth()
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function crop($charset,$string,$len,$crop='') {
		$len = intval($len);
		if ($len == 0) return $string;

		$size = strlen($string);

			// Find the byte position of the cut. FALSE means that $len reaches beyond the string.
		if ($charset == 'utf-8') {
			$i = $this->utf8_char2byte_pos($string,$len);
		} elseif (!empty($this->eucBasedSets[$charset])) {
			$i = $this->euc_char2byte_pos($string,$len,$charset);
		} elseif ($len > 0) {
			$i = $len;
		} else {
			$i = $size+$len;
			if ($i<=0) $i = false;
		}

		if ($i === false) {	// $len outside actual string length
			return $string;
		}

		if ($len > 0) {
				// Keep the first $len characters; only crop if something is actually cut off
			if ($i < $size) {
				return substr($string,0,$i).$crop;
			}
		} elseif ($i >= 1 && $i <= $size) {
				// Keep the last |$len| characters; only crop if something precedes the cut
			return $crop.substr($string,$i);
		}

		return $string;
	}

	/**
	 * Cuts a string short at a given byte length.
*
	 * @param	string		The character set
	 * @param	string		Character string
	 * @param	integer		The byte length
	 * @return	string		The shortened string
	 * @see mb_strcut()
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function strtrunc($charset,$string,$len) {
		if ($len <= 0) return '';

		if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			return mb_strcut($string,0,$len,$charset);
		} elseif ($charset == 'utf-8') {
			return $this->utf8_strtrunc($string,$len);
		} elseif (!empty($this->eucBasedSets[$charset])) {
				// euc_strtrunc() expects (string, byte length, charset)
			return $this->euc_strtrunc($string,$len,$charset);
		} elseif (!empty($this->twoByteSets[$charset])) {
			$len -= $len % 2;	// don't cut at odd positions
		} elseif (!empty($this->fourByteSets[$charset])) {
			$len -= $len % 4;	// realign to position dividable by four
		}
			// treat everything else as single-byte encoding
		return substr($string,0,$len);
	}

	/**
	 * Translates all characters of a string into their respective case values.
	 * Unlike strtolower() and strtoupper() this method is locale independent.
	 * Note that the string length may change!
	 * eg. lower case German ß (sharp S) becomes upper case "SS"
	 * Unit-tested by Kasper
	 * Real case folding is language dependent, this method ignores this fact.
*
	 * @param	string		Character set of string
	 * @param	string		Input string to convert case for
	 * @param	string		Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
	 * @return	string		The converted string
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 * @see strtolower(), strtoupper()
	 */
	function conv_case($charset,$string,$case) {
			// mbstring handles every charset itself
		if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			return ($case == 'toLower')
				? mb_strtolower($string,$charset)
				: mb_strtoupper($string,$charset);
		}

		if ($charset == 'utf-8') {
			return $this->utf8_char_mapping($string,'case',$case);
		}

		if (isset($this->eucBasedSets[$charset])) {
			return $this->euc_char_mapping($string,$charset,'case',$case);
		}

			// treat everything else as single-byte encoding
		return $this->sb_char_mapping($string,$charset,'case',$case);
	}

	/**
	 * Converts special chars (like æøå, umlauts etc) to ascii equivalents (usually double-bytes, like æ => ae etc.)
*
	 * @param	string		Character set of string
	 * @param	string		Input string to convert
	 * @return	string		The converted string
	 */
	function specCharsToASCII($charset,$string) {
		if ($charset == 'utf-8') {
			return $this->utf8_char_mapping($string,'ascii');
		}

		if (isset($this->eucBasedSets[$charset])) {
			return $this->euc_char_mapping($string,$charset,'ascii');
		}

			// treat everything else as single-byte encoding
		return $this->sb_char_mapping($string,$charset,'ascii');
	}

	/********************************************
	 *
	 * Internal string operation functions
	 *
	 ********************************************/

	/**
	 * Maps all characters of a string in a single byte charset.
	 * Bytes without an entry in the selected table are copied unchanged.
	 *
	 * @param	string		the string
	 * @param	string		the charset
	 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
	 * @param	string		'case': conversion 'toLower' or 'toUpper'
	 * @return	string		the converted string (the input itself if the tables cannot be initialized or the mode is unknown)
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function sb_char_mapping($str,$charset,$mode,$opt='') {
		switch($mode) {
			case 'case':
				if (!$this->initCaseFolding($charset)) return $str;	// do nothing
					// read-only lookup: do not create an empty table entry for an unknown $opt
				$map = isset($this->caseFolding[$charset][$opt]) ? $this->caseFolding[$charset][$opt] : array();
			break;

			case 'ascii':
				if (!$this->initToASCII($charset)) return $str;	// do nothing
				$map = isset($this->toASCII[$charset]) ? $this->toASCII[$charset] : array();
			break;

			default:
				return $str;
		}

		if (!is_array($map)) return $str;

		$out = '';
		$size = strlen($str);
		for ($i=0; $i<$size; $i++) {
			$c = $str[$i];
			$out .= isset($map[$c]) ? $map[$c] : $c;
		}

		return $out;
	}

	/********************************************
	 *
	 * Internal UTF-8 string operation functions
*
	 ********************************************/

	/**
	 * Returns a part of a UTF-8 string.
	 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
	 *
	 * @param	string		UTF-8 string
	 * @param	integer		Start position (character position; negative values count from the end)
	 * @param	integer		Length (in characters); if omitted the rest of the string is returned
	 * @return	string		The substring; FALSE if a positive $start lies outside the string
	 * @see substr()
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf8_substr($str,$start,$len=null) {
		if ($len !== null && !strcmp($len,'0')) return '';

		$byte_start = $this->utf8_char2byte_pos($str,$start);
		if ($byte_start === false) {
			if ($start > 0) {
				return false;	// $start outside string length
			}
				// A negative $start reaching the first character (or beyond): begin at the first byte
			$byte_start = 0;
		}

		$str = substr($str,$byte_start);

			// No length given: everything from the start position
		if ($len == null) return $str;

		$byte_end = $this->utf8_char2byte_pos($str,$len);
		if ($byte_end === false) {	// $len outside actual string length
			return $len<0 ? '' : $str;	// When length is less than zero and exceeds, then we return blank string.
		}
		return substr($str,0,$byte_end);
	}

	/**
	 * Counts the number of characters of a string in UTF-8.
	 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
	 *
	 * Every byte that is not a continuation byte (10xxxxxx) starts a character.
	 *
	 * @param	string		UTF-8 multibyte character string
	 * @return	integer		The number of characters
	 * @see strlen()
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf8_strlen($str) {
		$n = 0;
		$size = strlen($str);
		for ($i=0; $i<$size; $i++) {
				// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
			if ((ord($str[$i]) & 0xC0) != 0x80) $n++;
		}
		return $n;
	}

	/**
	 * Truncates a string in UTF-8 short at a given byte length.
*
	 * @param	string		UTF-8 multibyte character string
	 * @param	integer		the byte length
	 * @return	string		the shortened string
	 * @see mb_strcut()
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf8_strtrunc($str,$len) {
		$len = (int)$len;
		if ($len <= 0) return '';
		if ($len >= strlen($str)) return $str;	// nothing to cut

		$i = $len-1;
		if (ord($str[$i]) & 0x80) {	// last kept byte is part of a multibyte sequence
				// walk back over continuation bytes (10xxxxxx) to the byte that starts the sequence
			while ($i > 0 && (ord($str[$i]) & 0xC0) == 0x80) $i--;

			$lead = ord($str[$i]);
			if (($lead & 0xC0) == 0x80) return '';	// only continuation bytes up to the start: nothing valid to keep
			if (!($lead & 0x80)) return substr($str,0,$i+1);	// stray continuation bytes after an ASCII char: drop them

				// the number of leading 1-bits is the length of the sequence in bytes
			for ($bc=0, $mask=0x80; $lead & $mask; $mask = $mask >> 1) $bc++;
			if ($bc+$i > $len) return substr($str,0,$i);	// character does not fit completely: cut before it
				// fallthru: multibyte char fits into length
		}
		return substr($str,0,$len);
	}

	/**
	 * Find position of first occurrence of a string, both arguments are in UTF-8.
	 *
	 * @param	string		UTF-8 string to search in
	 * @param	string		UTF-8 string to search for
	 * @param	integer		Position to start the search (in characters)
	 * @return	integer		The character position; FALSE if the needle is not found or the offset is beyond the string
	 * @see strpos()
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf8_strpos($haystack,$needle,$offset=0) {
		if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			return mb_strpos($haystack,$needle,$offset,'utf-8');
		}
		if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
			return iconv_strpos($haystack,$needle,$offset,'utf-8');
		}

			// Search on byte level and translate the positions
		$byte_offset = $this->utf8_char2byte_pos($haystack,$offset);
		if ($byte_offset === false) return false;	// offset beyond string length

		$byte_pos = strpos($haystack,$needle,$byte_offset);
		if ($byte_pos === false) return false;	// needle not found

		return $this->utf8_byte2char_pos($haystack,$byte_pos);
	}

	/**
	 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
*
	 * @param	string		UTF-8 string to search in
	 * @param	string		UTF-8 character to search for (single character)
	 * @return	integer		The character position
	 * @see strrpos()
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf8_strrpos($haystack,$needle) {
		if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			return mb_strrpos($haystack,$needle,'utf-8');
		}
		if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
			return iconv_strrpos($haystack,$needle,'utf-8');
		}

		$byte_pos = strrpos($haystack,$needle);
		if ($byte_pos === false) return false;	// needle not found

		return $this->utf8_byte2char_pos($haystack,$byte_pos);
	}

	/**
	 * Translates a character position into an 'absolute' byte position.
	 * Unit tested by Kasper.
	 *
	 * For a positive position the string is scanned from the start, for a negative one from the end.
	 * String bounds are checked explicitly instead of probing for an empty out-of-range offset
	 * (negative string offsets wrap around in newer PHP versions).
	 *
	 * @param	string		UTF-8 string
	 * @param	integer		Character position (negative values start from the end)
	 * @return	integer		Byte position; FALSE if the scan ends outside the string
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf8_char2byte_pos($str,$pos) {
		$size = strlen($str);
		$found = 0;				// number of characters found
		$wanted = abs($pos);	// number of characters wanted

		if ($pos >= 0) {
			$i = 0;
			$step = 1;
		} else {
			$i = $size-1;
			$step = -1;
		}

		while ($i >= 0 && $i < $size && $found < $wanted) {
				// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
			if ((ord($str[$i]) & 0xC0) != 0x80) $found++;
			$i += $step;
		}
		if ($i < 0 || $i >= $size) return false;	// offset beyond string length

		if ($pos >= 0) {
				// skip trailing multi-byte data bytes
			while ($i < $size && (ord($str[$i]) & 0xC0) == 0x80) $i++;
		} else {
				// correct offset: the scan stopped one byte before the wanted character
			$i++;
		}

		return $i;
	}

	/**
	 * Translates an 'absolute' byte position into a character position.
* Unit tested by Kasper.
	 *
	 * @param	string		UTF-8 string
	 * @param	integer		byte position
	 * @return	integer		character position
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf8_byte2char_pos($str,$pos) {
		$size = strlen($str);
			// Positions outside the string (and any position in an empty string) cannot be translated
		if ($size == 0 || $pos < 0 || $pos > $size) return false;

		$n = 0;	// number of characters
		for ($i=$pos; $i>0; $i--) {
				// The position one past the last byte (end of string) counts like a character start,
				// otherwise count single-byte (0xxxxxxx) and multi-byte starting bytes (11xxxxxx)
			if ($i == $size || (ord($str[$i]) & 0xC0) != 0x80) $n++;
		}

		return $n;
	}

	/**
	 * Maps all characters of an UTF-8 string.
	 * Characters without an entry in the selected table are copied unchanged; a stray
	 * continuation byte (invalid UTF-8) is copied as it is.
	 *
	 * @param	string		UTF-8 string
	 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
	 * @param	string		'case': conversion 'toLower' or 'toUpper'
	 * @return	string		the converted string (the input itself if the tables cannot be initialized or the mode is unknown)
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf8_char_mapping($str,$mode,$opt='') {
		if (!$this->initUnicodeData($mode)) return $str;	// do nothing

		switch($mode) {
			case 'case':
				$map = isset($this->caseFolding['utf-8'][$opt]) ? $this->caseFolding['utf-8'][$opt] : array();
			break;

			case 'ascii':
				$map = isset($this->toASCII['utf-8']) ? $this->toASCII['utf-8'] : array();
			break;

			default:
				return $str;
		}

		if (!is_array($map)) return $str;

		$out = '';
		$size = strlen($str);
		for ($i=0; $i<$size; $i++) {
			$c = ord($str[$i]);
			$mbc = $str[$i];	// single-byte (0xxxxxxx), or a stray continuation byte
			if (($c & 0xC0) == 0xC0) {	// multi-byte starting byte (11xxxxxx)
				for ($bc=0; $c & 0x80; $c = $c << 1) { $bc++; }	// calculate number of bytes
				$mbc = substr($str,$i,$bc);
				$i += $bc-1;
			}

			$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
		}

		return $out;
	}

	/********************************************
	 *
	 * 
Internal EUC string operation functions
	 *
	 * Extended Unix Code:
	 *  ASCII compatible 7bit single bytes chars
	 *  8bit two byte chars
	 *
	 * Shift-JIS is treated as a special case.
	 *
	 ********************************************/

	/**
	 * Cuts a string in the EUC charset family short at a given byte length.
	 *
	 * The string is never cut in the middle of a double-byte character: if the
	 * byte limit falls between the two bytes of a character, that character is
	 * dropped completely.
	 *
	 * @param	string		EUC multibyte character string
	 * @param	integer		the byte length
	 * @param	string		the charset
	 * @return	string		the shortened string
	 * @see mb_strcut()
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function euc_strtrunc($str,$len,$charset)	{
			// Decide on the real byte length whether truncation is needed at all. Deriving this from
			// the scan index fails when the last character is a double-byte one that straddles the limit.
		if (strlen($str) <= $len)	return $str;	// string not longer than supplied length

		$sjis = ($charset == 'shift_jis');
			// $i < $len < strlen($str) holds inside the loop, so every offset read is valid
		for ($i=0; $i<$len; $i++)	{
			$c = ord($str[$i]);
			if ($sjis)	{
				if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i++;	// advance a double-byte char
			}
			else {
				if ($c >= 0x80)	$i++;	// advance a double-byte char
			}
		}

		if ($i>$len)	{
			return substr($str,0,$len-1);	// we ended on a first byte
		}
		return substr($str,0,$len);
	}

	/**
	 * Returns a part of a string in the EUC charset family.
*
	 * Character positions are translated into byte positions with euc_char2byte_pos()
	 * and the actual cutting is done with substr().
	 *
	 * @param	string		EUC multibyte character string
	 * @param	integer		start position (character position)
	 * @param	string		the charset
	 * @param	integer		length (in characters); null means "up to the end of the string"
	 * @return	string		the substring, or false if $start lies outside of the string
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function euc_substr($str,$start,$charset,$len=null)	{
		$byte_start = $this->euc_char2byte_pos($str,$start,$charset);
		if ($byte_start === false)	return false;	// $start outside string length

		$str = substr($str,$byte_start);

			// Strict check: only an omitted length means "rest of the string".
			// A length of 0 is a real length and has to yield an empty string.
		if ($len === null)	return $str;

		$byte_end = $this->euc_char2byte_pos($str,$len,$charset);
		if ($byte_end === false)	return $str;	// $len outside actual string length

		return substr($str,0,$byte_end);
	}

	/**
	 * Counts the number of characters of a string in the EUC charset family.
	 *
	 * Every byte that is recognized as the first byte of a double-byte character
	 * makes the scan skip the following byte, so both bytes count as one character.
	 *
	 * @param	string		EUC multibyte character string
	 * @param	string		the charset
	 * @return	integer		the number of characters
	 * @see strlen()
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function euc_strlen($str,$charset)	{
		$sjis = ($charset == 'shift_jis');
		$size = strlen($str);	// byte length, bounds the scan so no out-of-range offset is read
		$n = 0;
		for ($i=0; $i<$size; $i++)	{
			$c = ord($str[$i]);
			if ($sjis)	{
				if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i++;	// advance a double-byte char
			}
			else {
				if ($c >= 0x80)	$i++;	// advance a double-byte char
			}

			$n++;
		}

		return $n;
	}

	/**
	 * Translates a character position into an 'absolute' byte position.
*
	 * For a positive position the string is scanned from the start, for a negative
	 * position it is scanned from the end. All offsets are checked against the string
	 * length before a byte is read, so no out-of-range string offsets are accessed.
	 *
	 * NOTE(review): the backward scan applies the first-byte test to whatever byte it
	 * meets first, i.e. the second byte of a double-byte character - confirm that this
	 * is reliable for every charset handled here (Shift-JIS in particular).
	 *
	 * @param	string		EUC multibyte character string
	 * @param	integer		character position (negative values start from the end)
	 * @param	string		the charset
	 * @return	integer		byte position, or false if the position lies outside of the string
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function euc_char2byte_pos($str,$pos,$charset)	{
		$sjis = ($charset == 'shift_jis');
		$size = strlen($str);
		$n = 0;				// number of characters seen
		$p = abs($pos);		// number of characters wanted

		if ($pos >= 0)	{
			$i = 0;
			$d = 1;
		} else {
			$i = $size-1;
			$d = -1;
		}

		for ( ; $i>=0 && $i<$size && $n<$p; $i+=$d)	{
			$c = ord($str[$i]);
			if ($sjis)	{
				if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i+=$d;	// advance a double-byte char
			}
			else {
				if ($c >= 0x80)	$i+=$d;	// advance a double-byte char
			}

			$n++;
		}

		if ($pos >= 0)	{
			if ($i >= $size)	return false;	// offset beyond string length
			return $i;
		}

			// Backward scan: fewer than $p characters available means the offset is beyond the string start
		if ($n < $p)	return false;

			// The loop stepped past the first byte of the wanted character: correct the offset.
			// Clamp at 0 for the case that a lone high byte at the string start was treated as double-byte.
		return max(0, $i+1);
	}

	/**
	 * Maps all characters of a string in the EUC charset family.
*
	 * Every character is looked up in the mapping table selected by $mode; characters
	 * without an entry are copied unchanged. The string is returned untouched if the
	 * init function for the requested table reports failure or if $mode is unknown.
	 *
	 * @param	string		EUC multibyte character string
	 * @param	string		the charset
	 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
	 * @param	string		'case': conversion 'toLower' or 'toUpper'
	 * @return	string		the converted string
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function euc_char_mapping($str,$charset,$mode,$opt='')	{
		switch($mode)	{
			case 'case':
				if (!$this->initCaseFolding($charset))	return $str;	// do nothing
				$map =& $this->caseFolding[$charset][$opt];
			break;

			case 'ascii':
				if (!$this->initToASCII($charset))	return $str;	// do nothing
				$map =& $this->toASCII[$charset];
			break;

			default:
				return $str;
		}

		$sjis = ($charset == 'shift_jis');
		$size = strlen($str);	// byte length, bounds the scan so no out-of-range offset is read
		$out = '';
		for ($i=0; $i<$size; $i++)	{
			$mbc = $str[$i];
			$c = ord($mbc);

				// Decide whether this byte starts a double-byte character
			if ($sjis)	{
				$double = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0));
			} else {
				$double = ($c >= 0x80);
			}

			if ($double)	{	// a double-byte char
				$mbc = substr($str,$i,2);
				$i++;
			}

			if (isset($map[$mbc]))	{
				$out .= $map[$mbc];
			} else {
				$out .= $mbc;
			}
		}

		return $out;
	}

}

	// Include the XCLASS extension script for this file if one is configured
if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])	{
	include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
}
?>
titre
Description
Corps
titre
Description
Corps
titre
Description
Corps
titre
Corps
| Généré le : Sun Nov 25 17:13:16 2007 | par Balluche grâce à PHPXref 0.7 |
|