[ Index ]
 

Code source de Typo3 4.1.3

Accédez au Source d'autres logiciels libres

Classes | Fonctions | Variables | Constantes | Tables

title

Body

[fermer]

/t3lib/ -> class.t3lib_cs.php (source)

   1  <?php
   2  /***************************************************************
   3  *  Copyright notice
   4  *
   5  *  (c) 2003-2007 Kasper Skaarhoj (kasperYYYY@typo3.com)
   6  *  All rights reserved
   7  *
   8  *  This script is part of the Typo3 project. The Typo3 project is
   9  *  free software; you can redistribute it and/or modify
  10  *  it under the terms of the GNU General Public License as published by
  11  *  the Free Software Foundation; either version 2 of the License, or
  12  *  (at your option) any later version.
  13  *
  14  *  The GNU General Public License can be found at
  15  *  http://www.gnu.org/copyleft/gpl.html.
  16  *
  17  *  This script is distributed in the hope that it will be useful,
  18  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20  *  GNU General Public License for more details.
  21  *
  22  *  This copyright notice MUST APPEAR in all copies of the script!
  23  ***************************************************************/
  24  /**
  25   * Class for conversion between charsets.
  26   *
  27   * $Id: class.t3lib_cs.php 2531 2007-10-06 19:26:53Z masi $
  28   *
  29   * @author    Kasper Skaarhoj <kasperYYYY@typo3.com>
  30   * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
  31   */
  32  /**
  33   * [CLASS/FUNCTION INDEX of SCRIPT]
  34   *
  35   *
  36   *
  37   *  136: class t3lib_cs
  38   *  488:     function parse_charset($charset)
  39   *  507:     function get_locale_charset($locale)
  40   *
  41   *              SECTION: Charset Conversion functions
  42   *  560:     function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
  43   *  600:     function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)
  44   *  617:     function utf8_encode($str,$charset)
  45   *  663:     function utf8_decode($str,$charset,$useEntityForNoChar=0)
  46   *  706:     function utf8_to_entities($str)
  47   *  739:     function entities_to_utf8($str,$alsoStdHtmlEnt=0)
  48   *  773:     function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
  49   *  823:     function UnumberToChar($cbyte)
  50   *  868:     function utf8CharToUnumber($str,$hex=0)
  51   *
  52   *              SECTION: Init functions
  53   *  911:     function initCharset($charset)
  54   *  973:     function initUnicodeData($mode=null)
  55   * 1198:     function initCaseFolding($charset)
  56   * 1260:     function initToASCII($charset)
  57   *
  58   *              SECTION: String operation functions
  59   * 1331:     function substr($charset,$string,$start,$len=null)
  60   * 1384:     function strlen($charset,$string)
  61   * 1414:     function crop($charset,$string,$len,$crop='')
  62   * 1467:     function strtrunc($charset,$string,$len)
  63   * 1501:     function conv_case($charset,$string,$case)
  64   * 1527:     function specCharsToASCII($charset,$string)
  65   *
  66   *              SECTION: Internal string operation functions
  67   * 1567:     function sb_char_mapping($str,$charset,$mode,$opt='')
  68   *
  69   *              SECTION: Internal UTF-8 string operation functions
  70   * 1622:     function utf8_substr($str,$start,$len=null)
  71   * 1655:     function utf8_strlen($str)
  72   * 1676:     function utf8_strtrunc($str,$len)
  73   * 1698:     function utf8_strpos($haystack,$needle,$offset=0)
  74   * 1723:     function utf8_strrpos($haystack,$needle)
  75   * 1745:     function utf8_char2byte_pos($str,$pos)
  76   * 1786:     function utf8_byte2char_pos($str,$pos)
  77   * 1809:     function utf8_char_mapping($str,$mode,$opt='')
  78   *
  79   *              SECTION: Internal EUC string operation functions
  80   * 1885:     function euc_strtrunc($str,$len,$charset)
  81   * 1914:     function euc_substr($str,$start,$charset,$len=null)
  82   * 1939:     function euc_strlen($str,$charset)
  83   * 1966:     function euc_char2byte_pos($str,$pos,$charset)
  84   * 2007:     function euc_char_mapping($str,$charset,$mode,$opt='')
  85   *
  86   * TOTAL FUNCTIONS: 35
  87   * (This index is automatically created/updated by the extension "extdeveval")
  88   *
  89   */
  90  
  91  
  92  
  93  
  94  
  95  
  96  
  97  
  98  /**
  99   * Notes on UTF-8
 100   *
 101   * Functions working on UTF-8 strings:
 102   *
 103   * - strchr/strstr
 104   * - strrchr
 105   * - substr_count
 106   * - implode/explode/join
 107   *
 108   * Functions nearly working on UTF-8 strings:
 109   *
 110   * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
 111   * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
 112   * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
 113   * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
 114   *
 115   * Functions NOT working on UTF-8 strings:
 116   *
 117   * - str*cmp
 118   * - stristr
 119   * - stripos
 120   * - substr
 121   * - strrev
 122   * - ereg/eregi
 123   * - split/spliti
 124   * - preg_*
 125   * - ...
 126   *
 127   */
 128  /**
 129   * Class for conversion between charsets
 130   *
 131   * @author    Kasper Skaarhoj <kasperYYYY@typo3.com>
 132   * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
 133   * @package TYPO3
 134   * @subpackage t3lib
 135   */
 136  class t3lib_cs {
 137      var $noCharByteVal=63;        // ASCII Value for chars with no equivalent.
 138  
 139          // This is the array where parsed conversion tables are stored (cached)
 140      var $parsedCharsets=array();
 141  
 142          // An array where case folding data will be stored (cached)
 143      var $caseFolding=array();
 144  
 145          // An array where charset-to-ASCII mappings are stored (cached)
 146      var $toASCII=array();
 147  
  148          // This tells the converter which charsets have two bytes per char:
 149      var $twoByteSets=array(
 150          'ucs-2'=>1,    // 2-byte Unicode
 151      );
 152  
  153          // This tells the converter which charsets have four bytes per char:
 154      var $fourByteSets=array(
 155          'ucs-4'=>1,    // 4-byte Unicode
 156          'utf-32'=>1,    // 4-byte Unicode (limited to the 21-bits of UTF-16)
 157      );
 158  
 159          // This tells the converter which charsets use a scheme like the Extended Unix Code:
 160      var $eucBasedSets=array(
 161          'gb2312'=>1,        // Chinese, simplified.
 162          'big5'=>1,        // Chinese, traditional.
 163          'euc-kr'=>1,        // Korean
 164          'shift_jis'=>1,        // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
 165      );
 166  
 167          // see    http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
 168          // http://czyborra.com/charsets/iso8859.html
 169      var $synonyms=array(
 170          'us' => 'ascii',
 171          'us-ascii'=> 'ascii',
 172          'cp819' => 'iso-8859-1',
 173          'ibm819' => 'iso-8859-1',
 174          'iso-ir-100' => 'iso-8859-1',
 175          'iso-ir-109' => 'iso-8859-2',
 176          'iso-ir-148' => 'iso-8859-9',
 177          'iso-ir-199' => 'iso-8859-14',
 178          'iso-ir-203' => 'iso-8859-15',
 179          'csisolatin1' => 'iso-8859-1',
 180          'csisolatin2' => 'iso-8859-2',
 181          'csisolatin3' => 'iso-8859-3',
 182          'csisolatin5' => 'iso-8859-9',
 183          'csisolatin8' => 'iso-8859-14',
 184          'csisolatin9' => 'iso-8859-15',
 185          'csisolatingreek' => 'iso-8859-7',
 186          'iso-celtic' => 'iso-8859-14',
 187          'latin1' => 'iso-8859-1',
 188          'latin2' => 'iso-8859-2',
 189          'latin3' => 'iso-8859-3',
 190          'latin5' => 'iso-8859-9',
 191          'latin6' => 'iso-8859-10',
 192          'latin8' => 'iso-8859-14',
 193          'latin9' => 'iso-8859-15',
 194          'l1' => 'iso-8859-1',
 195          'l2' => 'iso-8859-2',
 196          'l3' => 'iso-8859-3',
 197          'l5' => 'iso-8859-9',
 198          'l6' => 'iso-8859-10',
 199          'l8' => 'iso-8859-14',
 200          'l9' => 'iso-8859-15',
 201          'cyrillic' => 'iso-8859-5',
 202          'arabic' => 'iso-8859-6',
 203          'tis-620' => 'iso-8859-11',
 204          'win874' => 'windows-874',
 205          'win1250' => 'windows-1250',
 206          'win1251' => 'windows-1251',
 207          'win1252' => 'windows-1252',
 208          'win1253' => 'windows-1253',
 209          'win1254' => 'windows-1254',
 210          'win1255' => 'windows-1255',
 211          'win1256' => 'windows-1256',
 212          'win1257' => 'windows-1257',
 213          'win1258' => 'windows-1258',
 214          'cp1250' => 'windows-1250',
 215          'cp1251' => 'windows-1251',
 216          'cp1252' => 'windows-1252',
 217          'ms-ee' => 'windows-1250',
 218          'ms-ansi' => 'windows-1252',
 219          'ms-greek' => 'windows-1253',
 220          'ms-turk' => 'windows-1254',
 221          'winbaltrim' => 'windows-1257',
 222          'koi-8ru' => 'koi-8r',
 223          'koi8r' => 'koi-8r',
 224          'cp878' => 'koi-8r',
 225          'mac' => 'macroman',
 226          'macintosh' => 'macroman',
 227          'euc-cn' => 'gb2312',
 228          'x-euc-cn' => 'gb2312',
 229          'euccn' => 'gb2312',
 230          'cp936' => 'gb2312',
 231          'big-5' => 'big5',
 232          'cp950' => 'big5',
 233          'eucjp' => 'euc-jp',
 234          'sjis' => 'shift_jis',
 235          'shift-jis' => 'shift_jis',
 236          'cp932' => 'shift_jis',
 237          'cp949' => 'euc-kr',
 238          'utf7' => 'utf-7',
 239          'utf8' => 'utf-8',
 240          'utf16' => 'utf-16',
 241          'utf32' => 'utf-32',
 242          'utf8' => 'utf-8',
 243          'ucs2' => 'ucs-2',
 244          'ucs4' => 'ucs-4',
 245      );
 246  
 247          // mapping of iso-639:2 language codes to script names
 248      var $lang_to_script=array(
 249              // iso-639:2 language codes, see:
 250              //  http://www.w3.org/WAI/ER/IG/ert/iso639.htm
 251              //  http://www.loc.gov/standards/iso639-2/langcodes.html
 252              //  http://www.unicode.org/onlinedat/languages.html
 253          'ar' => 'arabic',
 254          'bg' => 'cyrillic',        // Bulgarian
 255          'bs' => 'east_european',    // Bosnian
 256          'cs' => 'east_european',    // Czech
 257          'da' => 'west_european',    // Danish
 258          'de' => 'west_european',    // German
 259          'es' => 'west_european',    // Spanish
 260          'et' => 'estonian',
 261          'eo' => 'unicode',        // Esperanto
 262          'eu' => 'west_european',    // Basque
 263          'fa' => 'arabic',    // Persian
 264          'fi' => 'west_european',    // Finish
 265          'fo' => 'west_european',    // Faroese
 266          'fr' => 'west_european',    // French
 267          'gr' => 'greek',
 268          'he' => 'hebrew',        // Hebrew (since 1998)
 269          'hi' => 'unicode',        // Hindi
 270          'hr' => 'east_european',    // Croatian
 271          'hu' => 'east_european',    // Hungarian
 272          'iw' => 'hebrew',        // Hebrew (til 1998)
 273          'is' => 'west_european',    // Icelandic
 274          'it' => 'west_european',    // Italian
 275          'ja' => 'japanese',
 276          'kl' => 'west_european',    // Greenlandic
 277          'ko' => 'korean',
 278          'lt' => 'lithuanian',
 279          'lv' => 'west_european',    // Latvian/Lettish
 280          'nl' => 'west_european',    // Dutch
 281          'no' => 'west_european',    // Norwegian
 282          'pl' => 'east_european',    // Polish
 283          'pt' => 'west_european',    // Portuguese
 284          'ro' => 'east_european',    // Romanian
 285          'ru' => 'cyrillic',        // Russian
 286          'sk' => 'east_european',    // Slovak
 287          'sl' => 'east_european',    // Slovenian
 288          'sr' => 'cyrillic',        // Serbian
 289          'sv' => 'west_european',    // Swedish
 290          'th' => 'thai',
 291          'uk' => 'cyrillic',        // Ukranian
 292          'vi' => 'vietnamese',
 293          'zh' => 'chinese',
 294              // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
 295              // http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
 296          'ara' => 'arabic',
 297          'bgr' => 'cyrillic',        // Bulgarian
 298          'cat' => 'west_european',    // Catalan
 299          'chs' => 'simpl_chinese',
 300          'cht' => 'trad_chinese',
 301          'csy' => 'east_european',    // Czech
 302          'dan' => 'west_european',    // Danisch
 303          'deu' => 'west_european',    // German
 304          'dea' => 'west_european',    // German (Austrian)
 305          'des' => 'west_european',    // German (Swiss)
 306          'ena' => 'west_european',    // English (Australian)
 307          'enc' => 'west_european',    // English (Canadian)
 308          'eng' => 'west_european',    // English
 309          'enz' => 'west_european',    // English (New Zealand)
 310          'enu' => 'west_european',    // English (United States)
 311          'euq' => 'west_european',    // Basque
 312          'fos' => 'west_european',    // Faroese
 313          'far' => 'arabic',    // Persian
 314          'fin' => 'west_european',    // Finish
 315          'fra' => 'west_european',    // French
 316          'frb' => 'west_european',    // French (Belgian)
 317          'frc' => 'west_european',    // French (Canadian)
 318          'frs' => 'west_european',    // French (Swiss)
 319          'ell' => 'greek',
 320          'heb' => 'hebrew',
 321          'hin' => 'unicode',    // Hindi
 322          'hun' => 'east_european',    // Hungarian
 323          'isl' => 'west_euorpean',    // Icelandic
 324          'ita' => 'west_european',    // Italian
 325          'its' => 'west_european',    // Italian (Swiss)
 326          'jpn' => 'japanese',
 327          'kor' => 'korean',
 328          'lth' => 'lithuanian',
 329          'lvi' => 'west_european',    // Latvian/Lettish
 330          'msl' => 'west_european',    // Malay
 331          'nlb' => 'west_european',    // Dutch (Belgian)
 332          'nld' => 'west_european',    // Dutch
 333          'nor' => 'west_european',    // Norwegian (bokmal)
 334          'non' => 'west_european',    // Norwegian (nynorsk)
 335          'plk' => 'east_european',    // Polish
 336          'ptg' => 'west_european',    // Portuguese
 337          'ptb' => 'west_european',    // Portuguese (Brazil)
 338          'rom' => 'east_european',    // Romanian
 339          'rus' => 'cyrillic',        // Russian
 340          'slv' => 'east_european',    // Slovenian
 341          'sky' => 'east_european',    // Slovak
 342          'srl' => 'east_european',    // Serbian (Latin)
 343          'srb' => 'cyrillic',        // Serbian (Cyrillic)
 344          'esp' => 'west_european',    // Spanish (trad. sort)
 345          'esm' => 'west_european',    // Spanish (Mexican)
 346          'esn' => 'west_european',    // Spanish (internat. sort)
 347          'sve' => 'west_european',    // Swedish
 348          'tha' => 'thai',
 349          'trk' => 'turkish',
 350          'ukr' => 'cyrillic',    // Ukrainian
 351              // English language names
 352          'arabic' => 'arabic',
 353          'basque' => 'west_european',
 354          'bosnian' => 'east_european',
 355          'bulgarian' => 'east_european',
 356          'catalan' => 'west_european',
 357          'croatian' => 'east_european',
 358          'czech' => 'east_european',
 359          'danish' => 'west_european',
 360          'dutch' => 'west_european',
 361          'english' => 'west_european',
 362          'esperanto' => 'unicode',
 363          'estonian' => 'estonian',
 364          'faroese' => 'west_european',
 365          'farsi' => 'arabic',
 366          'finnish' => 'west_european',
 367          'french' => 'west_european',
 368          'galician' => 'west_european',
 369          'german' => 'west_european',
 370          'greek' => 'greek',
 371          'greenlandic' => 'west_european',
 372          'hebrew' => 'hebrew',
 373          'hindi' => 'unicode',
 374          'hungarian' => 'east_european',
 375          'icelandic' => 'west_european',
 376          'italian' => 'west_european',
 377          'latvian' => 'west_european',
 378          'lettish' => 'west_european',
 379          'lithuanian' => 'lithuanian',
 380          'malay' => 'west_european',
 381          'norwegian' => 'west_european',
 382          'persian' => 'arabic',
 383          'polish' => 'east_european',
 384          'portuguese' => 'west_european',
 385          'russian' => 'cyrillic',
 386          'romanian' => 'east_european',
 387          'serbian' => 'cyrillic',
 388          'slovak' => 'east_european',
 389          'slovenian' => 'east_european',
 390          'spanish' => 'west_european',
 391          'svedish' => 'west_european',
 392          'that' => 'thai',
 393          'turkish' => 'turkish',
 394          'ukrainian' => 'cyrillic',
 395      );
 396  
 397          // mapping of language (family) names to charsets on Unix
 398      var $script_to_charset_unix=array(
 399          'west_european' => 'iso-8859-1',
 400          'estonian' => 'iso-8859-1',
 401          'east_european' => 'iso-8859-2',
 402          'baltic' => 'iso-8859-4',
 403          'cyrillic' => 'iso-8859-5',
 404          'arabic' => 'iso-8859-6',
 405          'greek' => 'iso-8859-7',
 406          'hebrew' => 'iso-8859-8',
 407          'turkish' => 'iso-8859-9',
 408          'thai' => 'iso-8859-11', // = TIS-620
 409          'lithuanian' => 'iso-8859-13',
 410          'chinese' => 'gb2312', // = euc-cn
 411          'japanese' => 'euc-jp',
 412          'korean' => 'euc-kr',
 413          'simpl_chinese' => 'gb2312',
 414          'trad_chinese' => 'big5',
 415          'vietnamese' => '',
 416          'unicode' => 'utf-8',
 417      );
 418  
 419          // mapping of language (family) names to charsets on Windows
 420      var $script_to_charset_windows=array(
 421          'east_european' => 'windows-1250',
 422          'cyrillic' => 'windows-1251',
 423          'west_european' => 'windows-1252',
 424          'greek' => 'windows-1253',
 425          'turkish' => 'windows-1254',
 426          'hebrew' => 'windows-1255',
 427          'arabic' => 'windows-1256',
 428          'baltic' => 'windows-1257',
 429          'estonian' => 'windows-1257',
 430          'lithuanian' => 'windows-1257',
 431          'vietnamese' => 'windows-1258',
 432          'thai' => 'cp874',
 433          'korean' => 'cp949',
 434          'chinese' => 'gb2312',
 435          'japanese' => 'shift_jis',
 436          'simpl_chinese' => 'gb2312',
 437          'trad_chinese' => 'big5',
 438      );
 439  
 440          // mapping of locale names to charsets
 441      var $locale_to_charset=array(
 442          'japanese.euc' => 'euc-jp',
 443          'ja_jp.ujis' => 'euc-jp',
 444          'korean.euc' => 'euc-kr',
 445          'sr@Latn' => 'iso-8859-2',
 446          'zh_cn' => 'gb2312',
 447          'zh_hk' => 'big5',
 448          'zh_tw' => 'big5',
 449      );
 450  
 451          // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
  452          // An empty value means "iso-8859-1"
 453      var $charSetArray = array(
 454          'dk' => '',
 455          'de' => '',
 456          'no' => '',
 457          'it' => '',
 458          'fr' => '',
 459          'es' => '',
 460          'nl' => '',
 461          'cz' => 'windows-1250',
 462          'pl' => 'iso-8859-2',
 463          'si' => 'windows-1250',
 464          'fi' => '',
 465          'tr' => 'iso-8859-9',
 466          'se' => '',
 467          'pt' => '',
 468          'ru' => 'windows-1251',
 469          'ro' => 'iso-8859-2',
 470          'ch' => 'gb2312',
 471          'sk' => 'windows-1250',
 472          'lt' => 'windows-1257',
 473          'is' => 'utf-8',
 474          'hr' => 'windows-1250',
 475          'hu' => 'iso-8859-2',
 476          'gl' => '',
 477          'th' => 'iso-8859-11',
 478          'gr' => 'iso-8859-7',
 479          'hk' => 'big5',
 480          'eu' => '',
 481          'bg' => 'windows-1251',
 482          'br' => '',
 483          'et' => 'iso-8859-4',
 484          'ar' => 'iso-8859-6',
 485          'he' => 'utf-8',
 486          'ua' => 'windows-1251',
 487          'jp' => 'shift_jis',
 488          'lv' => 'utf-8',
 489          'vn' => 'utf-8',
 490          'ca' => 'iso-8859-15',
 491          'ba' => 'iso-8859-2',
 492          'kr' => 'euc-kr',
 493          'eo' => 'utf-8',
 494          'my' => '',
 495          'hi' => 'utf-8',
 496          'fo' => 'utf-8',
 497          'fa' => 'utf-8',
 498          'sr' => 'utf-8'
 499      );
 500  
 501          // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
  502          // A missing key means the ISO name is the same as the TYPO3 key
 503      var $isoArray = array(
 504          'ba' => 'bs',
 505          'br' => 'pt_BR',
 506          'ch' => 'zh_CN',
 507          'cz' => 'cs',
 508          'dk' => 'da',
 509          'si' => 'sl',
 510          'se' => 'sv',
 511          'gl' => 'kl',
 512          'gr' => 'el',
 513          'hk' => 'zh_HK',
 514          'kr' => 'ko',
 515          'ua' => 'uk',
 516          'jp' => 'ja',
 517          'vn' => 'vi',
 518      );
 519  
 520      /**
 521       * Normalize - changes input character set to lowercase letters.
 522       *
 523       * @param    string        Input charset
 524       * @return    string        Normalized charset
 525       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
 526       */
 527  	function parse_charset($charset)    {
 528          $charset = trim(strtolower($charset));
 529          if (isset($this->synonyms[$charset]))    $charset = $this->synonyms[$charset];
 530  
 531          return $charset;
 532      }
 533  
 534      /**
 535       * Get the charset of a locale.
 536       *
 537       * ln            language
 538       * ln_CN         language / country
 539       * ln_CN.cs      language / country / charset
 540       * ln_CN.cs@mod  language / country / charset / modifier
 541       *
 542       * @param    string        Locale string
 543       * @return    string        Charset resolved for locale string
 544       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
 545       */
 546  	function get_locale_charset($locale)    {
 547          $locale = strtolower($locale);
 548  
 549              // exact locale specific charset?
 550          if (isset($this->locale_to_charset[$locale]))    return $this->locale_to_charset[$locale];
 551  
 552              // get modifier
 553          list($locale,$modifier) = explode('@',$locale);
 554  
 555              // locale contains charset: use it
 556          list($locale,$charset) = explode('.',$locale);
 557          if ($charset)    return $this->parse_charset($charset);
 558  
 559              // modifier is 'euro' (after charset check, because of xx.utf-8@euro)
 560          if ($modifier == 'euro')    return 'iso-8859-15';
 561  
 562              // get language
 563          list($language,$country) = explode('_',$locale);
 564          if (isset($this->lang_to_script[$language]))    $script = $this->lang_to_script[$language];
 565  
 566          if (TYPO3_OS == 'WIN')    {
 567              $cs = $this->script_to_charset_windows[$script] ? $this->script_to_charset_windows[$script] : 'windows-1252';
 568          } else {
 569              $cs = $this->script_to_charset_unix[$script] ? $this->script_to_charset_unix[$script] : 'iso-8859-1';
 570          }
 571  
 572          return $cs;
 573      }
 574  
 575  
 576  
 577  
 578  
 579  
 580  
 581  
 582  
 583      /********************************************
 584       *
 585       * Charset Conversion functions
 586       *
 587       ********************************************/
 588  
 589      /**
 590       * Convert from one charset to another charset.
 591       *
 592       * @param    string        Input string
 593       * @param    string        From charset (the current charset of the string)
 594       * @param    string        To charset (the output charset wanted)
 595       * @param    boolean        If set, then characters that are not available in the destination character set will be encoded as numeric entities
 596       * @return    string        Converted string
 597       * @see convArray()
 598       */
 599  	function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)    {
 600          if ($fromCS==$toCS)    return $str;
 601  
 602              // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
 603          if ($toCS=='utf-8' || !$useEntityForNoChar)    {
 604              switch($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'])    {
 605              case 'mbstring':
 606                  $conv_str = mb_convert_encoding($str,$toCS,$fromCS);
 607                  if (false !== $conv_str)    return $conv_str; // returns false for unsupported charsets
 608                  break;
 609  
 610              case 'iconv':
 611                  $conv_str = iconv($fromCS,$toCS.'//TRANSLIT',$str);
 612                  if (false !== $conv_str)    return $conv_str;
 613                  break;
 614  
 615              case 'recode':
 616                  $conv_str = recode_string($fromCS.'..'.$toCS,$str);
 617                  if (false !== $conv_str)    return $conv_str;
 618                  break;
 619              }
 620              // fallback to TYPO3 conversion
 621          }
 622  
 623          if ($fromCS!='utf-8')    $str=$this->utf8_encode($str,$fromCS);
 624          if ($toCS!='utf-8')    $str=$this->utf8_decode($str,$toCS,$useEntityForNoChar);
 625          return $str;
 626      }
 627  
 628      /**
 629       * Convert all elements in ARRAY from one charset to another charset.
 630       * NOTICE: Array is passed by reference!
 631       *
 632       * @param    string        Input array, possibly multidimensional
 633       * @param    string        From charset (the current charset of the string)
 634       * @param    string        To charset (the output charset wanted)
 635       * @param    boolean        If set, then characters that are not available in the destination character set will be encoded as numeric entities
 636       * @return    void
 637       * @see conv()
 638       */
 639  	function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)    {
 640          foreach($array as $key => $value)    {
 641              if (is_array($array[$key]))    {
 642                  $this->convArray($array[$key],$fromCS,$toCS,$useEntityForNoChar);
 643              } else {
 644                  $array[$key] = $this->conv($array[$key],$fromCS,$toCS,$useEntityForNoChar);
 645              }
 646          }
 647      }
 648  
 649      /**
 650       * Converts $str from $charset to UTF-8
 651       *
 652       * @param    string        String in local charset to convert to UTF-8
 653       * @param    string        Charset, lowercase. Must be found in csconvtbl/ folder.
 654       * @return    string        Output string, converted to UTF-8
 655       */
 656  	function utf8_encode($str,$charset)    {
 657  
 658          if ($charset === 'utf-8')    return $str;
 659  
 660              // Charset is case-insensitive.
 661          if ($this->initCharset($charset))    {    // Parse conv. table if not already...
 662              $strLen = strlen($str);
 663              $outStr='';
 664  
 665              for ($a=0;$a<$strLen;$a++)    {    // Traverse each char in string.
 666                  $chr=substr($str,$a,1);
 667                  $ord=ord($chr);
 668                  if (isset($this->twoByteSets[$charset]))    {    // If the charset has two bytes per char
 669                      $ord2 = ord($str{$a+1});
 670                      $ord = $ord<<8 | $ord2; // assume big endian
 671  
 672                      if (isset($this->parsedCharsets[$charset]['local'][$ord]))    {    // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
 673                          $outStr.=$this->parsedCharsets[$charset]['local'][$ord];
 674                      } else $outStr.=chr($this->noCharByteVal);    // No char exists
 675                      $a++;
 676                  } elseif ($ord>127)    {    // If char has value over 127 it's a multibyte char in UTF-8
 677                      if (isset($this->eucBasedSets[$charset]))    {    // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
 678                          if ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF))    {    // Shift-JIS: chars between 160 and 223 are single byte
 679                              $a++;
 680                              $ord2=ord(substr($str,$a,1));
 681                              $ord = $ord*256+$ord2;
 682                          }
 683                      }
 684  
 685                      if (isset($this->parsedCharsets[$charset]['local'][$ord]))    {    // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
 686                          $outStr.= $this->parsedCharsets[$charset]['local'][$ord];
 687                      } else $outStr.= chr($this->noCharByteVal);    // No char exists
 688                  } else $outStr.= $chr;    // ... otherwise it's just ASCII 0-127 and one byte. Transparent
 689              }
 690              return $outStr;
 691          }
 692      }
 693  
 694      /**
 695       * Converts $str from UTF-8 to $charset
 696       *
 697       * @param    string        String in UTF-8 to convert to local charset
 698       * @param    string        Charset, lowercase. Must be found in csconvtbl/ folder.
 699       * @param    boolean        If set, then characters that are not available in the destination character set will be encoded as numeric entities
 700       * @return    string        Output string, converted to local charset
 701       */
 702  	function utf8_decode($str,$charset,$useEntityForNoChar=0)    {
 703  
 704              // Charset is case-insensitive.
 705          if ($this->initCharset($charset))    {    // Parse conv. table if not already...
 706              $strLen = strlen($str);
 707              $outStr='';
 708              $buf='';
 709              for ($a=0,$i=0;$a<$strLen;$a++,$i++)    {    // Traverse each char in UTF-8 string.
 710                  $chr=substr($str,$a,1);
 711                  $ord=ord($chr);
 712                  if ($ord>127)    {    // This means multibyte! (first byte!)
 713                      if ($ord & 64)    {    // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
 714  
 715                          $buf=$chr;    // Add first byte
 716                          for ($b=0;$b<8;$b++)    {    // for each byte in multibyte string...
 717                              $ord = $ord << 1;    // Shift it left and ...
 718                              if ($ord & 128)    {    // ... and with 8th bit - if that is set, then there are still bytes in sequence.
 719                                  $a++;    // Increase pointer...
 720                                  $buf.=substr($str,$a,1);    // ... and add the next char.
 721                              } else break;
 722                          }
 723  
 724                          if (isset($this->parsedCharsets[$charset]['utf8'][$buf]))    {    // If the UTF-8 char-sequence is found then...
 725                              $mByte = $this->parsedCharsets[$charset]['utf8'][$buf];    // The local number
 726                              if ($mByte>255)    {    // If the local number is greater than 255 we will need to split the byte (16bit word assumed) in two chars.
 727                                  $outStr.= chr(($mByte >> 8) & 255).chr($mByte & 255);
 728                              } else $outStr.= chr($mByte);
 729                          } elseif ($useEntityForNoChar) {    // Create num entity:
 730                              $outStr.='&#'.$this->utf8CharToUnumber($buf,1).';';
 731                          } else $outStr.=chr($this->noCharByteVal);    // No char exists
 732                      } else $outStr.=chr($this->noCharByteVal);    // No char exists (MIDDLE of MB sequence!)
 733                  } else $outStr.=$chr;    // ... otherwise it's just ASCII 0-127 and one byte. Transparent
 734              }
 735              return $outStr;
 736          }
 737      }
 738  
 739      /**
 740       * Converts all chars > 127 to numeric entities.
 741       *
 742       * @param    string        Input string
 743       * @return    string        Output string
 744       */
 745  	function utf8_to_entities($str)    {
 746          $strLen = strlen($str);
 747          $outStr='';
 748          $buf='';
 749          for ($a=0;$a<$strLen;$a++)    {    // Traverse each char in UTF-8 string.
 750              $chr=substr($str,$a,1);
 751              $ord=ord($chr);
 752              if ($ord>127)    {    // This means multibyte! (first byte!)
 753                  if ($ord & 64)    {    // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
 754                      $buf=$chr;    // Add first byte
 755                      for ($b=0;$b<8;$b++)    {    // for each byte in multibyte string...
 756                          $ord = $ord << 1;    // Shift it left and ...
 757                          if ($ord & 128)    {    // ... and with 8th bit - if that is set, then there are still bytes in sequence.
 758                              $a++;    // Increase pointer...
 759                              $buf.=substr($str,$a,1);    // ... and add the next char.
 760                          } else break;
 761                      }
 762  
 763                      $outStr.='&#'.$this->utf8CharToUnumber($buf,1).';';
 764                  } else $outStr.=chr($this->noCharByteVal);    // No char exists (MIDDLE of MB sequence!)
 765              } else $outStr.=$chr;    // ... otherwise it's just ASCII 0-127 and one byte. Transparent
 766          }
 767  
 768          return $outStr;
 769      }
 770  
 771      /**
 772       * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 773       *
 774       * @param    string        Input string, UTF-8
 775       * @param    boolean        If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 776       * @return    string        Output string
 777       */
 778  	function entities_to_utf8($str,$alsoStdHtmlEnt=0)    {
 779          if ($alsoStdHtmlEnt)    {
 780              $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES));        // Getting them in iso-8859-1 - but thats ok since this is observed below.
 781          }
 782  
 783          $token = md5(microtime());
 784          $parts = explode($token,ereg_replace('(&([#[:alnum:]]*);)',$token.'\2'.$token,$str));
 785          foreach($parts as $k => $v)    {
 786              if ($k%2)    {
 787                  if (substr($v,0,1)=='#')    {    // Dec or hex entities:
 788                      if (substr($v,1,1)=='x')    {
 789                          $parts[$k] = $this->UnumberToChar(hexdec(substr($v,2)));
 790                      } else {
 791                          $parts[$k] = $this->UnumberToChar(substr($v,1));
 792                      }
 793                  } elseif ($alsoStdHtmlEnt && $trans_tbl['&'.$v.';']) {    // Other entities:
 794                      $parts[$k] = $this->utf8_encode($trans_tbl['&'.$v.';'],'iso-8859-1');
 795                  } else {    // No conversion:
 796                      $parts[$k] ='&'.$v.';';
 797                  }
 798              }
 799          }
 800  
 801          return implode('',$parts);
 802      }
 803  
 804      /**
 805       * Converts all chars in the input UTF-8 string into integer numbers returned in an array
 806       *
 807       * @param    string        Input string, UTF-8
 808       * @param    boolean        If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 809       * @param    boolean        If set, then instead of integer numbers the real UTF-8 char is returned.
 810       * @return    array        Output array with the char numbers
 811       */
 812  	function utf8_to_numberarray($str,$convEntities=0,$retChar=0)    {
 813              // If entities must be registered as well...:
 814          if ($convEntities)    {
 815              $str = $this->entities_to_utf8($str,1);
 816          }
 817              // Do conversion:
 818          $strLen = strlen($str);
 819          $outArr=array();
 820          $buf='';
 821          for ($a=0;$a<$strLen;$a++)    {    // Traverse each char in UTF-8 string.
 822              $chr=substr($str,$a,1);
 823              $ord=ord($chr);
 824              if ($ord>127)    {    // This means multibyte! (first byte!)
 825                  if ($ord & 64)    {    // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
 826                      $buf=$chr;    // Add first byte
 827                      for ($b=0;$b<8;$b++)    {    // for each byte in multibyte string...
 828                          $ord = $ord << 1;    // Shift it left and ...
 829                          if ($ord & 128)    {    // ... and with 8th bit - if that is set, then there are still bytes in sequence.
 830                              $a++;    // Increase pointer...
 831                              $buf.=substr($str,$a,1);    // ... and add the next char.
 832                          } else break;
 833                      }
 834  
 835                      $outArr[]=$retChar?$buf:$this->utf8CharToUnumber($buf);
 836                  } else $outArr[]=$retChar?chr($this->noCharByteVal):$this->noCharByteVal;    // No char exists (MIDDLE of MB sequence!)
 837              } else $outArr[]=$retChar?chr($ord):$ord;    // ... otherwise it's just ASCII 0-127 and one byte. Transparent
 838          }
 839  
 840          return $outArr;
 841      }
 842  
 843      /**
 844       * Converts a UNICODE number to a UTF-8 multibyte character
 845       * Algorithm based on script found at From: http://czyborra.com/utf/
 846       * Unit-tested by Kasper
 847       *
 848       * The binary representation of the character's integer value is thus simply spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
 849       *
 850       *  bytes | bits | representation
 851       *      1 |    7 | 0vvvvvvv
 852       *      2 |   11 | 110vvvvv 10vvvvvv
 853       *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
 854       *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 855       *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 856       *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 857       *
 858       * @param    integer        UNICODE integer
 859       * @return    string        UTF-8 multibyte character string
 860       * @see utf8CharToUnumber()
 861       */
 862  	function UnumberToChar($cbyte)    {
 863          $str='';
 864  
 865          if ($cbyte < 0x80) {
 866              $str.=chr($cbyte);
 867          } else if ($cbyte < 0x800) {
 868              $str.=chr(0xC0 | ($cbyte >> 6));
 869              $str.=chr(0x80 | ($cbyte & 0x3F));
 870          } else if ($cbyte < 0x10000) {
 871              $str.=chr(0xE0 | ($cbyte >> 12));
 872              $str.=chr(0x80 | (($cbyte >> 6) & 0x3F));
 873              $str.=chr(0x80 | ($cbyte & 0x3F));
 874          } else if ($cbyte < 0x200000) {
 875              $str.=chr(0xF0 | ($cbyte >> 18));
 876              $str.=chr(0x80 | (($cbyte >> 12) & 0x3F));
 877              $str.=chr(0x80 | (($cbyte >> 6) & 0x3F));
 878              $str.=chr(0x80 | ($cbyte & 0x3F));
 879          } else if ($cbyte < 0x4000000) {
 880              $str.=chr(0xF8 | ($cbyte >> 24));
 881              $str.=chr(0x80 | (($cbyte >> 18) & 0x3F));
 882              $str.=chr(0x80 | (($cbyte >> 12) & 0x3F));
 883              $str.=chr(0x80 | (($cbyte >> 6) & 0x3F));
 884              $str.=chr(0x80 | ($cbyte & 0x3F));
 885          } else if ($cbyte < 0x80000000) {
 886              $str.=chr(0xFC | ($cbyte >> 30));
 887              $str.=chr(0x80 | (($cbyte >> 24) & 0x3F));
 888              $str.=chr(0x80 | (($cbyte >> 18) & 0x3F));
 889              $str.=chr(0x80 | (($cbyte >> 12) & 0x3F));
 890              $str.=chr(0x80 | (($cbyte >> 6) & 0x3F));
 891              $str.=chr(0x80 | ($cbyte & 0x3F));
 892          } else { // Cannot express a 32-bit character in UTF-8
 893              $str .= chr($this->noCharByteVal);
 894          }
 895          return $str;
 896      }
 897  
 898      /**
 899       * Converts a UTF-8 Multibyte character to a UNICODE number
 900       * Unit-tested by Kasper
 901       *
 902       * @param    string        UTF-8 multibyte character string
 903       * @param    boolean        If set, then a hex. number is returned.
 904       * @return    integer        UNICODE integer
 905       * @see UnumberToChar()
 906       */
 907  	function utf8CharToUnumber($str,$hex=0)    {
 908          $ord=ord(substr($str,0,1));    // First char
 909  
 910          if (($ord & 192) == 192)    {    // This verifyes that it IS a multi byte string
 911              $binBuf='';
 912              for ($b=0;$b<8;$b++)    {    // for each byte in multibyte string...
 913                  $ord = $ord << 1;    // Shift it left and ...
 914                  if ($ord & 128)    {    // ... and with 8th bit - if that is set, then there are still bytes in sequence.
 915                      $binBuf.=substr('00000000'.decbin(ord(substr($str,$b+1,1))),-6);
 916                  } else break;
 917              }
 918              $binBuf=substr('00000000'.decbin(ord(substr($str,0,1))),-(6-$b)).$binBuf;
 919  
 920              $int = bindec($binBuf);
 921          } else $int = $ord;
 922  
 923          return $hex ? 'x'.dechex($int) : $int;
 924      }
 925  
 926  
 927  
 928  
 929  
 930  
 931  
 932  
 933  
 934      /********************************************
 935       *
 936       * Init functions
 937       *
 938       ********************************************/
 939  
 940      /**
 941       * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
 942       * This function is automatically called by the conversion functions
 943       *
 944       * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 945       *
 946       * @param    string        The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 947       * @return    integer        Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
 948       * @access private
 949       */
 950  	function initCharset($charset)    {
 951              // Only process if the charset is not yet loaded:
 952          if (!is_array($this->parsedCharsets[$charset]))    {
 953  
 954                  // Conversion table filename:
 955              $charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';
 956  
 957                  // If the conversion table is found:
 958              if ($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile))    {
 959                      // Cache file for charsets:
 960                      // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
 961                  $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl');
 962                  if ($cacheFile && @is_file($cacheFile))    {
 963                      $this->parsedCharsets[$charset]=unserialize(t3lib_div::getUrl($cacheFile));
 964                  } else {
 965                          // Parse conversion table into lines:
 966                      $lines=t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);
 967                          // Initialize the internal variable holding the conv. table:
 968                      $this->parsedCharsets[$charset]=array('local'=>array(),'utf8'=>array());
 969                          // traverse the lines:
 970                      $detectedType='';
 971                      foreach($lines as $value)    {
 972                          if (trim($value) && substr($value,0,1)!='#')    {    // Comment line or blanks are ignored.
 973  
 974                                  // Detect type if not done yet: (Done on first real line)
 975                                  // The "whitespaced" type is on the syntax     "0x0A    0x000A    #LINE FEED"     while     "ms-token" is like         "B9 = U+00B9 : SUPERSCRIPT ONE"
 976                              if (!$detectedType)        $detectedType = ereg('[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+',$value) ? 'whitespaced' : 'ms-token';
 977  
 978                              if ($detectedType=='ms-token')    {
 979                                  list($hexbyte,$utf8) = split('=|:',$value,3);
 980                              } elseif ($detectedType=='whitespaced')    {
 981                                  $regA=array();
 982                                  ereg('[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+',$value,$regA);
 983                                  $hexbyte = $regA[1];
 984                                  $utf8 = 'U+'.$regA[2];
 985                              }
 986                              $decval = hexdec(trim($hexbyte));
 987                              if ($decval>127)    {
 988                                  $utf8decval = hexdec(substr(trim($utf8),2));
 989                                  $this->parsedCharsets[$charset]['local'][$decval]=$this->UnumberToChar($utf8decval);
 990                                  $this->parsedCharsets[$charset]['utf8'][$this->parsedCharsets[$charset]['local'][$decval]]=$decval;
 991                              }
 992                          }
 993                      }
 994                      if ($cacheFile)    {
 995                          t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->parsedCharsets[$charset]));
 996                      }
 997                  }
 998                  return 2;
 999              } else return false;
1000          } else return 1;
1001      }
1002  
1003      /**
1004       * This function initializes all UTF-8 character data tables.
1005       *
1006       * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
1007       *
1008       * @param    string        Mode ("case", "ascii", ...)
1009       * @return    integer        Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
1010       * @access private
1011       */
1012  	function initUnicodeData($mode=null)    {
1013              // cache files
1014          $cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
1015          $cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');
1016  
1017              // Only process if the tables are not yet loaded
1018          switch($mode)    {
1019              case 'case':
1020                  if (is_array($this->caseFolding['utf-8']))    return 1;
1021  
1022                      // Use cached version if possible
1023                  if ($cacheFileCase && @is_file($cacheFileCase))    {
1024                      $this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
1025                      return 2;
1026                  }
1027                  break;
1028  
1029              case 'ascii':
1030                  if (is_array($this->toASCII['utf-8']))    return 1;
1031  
1032                      // Use cached version if possible
1033                  if ($cacheFileASCII && @is_file($cacheFileASCII))    {
1034                      $this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
1035                      return 2;
1036                  }
1037                  break;
1038          }
1039  
1040              // process main Unicode data file
1041          $unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
1042          if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile)))    return false;
1043  
1044          $fh = fopen($unicodeDataFile,'rb');
1045          if (!$fh)    return false;
1046  
1047              // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
1048              // note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
1049          $this->caseFolding['utf-8'] = array();
1050          $utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
1051          $utf8CaseFolding['toUpper'] = array();
1052          $utf8CaseFolding['toLower'] = array();
1053          $utf8CaseFolding['toTitle'] = array();
1054  
1055          $decomposition = array();    // array of temp. decompositions
1056          $mark = array();        // array of chars that are marks (eg. composing accents)
1057          $number = array();        // array of chars that are numbers (eg. digits)
1058          $omit = array();        // array of chars to be omitted (eg. Russian hard sign)
1059  
1060          while (!feof($fh))    {
1061              $line = fgets($fh,4096);
1062                  // has a lot of info
1063              list($char,$name,$cat,,,$decomp,,,$num,,,,$upper,$lower,$title,) = split(';', rtrim($line));
1064  
1065              $ord = hexdec($char);
1066              if ($ord > 0xFFFF)    break;    // only process the BMP
1067  
1068              $utf8_char = $this->UnumberToChar($ord);
1069  
1070              if ($upper)    $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
1071              if ($lower)    $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
1072                  // store "title" only when different from "upper" (only a few)
1073              if ($title && $title != $upper)    $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
1074  
1075              switch ($cat{0})    {
1076                  case 'M':    // mark (accent, umlaut, ...)
1077                      $mark["U+$char"] = 1;
1078                      break;
1079  
1080                  case 'N':    // numeric value
1081                      if ($ord > 0x80 && $num != '')    $number["U+$char"] = $num;
1082              }
1083  
1084                  // accented Latin letters without "official" decomposition
1085              $match = array();
1086              if (ereg('^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH',$name,$match) && !$decomp)    {
1087                  $c = ord($match[2]);
1088                  if ($match[1] == 'SMALL')    $c += 32;
1089  
1090                  $decomposition["U+$char"] = array(dechex($c));
1091                  continue;
1092              }
1093  
1094              $match = array();
1095              if (ereg('(<.*>)? *(.+)',$decomp,$match))    {
1096                  switch($match[1])    {
1097                      case '<circle>':    // add parenthesis as circle replacement, eg (1)
1098                          $match[2] = '0028 '.$match[2].' 0029';
1099                          break;
1100  
1101                      case '<square>':    // add square brackets as square replacement, eg [1]
1102                          $match[2] = '005B '.$match[2].' 005D';
1103                          break;
1104  
1105                      case '<compat>':    // ignore multi char decompositions that start with a space
1106                          if (ereg('^0020 ',$match[2]))    continue 2;
1107                          break;
1108  
1109                          // ignore Arabic and vertical layout presentation decomposition
1110                      case '<initial>':
1111                      case '<medial>':
1112                      case '<final>':
1113                      case '<isolated>':
1114                      case '<vertical>':
1115                          continue 2;
1116                  }
1117                  $decomposition["U+$char"] = split(' ',$match[2]);
1118              }
1119          }
1120          fclose($fh);
1121  
1122              // process additional Unicode data for casing (allow folded characters to expand into a sequence)
1123          $specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
1124          if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile))    {
1125              $fh = fopen($specialCasingFile,'rb');
1126              if ($fh)    {
1127                  while (!feof($fh))    {
1128                      $line = fgets($fh,4096);
1129                      if ($line{0} != '#' && trim($line) != '')    {
1130  
1131                          list($char,$lower,$title,$upper,$cond) = t3lib_div::trimExplode(';', $line);
1132                          if ($cond == '' || $cond{0} == '#')    {
1133                              $utf8_char = $this->UnumberToChar(hexdec($char));
1134                              if ($char != $lower)    {
1135                                  $arr = split(' ',$lower);
1136                                  for ($i=0; isset($arr[$i]); $i++)    $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1137                                  $utf8CaseFolding['toLower'][$utf8_char] = implode('',$arr);
1138                              }
1139                              if ($char != $title && $title != $upper)    {
1140                                  $arr = split(' ',$title);
1141                                  for ($i=0; isset($arr[$i]); $i++)    $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1142                                  $utf8CaseFolding['toTitle'][$utf8_char] = implode('',$arr);
1143                              }
1144                              if ($char != $upper)    {
1145                                      $arr = split(' ',$upper);
1146                                  for ($i=0; isset($arr[$i]); $i++)    $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1147                                  $utf8CaseFolding['toUpper'][$utf8_char] = implode('',$arr);
1148                              }
1149                          }
1150                      }
1151                  }
1152                  fclose($fh);
1153              }
1154          }
1155  
1156              // process custom decompositions
1157          $customTranslitFile = PATH_t3lib.'unidata/Translit.txt';
1158          if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile))    {
1159              $fh = fopen($customTranslitFile,'rb');
1160              if ($fh)    {
1161                  while (!feof($fh))    {
1162                      $line = fgets($fh,4096);
1163                      if ($line{0} != '#' && trim($line) != '')    {
1164                          list($char,$translit) = t3lib_div::trimExplode(';', $line);
1165                          if (!$translit)    $omit["U+$char"] = 1;
1166                          $decomposition["U+$char"] = split(' ', $translit);
1167  
1168                      }
1169                  }
1170                  fclose($fh);
1171              }
1172          }
1173  
1174              // decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
1175          foreach($decomposition as $from => $to)    {
1176              $code_decomp = array();
1177  
1178              while ($code_value = array_shift($to))    {
1179                  if (isset($decomposition["U+$code_value"]))    {    // do recursive decomposition
1180                      foreach(array_reverse($decomposition["U+$code_value"]) as $cv)    {
1181                          array_unshift($to, $cv);
1182                      }
1183                  } elseif (!isset($mark["U+$code_value"])) {    // remove mark
1184                      array_push($code_decomp, $code_value);
1185                  }
1186              }
1187              if (count($code_decomp) || isset($omit[$from]))    {
1188                  $decomposition[$from] = $code_decomp;
1189              } else {
1190                  unset($decomposition[$from]);
1191              }
1192          }
1193  
1194              // create ascii only mapping
1195          $this->toASCII['utf-8'] = array();
1196          $ascii =& $this->toASCII['utf-8'];
1197  
1198          foreach($decomposition as $from => $to)    {
1199              $code_decomp = array();
1200              while ($code_value = array_shift($to))    {
1201                  $ord = hexdec($code_value);
1202                  if ($ord > 127)
1203                      continue 2;    // skip decompositions containing non-ASCII chars
1204                  else
1205                      array_push($code_decomp,chr($ord));
1206              }
1207              $ascii[$this->UnumberToChar(hexdec($from))] = join('',$code_decomp);
1208          }
1209  
1210              // add numeric decompositions
1211          foreach($number as $from => $to)    {
1212              $utf8_char = $this->UnumberToChar(hexdec($from));
1213              if (!isset($ascii[$utf8_char]))    {
1214                  $ascii[$utf8_char] = $to;
1215              }
1216          }
1217  
1218          if ($cacheFileCase)    {
1219                  t3lib_div::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
1220          }
1221  
1222          if ($cacheFileASCII)    {
1223                  t3lib_div::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
1224          }
1225  
1226          return 3;
1227      }
1228  
1229      /**
1230       * This function initializes the folding table for a charset other than UTF-8.
1231       * This function is automatically called by the case folding functions.
1232       *
1233       * @param    string        Charset for which to initialize case folding.
1234       * @return    integer        Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
1235       * @access private
1236       */
1237  	function initCaseFolding($charset)    {
1238              // Only process if the case table is not yet loaded:
1239          if (is_array($this->caseFolding[$charset]))    return 1;
1240  
1241              // Use cached version if possible
1242          $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
1243          if ($cacheFile && @is_file($cacheFile))    {
1244              $this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
1245              return 2;
1246          }
1247  
1248              // init UTF-8 conversion for this charset
1249          if (!$this->initCharset($charset))    {
1250              return false;
1251          }
1252  
1253              // UTF-8 case folding is used as the base conversion table
1254          if (!$this->initUnicodeData('case'))    {
1255              return false;
1256          }
1257  
1258          $nochar = chr($this->noCharByteVal);
1259          foreach ($this->parsedCharsets[$charset]['local'] as $ci => $utf8)    {
1260                  // reconvert to charset (don't use chr() of numeric value, might be muli-byte)
1261              $c = $this->utf8_decode($utf8, $charset);
1262  
1263                  // $cc = $this->conv($this->caseFolding['utf-8']['toUpper'][$utf8], 'utf-8', $charset);
1264              $cc = $this->utf8_decode($this->caseFolding['utf-8']['toUpper'][$utf8], $charset);
1265              if ($cc != '' && $cc != $nochar)    $this->caseFolding[$charset]['toUpper'][$c] = $cc;
1266  
1267                  // $cc = $this->conv($this->caseFolding['utf-8']['toLower'][$utf8], 'utf-8', $charset);
1268              $cc = $this->utf8_decode($this->caseFolding['utf-8']['toLower'][$utf8], $charset);
1269              if ($cc != '' && $cc != $nochar)    $this->caseFolding[$charset]['toLower'][$c] = $cc;
1270  
1271                  // $cc = $this->conv($this->caseFolding['utf-8']['toTitle'][$utf8], 'utf-8', $charset);
1272              $cc = $this->utf8_decode($this->caseFolding['utf-8']['toTitle'][$utf8], $charset);
1273              if ($cc != '' && $cc != $nochar)    $this->caseFolding[$charset]['toTitle'][$c] = $cc;
1274          }
1275  
1276              // add the ASCII case table
1277          for ($i=ord('a'); $i<=ord('z'); $i++)    {
1278              $this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i-32);
1279          }
1280          for ($i=ord('A'); $i<=ord('Z'); $i++)    {
1281              $this->caseFolding[$charset]['toLower'][chr($i)] = chr($i+32);
1282          }
1283  
1284          if ($cacheFile)    {
1285                  t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->caseFolding[$charset]));
1286          }
1287  
1288          return 3;
1289      }
1290  
1291      /**
1292       * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
1293       * This function is automatically called by the ASCII transliteration functions.
1294       *
1295       * @param    string        Charset for which to initialize conversion.
1296       * @return    integer        Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
1297       * @access private
1298       */
1299  	function initToASCII($charset)    {
1300              // Only process if the case table is not yet loaded:
1301          if (is_array($this->toASCII[$charset]))    return 1;
1302  
1303              // Use cached version if possible
1304          $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
1305          if ($cacheFile && @is_file($cacheFile))    {
1306              $this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
1307              return 2;
1308          }
1309  
1310              // init UTF-8 conversion for this charset
1311          if (!$this->initCharset($charset))    {
1312              return false;
1313          }
1314  
1315              // UTF-8/ASCII transliteration is used as the base conversion table
1316          if (!$this->initUnicodeData('ascii'))    {
1317              return false;
1318          }
1319  
1320          $nochar = chr($this->noCharByteVal);
1321          foreach ($this->parsedCharsets[$charset]['local'] as $ci => $utf8)    {
1322                  // reconvert to charset (don't use chr() of numeric value, might be muli-byte)
1323              $c = $this->utf8_decode($utf8, $charset);
1324  
1325              if (isset($this->toASCII['utf-8'][$utf8]))    {
1326                  $this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
1327              }
1328          }
1329  
1330          if ($cacheFile)    {
1331                  t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->toASCII[$charset]));
1332          }
1333  
1334          return 3;
1335      }
1336  
1337  
1338  
1339  
1340  
1341  
1342  
1343  
1344  
1345  
1346  
1347  
1348  
1349  
1350  
1351  
1352      /********************************************
1353       *
1354       * String operation functions
1355       *
1356       ********************************************/
1357  
1358      /**
1359       * Returns a part of a string.
1360       * Unit-tested by Kasper (single byte charsets only)
1361       *
1362       * @param    string        The character set
1363       * @param    string        Character string
1364       * @param    integer        Start position (character position)
1365       * @param    integer        Length (in characters)
1366       * @return    string        The substring
1367       * @see substr(), mb_substr()
1368       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
1369       */
1370  	function substr($charset,$string,$start,$len=null)    {
1371          if ($len===0)    return '';
1372  
1373          if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')    {
1374                  // cannot omit $len, when specifying charset
1375              if ($len==null)    {
1376                  $enc = mb_internal_encoding();    // save internal encoding
1377                  mb_internal_encoding($charset);
1378                  $str = mb_substr($string,$start);
1379                  mb_internal_encoding($enc);    // restore internal encoding
1380  
1381                  return $str;
1382              }
1383              else {
1384                  return mb_substr($string,$start,$len,$charset);
1385              }
1386          } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv')    {
1387                  // cannot omit $len, when specifying charset
1388              if ($len==null)    {
1389                  $enc = iconv_get_encoding('internal_encoding');    // save internal encoding
1390                  iconv_set_encoding('internal_encoding',$charset);
1391                  $str = iconv_substr($string,$start);
1392                  iconv_set_encoding('internal_encoding',$enc);    // restore internal encoding
1393  
1394                  return $str;
1395              }
1396              else {
1397                  return iconv_substr($string,$start,$len,$charset);
1398              }
1399          } elseif ($charset == 'utf-8')    {
1400              return $this->utf8_substr($string,$start,$len);
1401          } elseif ($this->eucBasedSets[$charset])    {
1402              return $this->euc_substr($string,$start,$charset,$len);
1403          } elseif ($this->twoByteSets[$charset])    {
1404              return substr($string,$start*2,$len*2);
1405          } elseif ($this->fourByteSets[$charset])    {
1406              return substr($string,$start*4,$len*4);
1407          }
1408  
1409          // treat everything else as single-byte encoding
1410          return $len === NULL ? substr($string,$start) : substr($string,$start,$len);
1411      }
1412  
1413      /**
1414       * Counts the number of characters.
1415       * Unit-tested by Kasper (single byte charsets only)
1416       *
1417       * @param    string        The character set
1418       * @param    string        Character string
1419       * @return    integer        The number of characters
1420       * @see strlen()
1421       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
1422       */
1423  	function strlen($charset,$string)    {
1424          if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')    {
1425              return mb_strlen($string,$charset);
1426          } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv')    {
1427              return iconv_strlen($string,$charset);
1428          } elseif ($charset == 'utf-8')    {
1429              return $this->utf8_strlen($string);
1430          } elseif ($this->eucBasedSets[$charset])    {
1431              return $this->euc_strlen($string,$charset);
1432          } elseif ($this->twoByteSets[$charset])    {
1433              return strlen($string)/2;
1434          } elseif ($this->fourByteSets[$charset])    {
1435              return strlen($string)/4;
1436          }
1437          // treat everything else as single-byte encoding
1438          return strlen($string);
1439      }
1440  
1441      /**
1442       * Truncates a string and pre-/appends a string.
1443       * Unit tested by Kasper
1444       *
1445       * @param    string        The character set
1446       * @param    string        Character string
1447       * @param    integer        Length (in characters)
1448       * @param    string        Crop signifier
1449       * @return    string        The shortened string
1450       * @see substr(), mb_strimwidth()
1451       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
1452       */
1453  	function crop($charset,$string,$len,$crop='')    {
1454          if (intval($len) == 0)    return $string;
1455  
1456          if ($charset == 'utf-8')    {
1457              $i = $this->utf8_char2byte_pos($string,$len);
1458          } elseif ($this->eucBasedSets[$charset])    {
1459              $i = $this->euc_char2byte_pos($string,$len,$charset);
1460          } else {
1461              if ($len > 0)    {
1462                  $i = $len;
1463              } else {
1464                  $i = strlen($string)+$len;
1465                  if ($i<=0)    $i = false;
1466              }
1467          }
1468  
1469          if ($i === false)    {    // $len outside actual string length
1470              return $string;
1471          } else    {
1472              if ($len > 0)    {
1473                  if (strlen($string{$i}))    {
1474                      return substr($string,0,$i).$crop;
1475  
1476                  }
1477              } else {
1478                  if (strlen($string{$i-1}))    {
1479                      return $crop.substr($string,$i);
1480                  }
1481              }
1482  
1483  /*
1484              if (abs($len)<$this->strlen($charset,$string))    {    // Has to use ->strlen() - otherwise multibyte strings ending with a multibyte char will return true here (which is not a catastrophe, but...)
1485                  if ($len > 0)    {
1486                      return substr($string,0,$i).$crop;
1487                  } else {
1488                      return $crop.substr($string,$i);
1489                  }
1490              }
1491  */
1492          }
1493          return $string;
1494      }
1495  
1496      /**
1497       * Cuts a string short at a given byte length.
1498       *
1499       * @param    string        The character set
1500       * @param    string        Character string
1501       * @param    integer        The byte length
1502       * @return    string        The shortened string
1503       * @see mb_strcut()
1504       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
1505       */
1506  	function strtrunc($charset,$string,$len)    {
1507          if ($len <= 0)    return '';
1508  
1509          if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')    {
1510              return mb_strcut($string,0,$len,$charset);
1511          } elseif ($charset == 'utf-8')    {
1512              return $this->utf8_strtrunc($string,$len);
1513          } elseif ($this->eucBasedSets[$charset])    {
1514              return $this->euc_strtrunc($string,$charset);
1515          } elseif ($this->twoByteSets[$charset])    {
1516              if ($len % 2)    $len--;        // don't cut at odd positions
1517          } elseif ($this->fourByteSets[$charset])    {
1518              $x = $len % 4;
1519              $len -= $x;    // realign to position dividable by four
1520          }
1521          // treat everything else as single-byte encoding
1522          return substr($string,0,$len);
1523      }
1524  
1525      /**
1526       * Translates all characters of a string into their respective case values.
1527       * Unlike strtolower() and strtoupper() this method is locale independent.
1528       * Note that the string length may change!
1529       * eg. lower case German �(sharp S) becomes upper case "SS"
1530       * Unit-tested by Kasper
1531       * Real case folding is language dependent, this method ignores this fact.
1532       *
1533       * @param    string        Character set of string
1534       * @param    string        Input string to convert case for
1535       * @param    string        Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
1536       * @return    string        The converted string
1537       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
1538       * @see strtolower(), strtoupper()
1539       */
1540  	function conv_case($charset,$string,$case)    {
1541          if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')    {
1542              if ($case == 'toLower')    {
1543                  $string = mb_strtolower($string,$charset);
1544              } else {
1545                  $string = mb_strtoupper($string,$charset);
1546              }
1547          } elseif ($charset == 'utf-8')    {
1548              $string = $this->utf8_char_mapping($string,'case',$case);
1549          } elseif (isset($this->eucBasedSets[$charset]))    {
1550              $string = $this->euc_char_mapping($string,$charset,'case',$case);
1551          } else {
1552                  // treat everything else as single-byte encoding
1553              $string = $this->sb_char_mapping($string,$charset,'case',$case);
1554          }
1555  
1556          return $string;
1557      }
1558  
1559      /**
1560       * Converts special chars (like ���, umlauts etc) to ascii equivalents (usually double-bytes, like �=> ae etc.)
1561       *
1562       * @param    string        Character set of string
1563       * @param    string        Input string to convert
1564       * @return    string        The converted string
1565       */
1566  	function specCharsToASCII($charset,$string)    {
1567          if ($charset == 'utf-8')    {
1568              $string = $this->utf8_char_mapping($string,'ascii');
1569          } elseif (isset($this->eucBasedSets[$charset]))    {
1570              $string = $this->euc_char_mapping($string,$charset,'ascii');
1571          } else {
1572                  // treat everything else as single-byte encoding
1573              $string = $this->sb_char_mapping($string,$charset,'ascii');
1574          }
1575  
1576          return $string;
1577      }
1578  
1579  
1580  
1581  
1582  
1583  
1584  
1585  
1586  
1587  
1588  
1589  
1590      /********************************************
1591       *
1592       * Internal string operation functions
1593       *
1594       ********************************************/
1595  
1596      /**
1597       * Maps all characters of a string in a single byte charset.
1598       *
1599       * @param    string        the string
1600       * @param    string        the charset
1601       * @param    string        mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
1602       * @param    string        'case': conversion 'toLower' or 'toUpper'
1603       * @return    string        the converted string
1604       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
1605       */
1606  	function sb_char_mapping($str,$charset,$mode,$opt='')    {
1607          switch($mode)    {
1608              case 'case':
1609                  if (!$this->initCaseFolding($charset))    return $str;    // do nothing
1610                  $map =& $this->caseFolding[$charset][$opt];
1611                  break;
1612  
1613              case 'ascii':
1614                  if (!$this->initToASCII($charset))    return $str;    // do nothing
1615                  $map =& $this->toASCII[$charset];
1616                  break;
1617  
1618              default:
1619                  return $str;
1620          }
1621  
1622          $out = '';
1623          for($i=0; strlen($str{$i}); $i++)    {
1624              $c = $str{$i};
1625              if (isset($map[$c]))    {
1626                  $out .= $map[$c];
1627              } else {
1628                  $out .= $c;
1629              }
1630          }
1631  
1632          return $out;
1633      }
1634  
1635  
1636  
1637  
1638  
1639  
1640  
1641  
1642  
1643  
1644      /********************************************
1645       *
1646       * Internal UTF-8 string operation functions
1647       *
1648       ********************************************/
1649  
1650      /**
1651       * Returns a part of a UTF-8 string.
1652       * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
1653       *
1654       * @param    string        UTF-8 string
1655       * @param    integer        Start position (character position)
1656       * @param    integer        Length (in characters)
1657       * @return    string        The substring
1658       * @see substr()
1659       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
1660       */
1661  	function utf8_substr($str,$start,$len=null)    {
1662          if (!strcmp($len,'0'))    return '';
1663  
1664          $byte_start = $this->utf8_char2byte_pos($str,$start);
1665          if ($byte_start === false)    {
1666              if ($start > 0)    {
1667                  return false;    // $start outside string length
1668              } else {
1669                  $start = 0;
1670              }
1671          }
1672  
1673          $str = substr($str,$byte_start);
1674  
1675          if ($len!=null)    {
1676              $byte_end = $this->utf8_char2byte_pos($str,$len);
1677              if ($byte_end === false)    // $len outside actual string length
1678                  return $len<0 ? '' : $str;    // When length is less than zero and exceeds, then we return blank string.
1679              else
1680                  return substr($str,0,$byte_end);
1681          }
1682          else    return $str;
1683      }
1684  
1685      /**
1686       * Counts the number of characters of a string in UTF-8.
1687       * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
1688       *
1689       * @param    string        UTF-8 multibyte character string
1690       * @return    integer        The number of characters
1691       * @see strlen()
1692       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
1693       */
1694  	function utf8_strlen($str)    {
1695          $n=0;
1696          for($i=0; strlen($str{$i}); $i++)    {
1697              $c = ord($str{$i});
1698              if (!($c & 0x80))    // single-byte (0xxxxxx)
1699                  $n++;
1700              elseif (($c & 0xC0) == 0xC0)    // multi-byte starting byte (11xxxxxx)
1701                  $n++;
1702          }
1703          return $n;
1704      }
1705  
1706      /**
1707       * Truncates a string in UTF-8 short at a given byte length.
1708       *
1709       * @param    string        UTF-8 multibyte character string
1710       * @param    integer        the byte length
1711       * @return    string        the shortened string
1712       * @see mb_strcut()
1713       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
1714       */
1715  	function utf8_strtrunc($str,$len)    {
1716          $i = $len-1;
1717          if (ord($str{$i}) & 0x80) { // part of a multibyte sequence
1718              for (; $i>0 && !(ord($str{$i}) & 0x40); $i--)    ;    // find the first byte
1719              if ($i <= 0)    return ''; // sanity check
1720              for ($bc=0, $mbs=ord($str{$i}); $mbs & 0x80; $mbs = $mbs << 1)    $bc++;    // calculate number of bytes
1721              if ($bc+$i > $len)    return substr($str,0,$i);
1722                          // fallthru: multibyte char fits into length
1723          }
1724          return substr($str,0,$len);
1725      }
1726  
1727      /**
1728       * Find position of first occurrence of a string, both arguments are in UTF-8.
1729       *
1730       * @param    string        UTF-8 string to search in
1731       * @param    string        UTF-8 string to search for
1732       * @param    integer        Positition to start the search
1733       * @return    integer        The character position
1734       * @see strpos()
1735       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
1736       */
1737  	function utf8_strpos($haystack,$needle,$offset=0)    {
1738          if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')    {
1739              return mb_strpos($haystack,$needle,$offset,'utf-8');
1740          } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv')    {
1741              return iconv_strpos($haystack,$needle,$offset,'utf-8');
1742          }
1743  
1744          $byte_offset = $this->utf8_char2byte_pos($haystack,$offset);
1745          if ($byte_offset === false)    return false; // offset beyond string length
1746  
1747          $byte_pos = strpos($haystack,$needle,$byte_offset);
1748          if ($byte_pos === false)    return false; // needle not found
1749  
1750          return $this->utf8_byte2char_pos($haystack,$byte_pos);
1751      }
1752  
1753      /**
1754       * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
1755       *
1756       * @param    string        UTF-8 string to search in
1757       * @param    string        UTF-8 character to search for (single character)
1758       * @return    integer        The character position
1759       * @see strrpos()
1760       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
1761       */
1762  	function utf8_strrpos($haystack,$needle)    {
1763          if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')    {
1764              return mb_strrpos($haystack,$needle,'utf-8');
1765          } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv')    {
1766              return iconv_strrpos($haystack,$needle,'utf-8');
1767          }
1768  
1769          $byte_pos = strrpos($haystack,$needle);
1770          if ($byte_pos === false)    return false; // needle not found
1771  
1772          return $this->utf8_byte2char_pos($haystack,$byte_pos);
1773      }
1774  
1775      /**
1776       * Translates a character position into an 'absolute' byte position.
1777       * Unit tested by Kasper.
1778       *
1779       * @param    string        UTF-8 string
1780       * @param    integer        Character position (negative values start from the end)
1781       * @return    integer        Byte position
1782       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
1783       */
1784  	function utf8_char2byte_pos($str,$pos)    {
1785          $n = 0;                // number of characters found
1786          $p = abs($pos);        // number of characters wanted
1787  
1788          if ($pos >= 0)    {
1789              $i = 0;
1790              $d = 1;
1791          } else {
1792              $i = strlen($str)-1;
1793              $d = -1;
1794          }
1795  
1796          for( ; strlen($str{$i}) && $n<$p; $i+=$d)    {
1797              $c = (int)ord($str{$i});
1798              if (!($c & 0x80))    // single-byte (0xxxxxx)
1799                  $n++;
1800              elseif (($c & 0xC0) == 0xC0)    // multi-byte starting byte (11xxxxxx)
1801                  $n++;
1802          }
1803          if (!strlen($str{$i}))    return false; // offset beyond string length
1804  
1805          if ($pos >= 0)    {
1806                  // skip trailing multi-byte data bytes
1807              while ((ord($str{$i}) & 0x80) && !(ord($str{$i}) & 0x40)) { $i++; }
1808          } else {
1809                  // correct offset
1810              $i++;
1811          }
1812  
1813          return $i;
1814      }
1815  
1816      /**
1817       * Translates an 'absolute' byte position into a character position.
1818       * Unit tested by Kasper.
1819       *
1820       * @param    string        UTF-8 string
1821       * @param    integer        byte position
1822       * @return    integer        character position
1823       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
1824       */
1825  	function utf8_byte2char_pos($str,$pos)    {
1826          $n = 0;    // number of characters
1827          for($i=$pos; $i>0; $i--)    {
1828              $c = (int)ord($str{$i});
1829              if (!($c & 0x80))    // single-byte (0xxxxxx)
1830                  $n++;
1831              elseif (($c & 0xC0) == 0xC0)    // multi-byte starting byte (11xxxxxx)
1832                  $n++;
1833          }
1834          if (!strlen($str{$i}))    return false; // offset beyond string length
1835  
1836          return $n;
1837      }
1838  
1839      /**
1840       * Maps all characters of an UTF-8 string.
1841       *
1842       * @param    string        UTF-8 string
1843       * @param    string        mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
1844       * @param    string        'case': conversion 'toLower' or 'toUpper'
1845       * @return    string        the converted string
1846       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
1847       */
1848  	function utf8_char_mapping($str,$mode,$opt='')    {
1849          if (!$this->initUnicodeData($mode))    return $str;    // do nothing
1850  
1851          $out = '';
1852          switch($mode)    {
1853              case 'case':
1854                  $map =& $this->caseFolding['utf-8'][$opt];
1855                  break;
1856  
1857              case 'ascii':
1858                  $map =& $this->toASCII['utf-8'];
1859                  break;
1860  
1861              default:
1862                  return $str;
1863          }
1864  
1865          for($i=0; strlen($str{$i}); $i++)    {
1866              $c = ord($str{$i});
1867              if (!($c & 0x80))    // single-byte (0xxxxxx)
1868                  $mbc = $str{$i};
1869              elseif (($c & 0xC0) == 0xC0)    {    // multi-byte starting byte (11xxxxxx)
1870                  for ($bc=0; $c & 0x80; $c = $c << 1) { $bc++; }    // calculate number of bytes
1871                  $mbc = substr($str,$i,$bc);
1872                  $i += $bc-1;
1873              }
1874  
1875              if (isset($map[$mbc]))    {
1876                  $out .= $map[$mbc];
1877              } else {
1878                  $out .= $mbc;
1879              }
1880          }
1881  
1882          return $out;
1883      }
1884  
1885  
1886  
1887  
1888  
1889  
1890  
1891  
1892  
1893  
1894  
1895  
1896  
1897  
1898  
1899  
1900  
1901  
1902      /********************************************
1903       *
1904       * Internal EUC string operation functions
1905       *
1906       * Extended Unix Code:
1907       *  ASCII compatible 7bit single bytes chars
1908       *  8bit two byte chars
1909       *
1910       * Shift-JIS is treated as a special case.
1911       *
1912       ********************************************/
1913  
1914      /**
1915       * Cuts a string in the EUC charset family short at a given byte length.
1916       *
1917       * @param    string        EUC multibyte character string
1918       * @param    integer        the byte length
1919       * @param    string        the charset
1920       * @return    string        the shortened string
1921       * @see mb_strcut()
1922       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
1923       */
1924  	function euc_strtrunc($str,$len,$charset)     {
1925          $sjis = ($charset == 'shift_jis');
1926          for ($i=0; strlen($str{$i}) && $i<$len; $i++) {
1927              $c = ord($str{$i});
1928              if ($sjis)    {
1929                  if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))    $i++;    // advance a double-byte char
1930              }
1931              else    {
1932                  if ($c >= 0x80)    $i++;    // advance a double-byte char
1933              }
1934          }
1935          if (!strlen($str{$i}))    return $str;    // string shorter than supplied length
1936  
1937          if ($i>$len)
1938              return substr($str,0,$len-1);    // we ended on a first byte
1939          else
1940              return substr($str,0,$len);
1941          }
1942  
1943      /**
1944       * Returns a part of a string in the EUC charset family.
1945       *
1946       * @param    string        EUC multibyte character string
1947       * @param    integer        start position (character position)
1948       * @param    string        the charset
1949       * @param    integer        length (in characters)
1950       * @return    string        the substring
1951       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
1952       */
1953  	function euc_substr($str,$start,$charset,$len=null)    {
1954          $byte_start = $this->euc_char2byte_pos($str,$start,$charset);
1955          if ($byte_start === false)    return false;    // $start outside string length
1956  
1957          $str = substr($str,$byte_start);
1958  
1959          if ($len!=null)    {
1960              $byte_end = $this->euc_char2byte_pos($str,$len,$charset);
1961              if ($byte_end === false)    // $len outside actual string length
1962                  return $str;
1963              else
1964                  return substr($str,0,$byte_end);
1965          }
1966          else    return $str;
1967      }
1968  
1969      /**
1970       * Counts the number of characters of a string in the EUC charset family.
1971       *
1972       * @param    string        EUC multibyte character string
1973       * @param    string        the charset
1974       * @return    integer        the number of characters
1975       * @see strlen()
1976       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
1977       */
1978  	function euc_strlen($str,$charset)     {
1979          $sjis = ($charset == 'shift_jis');
1980          $n=0;
1981          for ($i=0; strlen($str{$i}); $i++) {
1982              $c = ord($str{$i});
1983              if ($sjis)    {
1984                  if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))    $i++;    // advance a double-byte char
1985              }
1986              else    {
1987                  if ($c >= 0x80)    $i++;    // advance a double-byte char
1988              }
1989  
1990              $n++;
1991          }
1992  
1993          return $n;
1994      }
1995  
1996      /**
1997       * Translates a character position into an 'absolute' byte position.
1998       *
1999       * @param    string        EUC multibyte character string
2000       * @param    integer        character position (negative values start from the end)
2001       * @param    string        the charset
2002       * @return    integer        byte position
2003       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
2004       */
2005  	function euc_char2byte_pos($str,$pos,$charset)    {
2006          $sjis = ($charset == 'shift_jis');
2007          $n = 0; // number of characters seen
2008          $p = abs($pos);    // number of characters wanted
2009  
2010          if ($pos >= 0)    {
2011              $i = 0;
2012              $d = 1;
2013          } else {
2014              $i = strlen($str)-1;
2015              $d = -1;
2016          }
2017  
2018          for ( ; strlen($str{$i}) && $n<$p; $i+=$d) {
2019              $c = ord($str{$i});
2020              if ($sjis)    {
2021                  if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))    $i+=$d;    // advance a double-byte char
2022              }
2023              else    {
2024                  if ($c >= 0x80)    $i+=$d;    // advance a double-byte char
2025              }
2026  
2027              $n++;
2028          }
2029          if (!strlen($str{$i}))    return false; // offset beyond string length
2030  
2031          if ($pos < 0)    $i++;    // correct offset
2032  
2033          return $i;
2034      }
2035  
2036      /**
2037       * Maps all characters of a string in the EUC charset family.
2038       *
2039       * @param    string        EUC multibyte character string
2040       * @param    string        the charset
2041       * @param    string        mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
2042       * @param    string        'case': conversion 'toLower' or 'toUpper'
2043       * @return    string        the converted string
2044       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
2045       */
2046  	function euc_char_mapping($str,$charset,$mode,$opt='')    {
2047          switch($mode)    {
2048              case 'case':
2049                  if (!$this->initCaseFolding($charset))    return $str;    // do nothing
2050                  $map =& $this->caseFolding[$charset][$opt];
2051                  break;
2052  
2053              case 'ascii':
2054                  if (!$this->initToASCII($charset))    return $str;    // do nothing
2055                  $map =& $this->toASCII[$charset];
2056                  break;
2057  
2058              default:
2059                  return $str;
2060          }
2061  
2062          $sjis = ($charset == 'shift_jis');
2063          $out = '';
2064          for($i=0; strlen($str{$i}); $i++)    {
2065              $mbc = $str{$i};
2066              $c = ord($mbc);
2067  
2068              if ($sjis)    {
2069                  if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))    {    // a double-byte char
2070                      $mbc = substr($str,$i,2);
2071                      $i++;
2072                  }
2073              }
2074              else    {
2075                  if ($c >= 0x80)    {    // a double-byte char
2076                      $mbc = substr($str,$i,2);
2077                      $i++;
2078                  }
2079              }
2080  
2081              if (isset($map[$mbc]))    {
2082                  $out .= $map[$mbc];
2083              } else {
2084                  $out .= $mbc;
2085              }
2086          }
2087  
2088          return $out;
2089      }
2090  
2091  }
2092  
// Include an extension class (XCLASS) for this file if one is registered for the current TYPO3_MODE.
// empty() avoids an undefined-index notice when no XCLASS is configured.
if (defined('TYPO3_MODE') && !empty($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']))    {
    include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
}
2096  ?>


Généré le : Sun Nov 25 17:13:16 2007 par Balluche grâce à PHPXref 0.7
  Clicky Web Analytics