DokuWiki 2006-11-06 : /inc/utf8.php source

[Sommaire] [Imprimer]
   1  <?php
   2  /**
   3   * UTF8 helper functions
   4   *
   5   * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
   6   * @author     Andreas Gohr <andi@splitbrain.org>
   7   */
   8  
   /**
    * Detect multibyte string (mbstring) support
    *
    * UTF8_MBSTRING becomes 1 when mb_substr() exists and UTF8_NOMBSTRING has
    * not been defined, and 0 otherwise. A value that was defined before this
    * point is left untouched.
    */
   if(!defined('UTF8_MBSTRING')){
     define('UTF8_MBSTRING', (function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')) ? 1 : 0);
   }

   // let the mb_* functions operate on UTF-8 by default
   if(UTF8_MBSTRING) mb_internal_encoding('UTF-8');
  21  
  22  
  /**
   * URL-encode a filename so it may contain Unicode characters
   *
   * Slashes are kept as they are.
   *
   * With $safe set to true (the default) a string that consists only of
   * ASCII letters, digits and the characters / _ - . % is returned unchanged.
   * This makes it safe to run the function several times on the same string.
   *
   * @author Andreas Gohr <andi@splitbrain.org>
   * @see    urlencode
   * @param  string $file filename to encode
   * @param  bool   $safe skip strings that look already encoded / plain ASCII
   * @return string
   */
  function utf8_encodeFN($file,$safe=true){
    $untouched = $safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#',$file);
    if(!$untouched){
      // encode everything, then restore the path separators
      $file = str_replace('%2F','/',urlencode($file));
    }
    return $file;
  }
  44  
  /**
   * URL-decode a filename
   *
   * Thin wrapper around urldecode(), the counterpart of utf8_encodeFN().
   *
   * @author Andreas Gohr <andi@splitbrain.org>
   * @see    urldecode
   * @param  string $file encoded filename
   * @return string
   */
  function utf8_decodeFN($file){
    return urldecode($file);
  }
  57  
  /**
   * Checks if a string contains 7bit ASCII only
   *
   * The check is done on bytes: any byte above 0x7F makes the string
   * non-ASCII. An empty string counts as ASCII.
   *
   * @author Andreas Gohr <andi@splitbrain.org>
   * @param  string $str
   * @return bool true when no byte of $str is larger than 127
   */
  function utf8_isASCII($str){
    // byte-wise match (no 'u' modifier); replaces the $str{$i} loop, whose
    // curly-brace offset syntax no longer parses on PHP 8
    return (preg_match('/[\x80-\xFF]/', $str) !== 1);
  }
  69  
  /**
   * Strips all highbyte chars
   *
   * Every byte above 0x7F is dropped, so the result is a pure 7bit ASCII
   * string.
   *
   * @author Andreas Gohr <andi@splitbrain.org>
   * @param  string $str
   * @return string
   */
  function utf8_strip($str){
    $ascii = '';
    $len   = strlen($str);   // hoisted: the length never changes in the loop
    for($i=0; $i<$len; $i++){
      // square brackets: the $str{$i} form was removed in PHP 8
      if(ord($str[$i]) < 128){
        $ascii .= $str[$i];
      }
    }
    return $ascii;
  }
  86  
  /**
   * Tries to detect if a string is in Unicode (UTF-8) encoding
   *
   * Only the byte structure is checked: every lead byte has to be followed by
   * the right number of 10bbbbbb continuation bytes. Overlong forms and 5/6
   * byte sequences are accepted.
   *
   * @author <bmorel@ssi.fr>
   * @link   http://www.php.net/manual/en/function.utf8-encode.php
   * @param  string $Str
   * @return bool
   */
  function utf8_check($Str) {
    $len = strlen($Str);
    $pos = 0;
    while ($pos < $len) {
      $lead = ord($Str[$pos]);

      // number of continuation bytes announced by the lead byte
      if ($lead < 0x80)               $follow = 0; // 0bbbbbbb
      elseif (($lead & 0xE0) == 0xC0) $follow = 1; // 110bbbbb
      elseif (($lead & 0xF0) == 0xE0) $follow = 2; // 1110bbbb
      elseif (($lead & 0xF8) == 0xF0) $follow = 3; // 11110bbb
      elseif (($lead & 0xFC) == 0xF8) $follow = 4; // 111110bb
      elseif (($lead & 0xFE) == 0xFC) $follow = 5; // 1111110b
      else return false;                           // matches no model

      for ($k = 0; $k < $follow; $k++) {
        $pos++;
        if ($pos == $len) return false;                        // string ended too early
        if ((ord($Str[$pos]) & 0xC0) != 0x80) return false;    // not 10bbbbbb
      }
      $pos++;
    }
    return true;
  }
 110  
 /**
  * Unicode aware replacement for strlen()
  *
  * utf8_decode() turns every character that does not exist in ISO-8859-1
  * into a single '?'. For counting characters that is good enough, and it
  * is even faster than mb_strlen.
  *
  * @author <chernyshevsky at hotmail dot com>
  * @see    strlen()
  * @see    utf8_decode()
  * @param  string $string
  * @return int number of characters
  */
 function utf8_strlen($string){
   $singlebyte = utf8_decode($string);   // one byte per character now
   return strlen($singlebyte);
 }
 125  
 /**
  * UTF-8 aware alternative to substr
  *
  * Return part of a string given character offset (and optionally length).
  * mb_substr() is used when UTF8_MBSTRING is set; otherwise the substring is
  * cut out with a PCRE pattern using the 'u' modifier.
  *
  * Notes on the PCRE fallback:
  *  - PCRE only supports repetition counts below 65536, so larger offsets and
  *    lengths are expressed as a repeated group of 65535 characters plus a
  *    remainder.
  *  - substr() may return false in some cases (e.g. offset > string length);
  *    like mb_substr() this function returns an empty string instead.
  *  - Counting the characters of the string is relatively expensive and is
  *    only done when needed (negative offset, or a length was given).
  *
  * @author Harry Fuecks <hfuecks@gmail.com>
  * @author Chris Smith <chris@jalakai.co.uk>
  * @param string  $str
  * @param integer $offset number of UTF-8 characters offset (from left)
  * @param integer $length (optional) length in UTF-8 characters from offset
  * @return string the requested part, '' when nothing matches
  */
 function utf8_substr($str, $offset, $length = null) {
     if(UTF8_MBSTRING){
         return ($length === null) ? mb_substr($str, $offset)
                                   : mb_substr($str, $offset, $length);
     }

     // cast parameters once to avoid multiple notices/warnings later on
     $str    = (string)$str;     // E_NOTICE for PHP4 objects, but not PHP5 objects
     $offset = (int)$offset;
     if ($length !== null) $length = (int)$length;

     // trivial cases
     if ($length === 0) return '';
     if ($offset < 0 && $length < 0 && $length < $offset) return '';

     $chars = null;              // character count of $str, computed lazily

     // turn a negative offset into a positive one (a tail anchored pattern
     // would work too, but is horribly slow)
     if ($offset < 0) {
         $chars  = strlen(utf8_decode($str));
         $offset = max(0, $chars + $offset);
     }

     // non-capturing group that skips $offset characters
     $head = '^';
     if ($offset > 0) {
         $blocks = (int)($offset/65535);
         $rest   = $offset%65535;
         $head   = '^(?:'.($blocks ? '(?:.{65535}){'.$blocks.'}' : '').'.{'.$rest.'})';
     }

     // capturing part
     if ($length === null) {
         $tail = '(.*)$';        // everything up to the end
     } else {
         if ($chars === null) $chars = strlen(utf8_decode($str));
         if ($offset > $chars) return '';

         $tail = '';
         if ($length > 0) {
             // never ask for more characters than are left
             $length = min($chars-$offset, $length);

             $blocks = (int)($length/65535);
             $rest   = $length%65535;

             // capture exactly $length characters
             $tail = '('.($blocks ? '(?:.{65535}){'.$blocks.'}' : '').'.{'.$rest.'})';

         } elseif ($length < 0) {
             if ($length < ($offset - $chars)) return '';

             $blocks = (int)((-$length)/65535);
             $rest   = (-$length)%65535;

             // capture everything except -$length characters anchored at the end
             $tail = '(.*)(?:'.($blocks ? '(?:.{65535}){'.$blocks.'}' : '').'.{'.$rest.'})$';
         }
     }

     if (!preg_match('#'.$head.$tail.'#us',$str,$match)) return '';
     return $match[1];
 }
 227  
 /**
  * Unicode aware replacement for substr_replace()
  *
  * $start and $length are counted in characters. Negative values are
  * interpreted as in substr_replace(): a negative $start counts from the end
  * of the string, a negative $length gives the number of characters at the
  * end of the string that are kept. Unlike substr_replace(), $length
  * defaults to 0, i.e. $replacement is inserted at $start.
  *
  * @author Andreas Gohr <andi@splitbrain.org>
  * @see    substr_replace()
  * @param  string $string      original string
  * @param  string $replacement text to put in
  * @param  int    $start       character offset where the replacement begins
  * @param  int    $length      number of characters to replace
  * @return string
  */
 function utf8_substr_replace($string, $replacement, $start , $length=0 ){
   $start  = (int)$start;
   $length = (int)$length;

   // the character count is only needed to resolve negative arguments
   if($start < 0 || $length < 0){
     $strlen = utf8_strlen($string);
     if($start < 0)       $start  = max(0, $strlen + $start);
     if($start > $strlen) $start  = $strlen;
     if($length < 0)      $length = max(0, $strlen - $start + $length);
   }

   $ret = '';
   if($start>0) $ret .= utf8_substr($string, 0, $start);
   $ret .= $replacement;
   $ret .= utf8_substr($string, $start+$length);
   return $ret;
 }
 241  
 /**
  * Unicode aware replacement for explode
  *
  * Splits $str at every occurrence of $sep using a UTF-8 aware pattern.
  * An empty separator raises an E_USER_WARNING and yields FALSE.
  *
  * @TODO   support third limit arg
  * @author Harry Fuecks <hfuecks@gmail.com>
  * @see    explode();
  * @param  string $sep separator
  * @param  string $str string to split
  * @return mixed array of parts or FALSE on an empty separator
  */
 function utf8_explode($sep, $str) {
   if ( $sep != '' ) {
     $pattern = '!'.preg_quote($sep,'!').'!u';
     return preg_split($pattern,$str);
   }

   trigger_error('Empty delimiter',E_USER_WARNING);
   return FALSE;
 }
 257  
 /**
  * Unicode aware replacement for str_replace()
  *
  * The search string(s) are turned into UTF-8 aware patterns and handed to
  * preg_replace().
  *
  * @todo   support PHP5 count (fourth arg)
  * @author Harry Fuecks <hfuecks@gmail.com>
  * @see    str_replace();
  * @param  mixed  $s   search string or array of search strings
  * @param  mixed  $r   replacement string or array of replacements
  * @param  mixed  $str subject
  * @return mixed  result of preg_replace()
  */
 function utf8_str_replace($s,$r,$str){
   if(!is_array($s)){
     $s = '!'.preg_quote($s,'!').'!u';
   }else{
     foreach ($s as $k => $v) {
       // the delimiter has to be quoted here as well, otherwise a search
       // string containing '!' terminates the pattern early
       $s[$k] = '!'.preg_quote($v,'!').'!u';
     }
   }
   return preg_replace($s,$r,$str);
 }
 275  
 /**
  * Unicode aware replacement for ltrim()
  *
  * Without a $charlist the native ltrim() is used. Otherwise every leading
  * character that is part of $charlist is removed; $charlist is taken
  * literally (no ranges).
  *
  * @author Andreas Gohr <andi@splitbrain.org>
  * @see    ltrim()
  * @param  string $str
  * @param  string $charlist characters to strip
  * @return string
  */
 function utf8_ltrim($str,$charlist=''){
   if($charlist == '') return ltrim($str);

   // quote charlist for use in a character class: \ - ] [ ^ are special
   // inside the class and / is the pattern delimiter. The replacement is a
   // backslash followed by the matched character (the former replacement
   // also appended a stray '}' which then got trimmed as well).
   $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\\$1',$charlist);

   return preg_replace('/^['.$charlist.']+/u','',$str);
 }
 291  
 /**
  * Unicode aware replacement for rtrim()
  *
  * Without a $charlist the native rtrim() is used. Otherwise every trailing
  * character that is part of $charlist is removed; $charlist is taken
  * literally (no ranges).
  *
  * @author Andreas Gohr <andi@splitbrain.org>
  * @see    rtrim()
  * @param  string $str
  * @param  string $charlist characters to strip
  * @return string
  */
 function  utf8_rtrim($str,$charlist=''){
   if($charlist == '') return rtrim($str);

   // quote charlist for use in a character class: \ - ] [ ^ are special
   // inside the class and / is the pattern delimiter. The replacement is a
   // backslash followed by the matched character (the former replacement
   // also appended a stray '}' which then got trimmed as well).
   $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\\$1',$charlist);

   return preg_replace('/['.$charlist.']+$/u','',$str);
 }
 307  
 /**
  * Unicode aware replacement for trim()
  *
  * Without a $charlist the native trim() is used. Otherwise the characters
  * of $charlist are stripped from both ends of the string.
  *
  * @author Andreas Gohr <andi@splitbrain.org>
  * @see    trim()
  * @param  string $str
  * @param  string $charlist characters to strip
  * @return string
  */
 function  utf8_trim($str,$charlist='') {
   if($charlist == '') return trim($str);

   // the charlist has to be handed on, otherwise both helpers fall back to
   // plain whitespace trimming
   return utf8_ltrim(utf8_rtrim($str,$charlist),$charlist);
 }
 320  
 321  
 /**
  * This is a unicode aware replacement for strtolower()
  *
  * Uses mb_string extension if available. Otherwise the string is converted
  * to code points and mapped through the global $UTF8_UPPER_TO_LOWER table.
  *
  * @author Andreas Gohr <andi@splitbrain.org>
  * @see    strtolower()
  * @see    utf8_strtoupper()
  * @param  string $string
  * @return string
  */
 function utf8_strtolower($string){
   if(UTF8_MBSTRING) return mb_strtolower($string,'utf-8');

   global $UTF8_UPPER_TO_LOWER;
   $uni = utf8_to_unicode($string);
   $cnt = count($uni);
   for ($i=0; $i < $cnt; $i++){
     // isset() avoids an undefined index notice for every character that
     // has no entry in the table
     if(isset($UTF8_UPPER_TO_LOWER[$uni[$i]])){
       $uni[$i] = $UTF8_UPPER_TO_LOWER[$uni[$i]];
     }
   }
   return unicode_to_utf8($uni);
 }
 344  
 /**
  * This is a unicode aware replacement for strtoupper()
  *
  * Uses mb_string extension if available. Otherwise the string is converted
  * to code points and mapped through the global $UTF8_LOWER_TO_UPPER table.
  *
  * @author Andreas Gohr <andi@splitbrain.org>
  * @see    strtoupper()
  * @see    utf8_strtolower()
  * @param  string $string
  * @return string
  */
 function utf8_strtoupper($string){
   if(UTF8_MBSTRING) return mb_strtoupper($string,'utf-8');

   global $UTF8_LOWER_TO_UPPER;
   $uni = utf8_to_unicode($string);
   $cnt = count($uni);
   for ($i=0; $i < $cnt; $i++){
     // isset() avoids an undefined index notice for every character that
     // has no entry in the table
     if(isset($UTF8_LOWER_TO_UPPER[$uni[$i]])){
       $uni[$i] = $UTF8_LOWER_TO_UPPER[$uni[$i]];
     }
   }
   return unicode_to_utf8($uni);
 }
 367  
 /**
  * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
  *
  * The replacements come from the global tables $UTF8_LOWER_ACCENTS and
  * $UTF8_UPPER_ACCENTS. $case selects which of them are applied:
  * -1 only lower case letters, 1 only upper case letters, 0 (default) both.
  *
  * @author Andreas Gohr <andi@splitbrain.org>
  * @param  string $string
  * @param  int    $case -1, 0 or 1
  * @return string
  */
 function utf8_deaccent($string,$case=0){
   if($case <= 0){
     global $UTF8_LOWER_ACCENTS;
     $accented = array_keys($UTF8_LOWER_ACCENTS);
     $plain    = array_values($UTF8_LOWER_ACCENTS);
     $string   = str_replace($accented,$plain,$string);
   }
   if($case >= 0){
     global $UTF8_UPPER_ACCENTS;
     $accented = array_keys($UTF8_UPPER_ACCENTS);
     $plain    = array_values($UTF8_UPPER_ACCENTS);
     $string   = str_replace($accented,$plain,$string);
   }
   return $string;
 }
 387  
 /**
  * Romanize a non-latin string
  *
  * Pure ASCII input is returned as is; everything else is translated with
  * strtr() using the global $UTF8_ROMANIZATION table.
  *
  * @author Andreas Gohr <andi@splitbrain.org>
  * @param  string $string
  * @return string
  */
 function utf8_romanize($string){
   if(!utf8_isASCII($string)){
     global $UTF8_ROMANIZATION;
     $string = strtr($string,$UTF8_ROMANIZATION);
   }
   return $string;
 }
 399  
 /**
  * Removes special characters (nonalphanumeric) from a UTF-8 string
  *
  * The characters to strip are taken from the global $UTF8_SPECIAL_CHARS2
  * string; the bytes 0x00 to 0x19 are always stripped in addition (they are
  * not part of that list). The quoted list is cached in a static variable
  * after the first call.
  *
  * @author Andreas Gohr <andi@splitbrain.org>
  * @param  string $string     The UTF8 string to strip of special chars
  * @param  string $repl       Replace special with this string
  * @param  string $additional Additional chars to strip (used in regexp char class)
  * @return string
  */
 function utf8_stripspecials($string,$repl='',$additional=''){
   global $UTF8_SPECIAL_CHARS;
   global $UTF8_SPECIAL_CHARS2;

   static $specials = null;
   if($specials === null){
     $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
   }

   // $additional goes into the character class unquoted
   $pattern = '/['.$additional.'\x00-\x19'.$specials.']/u';
   return preg_replace($pattern,$repl,$string);
 }
 423  
 /**
  * This is an Unicode aware replacement for strpos
  *
  * Uses mb_string extension if available. The fallback splits the haystack
  * at the needle and counts the characters in front of the first hit; a
  * non-zero offset is handled by cutting the haystack first and searching
  * the remainder.
  *
  * @author Harry Fuecks <hfuecks@gmail.com>
  * @see    strpos()
  * @param  string $haystack
  * @param  string $needle
  * @param  int    $offset character offset to start searching from
  * @return mixed character position of the first hit or false
  */
 function utf8_strpos($haystack, $needle,$offset=0) {
   if(UTF8_MBSTRING) return mb_strpos($haystack,$needle,$offset,'utf-8');

   if($offset){
     if ( !is_int($offset) ) {
       trigger_error('Offset must be an integer',E_USER_WARNING);
       return false;
     }

     // search the remainder and shift the hit back by the offset
     $pos = utf8_strpos(utf8_substr($haystack, $offset),$needle);
     return ($pos === false) ? false : $pos + $offset;
   }

   // more than one part means the needle occurred at least once
   $parts = utf8_explode($needle, $haystack);
   return (count($parts) > 1) ? utf8_strlen($parts[0]) : false;
 }
 455  
 /**
  * Encodes UTF-8 characters to HTML entities
  *
  * Every well-formed 2, 3 or 4 byte sequence is replaced by a decimal
  * numeric entity (&#NNN;). ASCII bytes are copied as they are, and so are
  * bytes that do not belong to a complete sequence.
  *
  * @author <vpribish at shopping dot com>
  * @link   http://www.php.net/manual/en/function.utf8-decode.php
  * @param  string $str UTF-8 encoded string
  * @return string
  */
 function utf8_tohtml ($str) {
   $ret = '';
   $max = strlen($str);
   $i   = 0;
   while ($i < $max) {
     $lead = ord($str[$i]);

     // number of continuation bytes and payload bits of the lead byte
     if (($lead & 0xE0) == 0xC0)     { $n = 1; $cp = $lead & 0x1F; }
     elseif (($lead & 0xF0) == 0xE0) { $n = 2; $cp = $lead & 0x0F; }
     elseif (($lead & 0xF8) == 0xF0) { $n = 3; $cp = $lead & 0x07; }
     else                            { $n = 0; $cp = 0; }

     // all continuation bytes must exist and look like 10xxxxxx
     $ok = ($n > 0) && ($i + $n < $max);
     for ($j = 1; $ok && $j <= $n; $j++) {
       $cont = ord($str[$i + $j]);
       if (($cont & 0xC0) != 0x80) {
         $ok = false;
       } else {
         $cp = ($cp << 6) | ($cont & 0x3F);
       }
     }

     if ($ok) {
       $ret .= '&#'.$cp.';';
       $i   += $n + 1;
     } else {
       // ASCII, or a byte that is not part of a complete sequence
       $ret .= $str[$i];
       $i++;
     }
   }
   return $ret;
 }
 482  
 /**
  * Takes an UTF-8 string and returns an array of ints representing the
  * Unicode characters. Astral planes are supported ie. the ints in the
  * output can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
  * are not allowed.
  *
  * If $strict is set to true the function returns false if the input
  * string isn't a valid UTF-8 octet sequence and raises a PHP error at
  * level E_USER_WARNING. This includes a multi-octet sequence that is cut
  * off by the end of the string.
  *
  * Without $strict, illegal lead or continuation octets are skipped and
  * illegal code points (overlong forms, surrogates, out of range values)
  * are passed through to the output.
  *
  * Note: this function has been modified slightly in this library to
  * trigger errors on encountering bad bytes
  *
  * @author <hsivonen@iki.fi>
  * @author Harry Fuecks <hfuecks@gmail.com>
  * @param  string  $str    UTF-8 encoded string
  * @param  boolean $strict Check for invalid sequences?
  * @return mixed array of unicode code points or FALSE if UTF-8 invalid
  * @see    unicode_to_utf8
  * @link   http://hsivonen.iki.fi/php-utf8/
  * @link   http://sourceforge.net/projects/phputf8/
  */
 function utf8_to_unicode($str,$strict=false) {
     $mState = 0;     // number of octets still expected after the current one
     $mUcs4  = 0;     // code point that is being assembled
     $mBytes = 1;     // total number of octets of the current sequence

     $out = array();

     $len = strlen($str);

     for($i = 0; $i < $len; $i++) {

         // square brackets: the $str{$i} form was removed in PHP 8
         $in = ord($str[$i]);

         if ( $mState == 0) {

             // Expect either a US-ASCII character or the first octet of a
             // multi-octet sequence.
             if (0 == (0x80 & $in)) {
                 // US-ASCII, pass straight through.
                 $out[] = $in;
                 $mBytes = 1;

             } else if (0xC0 == (0xE0 & $in)) {
                 // First octet of 2 octet sequence
                 $mUcs4  = ($in & 0x1F) << 6;
                 $mState = 1;
                 $mBytes = 2;

             } else if (0xE0 == (0xF0 & $in)) {
                 // First octet of 3 octet sequence
                 $mUcs4  = ($in & 0x0F) << 12;
                 $mState = 2;
                 $mBytes = 3;

             } else if (0xF0 == (0xF8 & $in)) {
                 // First octet of 4 octet sequence
                 $mUcs4  = ($in & 0x07) << 18;
                 $mState = 3;
                 $mBytes = 4;

             } else if (0xF8 == (0xFC & $in)) {
                 /* First octet of 5 octet sequence.
                  *
                  * This is illegal because the encoded codepoint must be either
                  * (a) not the shortest form or
                  * (b) outside the Unicode range of 0-0x10FFFF.
                  * Rather than trying to resynchronize, carry on until the end
                  * of the sequence and let the later checks catch it.
                  */
                 $mUcs4  = ($in & 0x03) << 24;
                 $mState = 4;
                 $mBytes = 5;

             } else if (0xFC == (0xFE & $in)) {
                 // First octet of 6 octet sequence, see 5 octet sequence.
                 $mUcs4  = ($in & 1) << 30;
                 $mState = 5;
                 $mBytes = 6;

             } elseif($strict) {
                 /* Current octet is neither in the US-ASCII range nor a legal
                  * first octet of a multi-octet sequence.
                  */
                 trigger_error(
                         'utf8_to_unicode: Illegal sequence identifier '.
                             'in UTF-8 at byte '.$i,
                         E_USER_WARNING
                     );
                 return FALSE;

             }

         } else {

             // Inside a multi-octet sequence a continuation octet is expected
             if (0x80 == (0xC0 & $in)) {

                 // Legal continuation: merge its six payload bits
                 $shift  = ($mState - 1) * 6;
                 $mUcs4 |= ($in & 0x0000003F) << $shift;

                 // Last octet of the sequence: mUcs4 is the final code point
                 if (0 == --$mState) {

                     $illegal =
                         // From Unicode 3.1, non-shortest form is illegal
                         ((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                         ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                         ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                         (4 < $mBytes) ||
                         // From Unicode 3.2, surrogate characters are illegal
                         (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                         // Codepoints outside the Unicode range are illegal
                         ($mUcs4 > 0x10FFFF);

                     if ($illegal && $strict) {
                         trigger_error(
                                 'utf8_to_unicode: Illegal sequence or codepoint '.
                                     'in UTF-8 at byte '.$i,
                                 E_USER_WARNING
                             );

                         return FALSE;
                     }

                     if (0xFEFF != $mUcs4) {
                         // BOM is legal but we don't want to output it
                         $out[] = $mUcs4;
                     }

                     // reset the decoder for the next character
                     $mState = 0;
                     $mUcs4  = 0;
                     $mBytes = 1;
                 }

             } elseif($strict) {
                 // A continuation octet was expected but something else came
                 trigger_error(
                         'utf8_to_unicode: Incomplete multi-octet '.
                         '   sequence in UTF-8 at byte '.$i,
                         E_USER_WARNING
                     );

                 return FALSE;
             }
         }
     }

     // the string ended in the middle of a multi-octet sequence
     if ($strict && $mState != 0) {
         trigger_error(
                 'utf8_to_unicode: Incomplete multi-octet '.
                 'sequence in UTF-8 at end of string',
                 E_USER_WARNING
             );

         return FALSE;
     }

     return $out;
 }
 654  
 /**
  * Takes an array of ints representing the Unicode characters and returns
  * a UTF-8 string. Astral planes are supported ie. the ints in the
  * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
  * are not allowed.
  *
  * If $strict is set to true the function returns false if the input
  * array contains ints that represent surrogates or are outside the
  * Unicode range and raises a PHP error at level E_USER_WARNING. Without
  * $strict such values are silently skipped.
  *
  * The result is built by plain string concatenation, so no output buffer
  * is left open when the function bails out early.
  *
  * @param  array   $arr    unicode code points representing a string
  * @param  boolean $strict Check for invalid sequences?
  * @return mixed UTF-8 string or FALSE if array contains invalid code points
  * @author <hsivonen@iki.fi>
  * @author Harry Fuecks <hfuecks@gmail.com>
  * @see    utf8_to_unicode
  * @link   http://hsivonen.iki.fi/php-utf8/
  * @link   http://sourceforge.net/projects/phputf8/
  */
 function unicode_to_utf8($arr,$strict=false) {
     if (!is_array($arr)) return '';

     $utf8 = '';

     foreach ($arr as $k => $cp) {

         # Outside the Unicode range (negative values included)
         if ($cp < 0 || $cp > 0x10ffff) {

             if($strict){
                 trigger_error(
                     'unicode_to_utf8: Codepoint out of Unicode range '.
                         'at index: '.$k.', value: '.$cp,
                     E_USER_WARNING
                     );
                 return FALSE;
             }

         # ASCII range (including control chars)
         } else if ($cp <= 0x007f) {

             $utf8 .= chr($cp);

         # 2 byte sequence
         } else if ($cp <= 0x07ff) {

             $utf8 .= chr(0xc0 | ($cp >> 6));
             $utf8 .= chr(0x80 | ($cp & 0x003f));

         # Byte order mark (skip)
         } else if ($cp == 0xFEFF) {

             // nop -- zap the BOM

         # Test for illegal surrogates
         } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {

             // found a surrogate
             if($strict){
                 trigger_error(
                     'unicode_to_utf8: Illegal surrogate '.
                         'at index: '.$k.', value: '.$cp,
                     E_USER_WARNING
                     );
                 return FALSE;
             }

         # 3 byte sequence
         } else if ($cp <= 0xffff) {

             $utf8 .= chr(0xe0 | ($cp >> 12));
             $utf8 .= chr(0x80 | (($cp >> 6) & 0x003f));
             $utf8 .= chr(0x80 | ($cp & 0x003f));

         # 4 byte sequence
         } else {

             $utf8 .= chr(0xf0 | ($cp >> 18));
             $utf8 .= chr(0x80 | (($cp >> 12) & 0x3f));
             $utf8 .= chr(0x80 | (($cp >> 6) & 0x3f));
             $utf8 .= chr(0x80 | ($cp & 0x3f));
         }
     }

     return $utf8;
 }
 745  
 /**
  * UTF-8 to UTF-16BE conversion.
  *
  * Uses mb_convert_encoding() when UTF8_MBSTRING is set. The fallback
  * decodes the string with utf8_to_unicode() and writes code points above
  * U+FFFF as surrogate pairs.
  *
  * @param  string $str UTF-8 encoded string (taken by reference, not modified)
  * @param  bool   $bom prepend the big endian byte order mark?
  * @return string
  */
 function utf8_to_utf16be(&$str, $bom = false) {
   $out = $bom ? "\xFE\xFF" : '';
   if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');

   $uni = utf8_to_unicode($str);
   foreach($uni as $cp){
     if($cp > 0xFFFF && $cp <= 0x10FFFF){
       // astral plane: split into high and low surrogate
       $cp  -= 0x10000;
       $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
     }else{
       $out .= pack('n',$cp);
     }
   }
   return $out;
 }
 761  
 /**
  * UTF-16BE to UTF-8 conversion.
  *
  * The string is read as big endian 16 bit units. A high surrogate followed
  * by a low surrogate is combined into one code point; the resulting code
  * points are encoded with unicode_to_utf8().
  *
  * @param  string $str UTF-16BE encoded string (taken by reference, not modified)
  * @return string
  */
 function utf16be_to_utf8(&$str) {
   $units = unpack('n*',$str);
   if(!is_array($units)) return '';

   $uni  = array();
   $high = 0;                  // pending high surrogate, 0 if none
   foreach($units as $unit){
     if($high){
       if($unit >= 0xDC00 && $unit <= 0xDFFF){
         // complete pair
         $uni[] = 0x10000 + (($high - 0xD800) << 10) + ($unit - 0xDC00);
         $high  = 0;
         continue;
       }
       // unpaired high surrogate: hand it on unchanged
       $uni[] = $high;
       $high  = 0;
     }

     if($unit >= 0xD800 && $unit <= 0xDBFF){
       $high = $unit;
     }else{
       $uni[] = $unit;
     }
   }
   if($high) $uni[] = $high;

   return unicode_to_utf8($uni);
 }
 771  
 /**
  * Replace bad bytes with an alternative character
  *
  * The string is consumed from the left, one match at a time: a well-formed
  * UTF-8 character is copied, any other single byte is replaced by $replace.
  * An ASCII character is recommended for the replacement.
  *
  * PCRE Pattern to locate bad bytes in a UTF-8 string
  * Comes from W3 FAQ: Multilingual Forms
  * Note: modified to include full ASCII range including control chars
  *
  * @author Harry Fuecks <hfuecks@gmail.com>
  * @see http://www.w3.org/International/questions/qa-forms-utf-8
  * @param string $str     string to search
  * @param string $replace string to replace bad bytes with (defaults to '',
  *                        i.e. bad bytes are removed) - use ASCII
  * @return string
  */
 function utf8_bad_replace($str, $replace = '') {
     $UTF8_BAD =
      '([\x00-\x7F]'.                          # ASCII (including control chars)
      '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
      '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
      '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
      '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
      '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
      '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
      '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
      '|(.{1}))';                              # invalid byte
     $pattern = '/'.$UTF8_BAD.'/S';

     $clean = '';
     while (preg_match($pattern, $str, $matches)) {
         // the second group only exists when the "invalid byte" branch hit
         $clean .= isset($matches[2]) ? $replace : $matches[0];
         $str = substr($str,strlen($matches[0]));
     }
     return $clean;
 }
 811  
 /**
  * adjust a byte index into a utf8 string to a utf8 character boundary
  *
  * The index is moved for as long as it points at a continuation byte
  * (10xxxxxx). Indexes at or below 0 give 0, indexes at or beyond the end of
  * the string give the string length.
  *
  * @param $str   string   utf8 character string
  * @param $i     int      byte index into $str
  * @param $next  bool     direction to search for boundary,
  *                           false = up (current character)
  *                           true = down (next character)
  *
  * @return int            byte index into $str now pointing to a utf8 character boundary
  *
  * @author       chris smith <chris@jalakai.co.uk>
  */
 function utf8_correctIdx(&$str,$i,$next=false) {

   if ($i <= 0) return 0;

   $limit = strlen($str);
   if ($i>=$limit) return $limit;

   // walk forward to the next character or back to the start of this one
   $step = $next ? 1 : -1;
   while (($i > 0) && ($i < $limit) && ((ord($str[$i]) & 0xC0) == 0x80)) {
     $i += $step;
   }

   return $i;
 }
 840  
 841  // only needed if no mb_string available
 842  if(!UTF8_MBSTRING){
 843  
 844    /**
 845     * UTF-8 Case lookup table
 846     *
 847     * This lookup table maps lower case letters to their corresponding
 848     * upper case letters, both given as Unicode code points
 849     *
 850     * @author Andreas Gohr <andi@splitbrain.org>
 851     */
 852    global $UTF8_LOWER_TO_UPPER;
 853    $UTF8_LOWER_TO_UPPER = array(
 854      0x0061=>0x0041, 0x03C6=>0x03A6, 0x0163=>0x0162, 0x00E5=>0x00C5, 0x0062=>0x0042,
 855      0x013A=>0x0139, 0x00E1=>0x00C1, 0x0142=>0x0141, 0x03CD=>0x038E, 0x0101=>0x0100,
 856      0x0491=>0x0490, 0x03B4=>0x0394, 0x015B=>0x015A, 0x0064=>0x0044, 0x03B3=>0x0393,
 857      0x00F4=>0x00D4, 0x044A=>0x042A, 0x0439=>0x0419, 0x0113=>0x0112, 0x043C=>0x041C,
 858      0x015F=>0x015E, 0x0144=>0x0143, 0x00EE=>0x00CE, 0x045E=>0x040E, 0x044F=>0x042F,
 859      0x03BA=>0x039A, 0x0155=>0x0154, 0x0069=>0x0049, 0x0073=>0x0053, 0x1E1F=>0x1E1E,
 860      0x0135=>0x0134, 0x0447=>0x0427, 0x03C0=>0x03A0, 0x0438=>0x0418, 0x00F3=>0x00D3,
 861      0x0440=>0x0420, 0x0454=>0x0404, 0x0435=>0x0415, 0x0449=>0x0429, 0x014B=>0x014A,
 862      0x0431=>0x0411, 0x0459=>0x0409, 0x1E03=>0x1E02, 0x00F6=>0x00D6, 0x00F9=>0x00D9,
 863      0x006E=>0x004E, 0x0451=>0x0401, 0x03C4=>0x03A4, 0x0443=>0x0423, 0x015D=>0x015C,
 864      0x0453=>0x0403, 0x03C8=>0x03A8, 0x0159=>0x0158, 0x0067=>0x0047, 0x00E4=>0x00C4,
 865      0x03AC=>0x0386, 0x03AE=>0x0389, 0x0167=>0x0166, 0x03BE=>0x039E, 0x0165=>0x0164,
 866      0x0117=>0x0116, 0x0109=>0x0108, 0x0076=>0x0056, 0x00FE=>0x00DE, 0x0157=>0x0156,
 867      0x00FA=>0x00DA, 0x1E61=>0x1E60, 0x1E83=>0x1E82, 0x00E2=>0x00C2, 0x0119=>0x0118,
 868      0x0146=>0x0145, 0x0070=>0x0050, 0x0151=>0x0150, 0x044E=>0x042E, 0x0129=>0x0128,
 869      0x03C7=>0x03A7, 0x013E=>0x013D, 0x0442=>0x0422, 0x007A=>0x005A, 0x0448=>0x0428,
 870      0x03C1=>0x03A1, 0x1E81=>0x1E80, 0x016D=>0x016C, 0x00F5=>0x00D5, 0x0075=>0x0055,
 871      0x0177=>0x0176, 0x00FC=>0x00DC, 0x1E57=>0x1E56, 0x03C3=>0x03A3, 0x043A=>0x041A,
 872      0x006D=>0x004D, 0x016B=>0x016A, 0x0171=>0x0170, 0x0444=>0x0424, 0x00EC=>0x00CC,
 873      0x0169=>0x0168, 0x03BF=>0x039F, 0x006B=>0x004B, 0x00F2=>0x00D2, 0x00E0=>0x00C0,
 874      0x0434=>0x0414, 0x03C9=>0x03A9, 0x1E6B=>0x1E6A, 0x00E3=>0x00C3, 0x044D=>0x042D,
 875      0x0436=>0x0416, 0x01A1=>0x01A0, 0x010D=>0x010C, 0x011D=>0x011C, 0x00F0=>0x00D0,
 876      0x013C=>0x013B, 0x045F=>0x040F, 0x045A=>0x040A, 0x00E8=>0x00C8, 0x03C5=>0x03A5,
 877      0x0066=>0x0046, 0x00FD=>0x00DD, 0x0063=>0x0043, 0x021B=>0x021A, 0x00EA=>0x00CA,
 878      0x03B9=>0x0399, 0x017A=>0x0179, 0x00EF=>0x00CF, 0x01B0=>0x01AF, 0x0065=>0x0045,
 879      0x03BB=>0x039B, 0x03B8=>0x0398, 0x03BC=>0x039C, 0x045C=>0x040C, 0x043F=>0x041F,
 880      0x044C=>0x042C, 0x00FE=>0x00DE, 0x00F0=>0x00D0, 0x1EF3=>0x1EF2, 0x0068=>0x0048,
 881      0x00EB=>0x00CB, 0x0111=>0x0110, 0x0433=>0x0413, 0x012F=>0x012E, 0x00E6=>0x00C6,
 882      0x0078=>0x0058, 0x0161=>0x0160, 0x016F=>0x016E, 0x03B1=>0x0391, 0x0457=>0x0407,
 883      0x0173=>0x0172, 0x00FF=>0x0178, 0x006F=>0x004F, 0x043B=>0x041B, 0x03B5=>0x0395,
 884      0x0445=>0x0425, 0x0121=>0x0120, 0x017E=>0x017D, 0x017C=>0x017B, 0x03B6=>0x0396,
 885      0x03B2=>0x0392, 0x03AD=>0x0388, 0x1E85=>0x1E84, 0x0175=>0x0174, 0x0071=>0x0051,
 886      0x0437=>0x0417, 0x1E0B=>0x1E0A, 0x0148=>0x0147, 0x0105=>0x0104, 0x0458=>0x0408,
 887      0x014D=>0x014C, 0x00ED=>0x00CD, 0x0079=>0x0059, 0x010B=>0x010A, 0x03CE=>0x038F,
 888      0x0072=>0x0052, 0x0430=>0x0410, 0x0455=>0x0405, 0x0452=>0x0402, 0x0127=>0x0126,
 889      0x0137=>0x0136, 0x012B=>0x012A, 0x03AF=>0x038A, 0x044B=>0x042B, 0x006C=>0x004C,
 890      0x03B7=>0x0397, 0x0125=>0x0124, 0x0219=>0x0218, 0x00FB=>0x00DB, 0x011F=>0x011E,
 891      0x043E=>0x041E, 0x1E41=>0x1E40, 0x03BD=>0x039D, 0x0107=>0x0106, 0x03CB=>0x03AB,
 892      0x0446=>0x0426, 0x00FE=>0x00DE, 0x00E7=>0x00C7, 0x03CA=>0x03AA, 0x0441=>0x0421,
 893      0x0432=>0x0412, 0x010F=>0x010E, 0x00F8=>0x00D8, 0x0077=>0x0057, 0x011B=>0x011A,
 894      0x0074=>0x0054, 0x006A=>0x004A, 0x045B=>0x040B, 0x0456=>0x0406, 0x0103=>0x0102,
 895      0x03BB=>0x039B, 0x00F1=>0x00D1, 0x043D=>0x041D, 0x03CC=>0x038C, 0x00E9=>0x00C9,
 896      0x00F0=>0x00D0, 0x0457=>0x0407, 0x0123=>0x0122,
 897    );
 898  
 899    /**
 900     * UTF-8 Case lookup table
 901     *
 902     * This lookuptable defines the lower case letters to their correspponding
 903     * upper case letter in UTF-8 (it does so by flipping $UTF8_LOWER_TO_UPPER)
 904     *
 905     * @author Andreas Gohr <andi@splitbrain.org>
 906     */
 907    global $UTF8_UPPER_TO_LOWER;
 908    $UTF8_UPPER_TO_LOWER = @array_flip($UTF8_LOWER_TO_UPPER);
 909  
 910  } // end of case lookup tables
 911  
 912  
 913  /**
 914   * UTF-8 lookup table for lower case accented letters
 915   *
 916   * This lookup table defines ASCII-7 replacements for accented characters.
 917   * These are lower case letters only.
 918   *
 919   * @author Andreas Gohr <andi@splitbrain.org>
 920   * @see    utf8_deaccent()
 921   */
 922  global $UTF8_LOWER_ACCENTS;
 923  $UTF8_LOWER_ACCENTS = array(
 924    'Ã ' => 'a', 'Ã´' => 'o', 'Ä' => 'd', 'á¸Ÿ' => 'f', 'Ã«' => 'e', 'Å¡' => 's', 'Æ¡' => 'o',
 925    'ÃŸ' => 'ss', 'Äƒ' => 'a', 'Å™' => 'r', 'È›' => 't', 'Åˆ' => 'n', 'Ä' => 'a', 'Ä·' => 'k',
 926    'Å' => 's', 'á»³' => 'y', 'Å†' => 'n', 'Äº' => 'l', 'Ä§' => 'h', 'á¹—' => 'p', 'Ã³' => 'o',
 927    'Ãº' => 'u', 'Ä›' => 'e', 'Ã©' => 'e', 'Ã§' => 'c', 'áº' => 'w', 'Ä‹' => 'c', 'Ãµ' => 'o',
 928    'á¹¡' => 's', 'Ã¸' => 'o', 'Ä£' => 'g', 'Å§' => 't', 'È™' => 's', 'Ä—' => 'e', 'Ä‰' => 'c',
 929    'Å›' => 's', 'Ã®' => 'i', 'Å±' => 'u', 'Ä‡' => 'c', 'Ä™' => 'e', 'Åµ' => 'w', 'á¹«' => 't',
 930    'Å«' => 'u', 'Ä' => 'c', 'Ã¶' => 'oe', 'Ã¨' => 'e', 'Å·' => 'y', 'Ä…' => 'a', 'Å‚' => 'l',
 931    'Å³' => 'u', 'Å¯' => 'u', 'ÅŸ' => 's', 'ÄŸ' => 'g', 'Ä¼' => 'l', 'Æ’' => 'f', 'Å¾' => 'z',
 932    'áºƒ' => 'w', 'á¸ƒ' => 'b', 'Ã¥' => 'a', 'Ã¬' => 'i', 'Ã¯' => 'i', 'á¸‹' => 'd', 'Å¥' => 't',
 933    'Å—' => 'r', 'Ã¤' => 'ae', 'Ã' => 'i', 'Å•' => 'r', 'Ãª' => 'e', 'Ã¼' => 'ue', 'Ã²' => 'o',
 934    'Ä“' => 'e', 'Ã±' => 'n', 'Å„' => 'n', 'Ä¥' => 'h', 'Ä' => 'g', 'Ä‘' => 'd', 'Äµ' => 'j',
 935    'Ã¿' => 'y', 'Å©' => 'u', 'Å' => 'u', 'Æ°' => 'u', 'Å£' => 't', 'Ã½' => 'y', 'Å‘' => 'o',
 936    'Ã¢' => 'a', 'Ä¾' => 'l', 'áº…' => 'w', 'Å¼' => 'z', 'Ä«' => 'i', 'Ã£' => 'a', 'Ä¡' => 'g',
 937    'á¹' => 'm', 'Å' => 'o', 'Ä©' => 'i', 'Ã¹' => 'u', 'Ä¯' => 'i', 'Åº' => 'z', 'Ã¡' => 'a',
 938    'Ã»' => 'u', 'Ã¾' => 'th', 'Ã°' => 'dh', 'Ã¦' => 'ae', 'Âµ' => 'u', 'Ä•' => 'e', 
 939  );
 940  
 941  /**
 942   * UTF-8 lookup table for upper case accented letters
 943   *
 944   * This lookup table defines ASCII-7 replacements for accented characters.
 945   * These are upper case letters only.
 946   *
 947   * @author Andreas Gohr <andi@splitbrain.org>
 948   * @see    utf8_deaccent()
 949   */
 950  global $UTF8_UPPER_ACCENTS;
 951  $UTF8_UPPER_ACCENTS = array(
 952    'Ã€' => 'A', 'Ã”' => 'O', 'ÄŽ' => 'D', 'á¸ž' => 'F', 'Ã‹' => 'E', 'Å ' => 'S', 'Æ ' => 'O',
 953    'Ä‚' => 'A', 'Å˜' => 'R', 'Èš' => 'T', 'Å‡' => 'N', 'Ä€' => 'A', 'Ä¶' => 'K',
 954    'Åœ' => 'S', 'á»²' => 'Y', 'Å…' => 'N', 'Ä¹' => 'L', 'Ä¦' => 'H', 'á¹–' => 'P', 'Ã“' => 'O',
 955    'Ãš' => 'U', 'Äš' => 'E', 'Ã‰' => 'E', 'Ã‡' => 'C', 'áº€' => 'W', 'ÄŠ' => 'C', 'Ã•' => 'O',
 956    'á¹ ' => 'S', 'Ã˜' => 'O', 'Ä¢' => 'G', 'Å¦' => 'T', 'È˜' => 'S', 'Ä–' => 'E', 'Äˆ' => 'C',
 957    'Åš' => 'S', 'ÃŽ' => 'I', 'Å°' => 'U', 'Ä†' => 'C', 'Ä˜' => 'E', 'Å´' => 'W', 'á¹ª' => 'T',
 958    'Åª' => 'U', 'ÄŒ' => 'C', 'Ã–' => 'Oe', 'Ãˆ' => 'E', 'Å¶' => 'Y', 'Ä„' => 'A', 'Å' => 'L',
 959    'Å²' => 'U', 'Å®' => 'U', 'Åž' => 'S', 'Äž' => 'G', 'Ä»' => 'L', 'Æ‘' => 'F', 'Å½' => 'Z',
 960    'áº‚' => 'W', 'á¸‚' => 'B', 'Ã…' => 'A', 'ÃŒ' => 'I', 'Ã' => 'I', 'á¸Š' => 'D', 'Å¤' => 'T',
 961    'Å–' => 'R', 'Ã„' => 'Ae', 'Ã' => 'I', 'Å”' => 'R', 'ÃŠ' => 'E', 'Ãœ' => 'Ue', 'Ã’' => 'O',
 962    'Ä’' => 'E', 'Ã‘' => 'N', 'Åƒ' => 'N', 'Ä¤' => 'H', 'Äœ' => 'G', 'Ä' => 'D', 'Ä´' => 'J',
 963    'Å¸' => 'Y', 'Å¨' => 'U', 'Å¬' => 'U', 'Æ¯' => 'U', 'Å¢' => 'T', 'Ã' => 'Y', 'Å' => 'O',
 964    'Ã‚' => 'A', 'Ä½' => 'L', 'áº„' => 'W', 'Å»' => 'Z', 'Äª' => 'I', 'Ãƒ' => 'A', 'Ä ' => 'G',
 965    'á¹€' => 'M', 'ÅŒ' => 'O', 'Ä¨' => 'I', 'Ã™' => 'U', 'Ä®' => 'I', 'Å¹' => 'Z', 'Ã' => 'A',
 966    'Ã›' => 'U', 'Ãž' => 'Th', 'Ã' => 'Dh', 'Ã†' => 'Ae', 'Ä”' => 'E',
 967  );
 968  
 969  /**
 970   * UTF-8 array of common special characters
 971   *
 972   * This array should contain all special characters (not a letter or digit)
 973   * defined in the various local charsets - it's not a complete list of non-alphanum
 974   * characters in UTF-8. It's not perfect but should match most cases of special
 975   * chars.
 976   *
 977   * The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
 978   * These chars are _not_ in the array either:  _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a
 979   *
 980   * @author Andreas Gohr <andi@splitbrain.org>
 981   * @see    utf8_stripspecials()
 982   */
 983  global $UTF8_SPECIAL_CHARS;
 984  $UTF8_SPECIAL_CHARS = array(
 985    0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
 986    0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,         0x002b, 0x002c,
 987            0x002f,         0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
 988    0x005c, 0x005d, 0x005e,         0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
 989    0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
 990    0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
 991    0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
 992    0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
 993    0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
 994    0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
 995    0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
 996    0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
 997    0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
 998    0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
 999    0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
1000    0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
1001    0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
1002    0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
1003    0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
1004    0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
1005    0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
1006    0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
1007    0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
1008    0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
1009    0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
1010    0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
1011    0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
1012    0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
1013    0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
1014    0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
1015    0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
1016    0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
1017    0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
1018    0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
1019    0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
1020    0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
1021    0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
1022    0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
1023    0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
1024    0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
1025    0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
1026    0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
1027    0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
1028    0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
1029    0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
1030    0x27be, 0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
1031    0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
1032    0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
1033    0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
1034    0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
1035  );
1036  
1037  // utf8 version of above data
1038  global $UTF8_SPECIAL_CHARS2;
1039  $UTF8_SPECIAL_CHARS2 = 
1040      ' !"#$%&\'()+,/;<=>?@[\]^`{|}~Â€ÂÂ‚ÂƒÂ„Â…Â†Â‡ÂˆÂ‰ÂŠÂ‹ÂŒÂÂŽÂÂÂ‘Â’Â“Â”Â•ï¿½'.
1041      'ï¿½Â—Â˜Â™ÂšÂ›ÂœÂÂžÂŸ Â¡Â¢Â£Â¤Â¥Â¦Â§Â¨Â©ÂªÂ«Â¬ÂÂ®Â¯Â°Â±Â²Â³Â´ÂµÂ¶Â·Â¸Â¹ÂºÂ»Â¼Â½ï¿½'.
1042      'ï¿½Â¿Ã—Ã·Ë‡Ë˜Ë™ËšË›ËœËÌ€ÌÌƒÌ‰Ì£Î„Î…Î‡Î²Ï†Ï‘Ï’Ï•Ï–Ö°Ö±Ö²Ö³Ö´ÖµÖ¶Ö·Ö¸Ö¹Ö»Ö¼Ö½Ö¾Ö¿ï¿½'.
1043      'ï¿½××‚×ƒ×³×´ØŒØ›ØŸÙ€Ù‹ÙŒÙÙŽÙÙÙ‘Ù’Ùªà¸¿â€Œâ€â€Žâ€â€“â€”â€•â€—â€˜â€™â€šâ€œâ€ï¿½'.
1044      'ï¿½ï¿½â€ â€¡â€¢â€¦â€°â€²â€³â€¹â€ºâ„â‚§â‚ªâ‚«â‚¬â„–â„˜â„¢â„¦â„µâ†â†‘â†’â†“â†”â†•â†µ'.
1045      'â‡â‡‘â‡’â‡“â‡”âˆ€âˆ‚âˆƒâˆ…âˆ†âˆ‡âˆˆâˆ‰âˆ‹âˆâˆ‘âˆ’âˆ•âˆ—âˆ™âˆšâˆâˆžâˆ âˆ§âˆ¨ï¿½'.
1046      'ï¿½âˆªâˆ«âˆ´âˆ¼â‰…â‰ˆâ‰ â‰¡â‰¤â‰¥âŠ‚âŠƒâŠ„âŠ†âŠ‡âŠ•âŠ—âŠ¥â‹…âŒâŒ âŒ¡âŒ©âŒªâ‘©â”€ï¿½'.
1047      'ï¿½ï¿½â”Œâ”â””â”˜â”œâ”¤â”¬â”´â”¼â•â•‘â•’â•“â•”â••â•–â•—â•˜â•™â•šâ•›â•œâ•â•žâ•Ÿâ• '.
1048      'â•¡â•¢â•£â•¤â•¥â•¦â•§â•¨â•©â•ªâ•«â•¬â–€â–„â–ˆâ–Œâ–â–‘â–’â–“â– â–²â–¼â—†â—Šâ—ï¿½'.
1049      'ï¿½â˜…â˜Žâ˜›â˜žâ™ â™£â™¥â™¦âœâœ‚âœƒâœ„âœ†âœ‡âœˆâœ‰âœŒâœâœŽâœâœâœ‘âœ’âœ“âœ”âœ•ï¿½'.
1050      'ï¿½ï¿½âœ—âœ˜âœ™âœšâœ›âœœâœâœžâœŸâœ âœ¡âœ¢âœ£âœ¤âœ¥âœ¦âœ§âœ©âœªâœ«âœ¬âœâœ®âœ¯âœ°âœ±'.
1051      'âœ²âœ³âœ´âœµâœ¶âœ·âœ¸âœ¹âœºâœ»âœ¼âœ½âœ¾âœ¿â€ââ‚âƒâ„â…â†â‡âˆâ‰âŠâ‹ï¿½'.
1052      'ï¿½âââ‘â’â–â˜â™âšâ›âœââžâ¡â¢â£â¤â¥â¦â§â¿âž‰âž“âž”âž˜âž™âžšï¿½'.
1053      'ï¿½ï¿½âžœâžâžžâžŸâž âž¡âž¢âž£âž¤âž¥âž¦âž§âž¨âž©âžªâž«âž¬âžâž®âž¯âž±âž²âž³âž´âžµâž¶'.
1054      'âž·âž¸âž¹âžºâž»âž¼âž½âž¾ï›™ï›šï››ï£—ï£˜ï£™ï£šï£›ï£œï£ï£žï£Ÿï£ ï£¡ï£¢ï££ï£¤ï£¥ï¿½'.
1055      'ï¿½ï£§ï£¨ï£©ï£ªï£«ï£¬ï£ï£®ï£¯ï£°ï£±ï£²ï£³ï£´ï£µï£¶ï£·ï£¸ï£¹ï£ºï£»ï£¼ï£½ï£¾ï¹¼ï¹½';
1056  
1057  /**
1058   * Romanization lookup table
1059   *
1060   * This lookup table provides a way to transform strings written in a language
1061   * that is not based on Latin letters into plain ASCII.
1062   *
1063   * Please note: this is not a scientific transliteration table. It only works
1064   * one way, from non-Latin to ASCII, and it works by simple character replacement
1065   * only. Peculiarities of individual languages are not supported.
1066   *
1067   * @author Andreas Gohr <andi@splitbrain.org>
1068   * @author Vitaly Blokhin <vitinfo@vitn.com>
1069   * @link   http://www.uconv.com/translit.htm
1070   * @author Bisqwit <bisqwit@iki.fi>
1071   * @link   http://kanjidict.stc.cx/hiragana.php?src=2
1072   * @link   http://www.translatum.gr/converter/greek-transliteration.htm
1073   * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
1074   * @link   http://www.btranslations.com/resources/romanization/korean.asp
1075   */
1076  global $UTF8_ROMANIZATION;
1077  $UTF8_ROMANIZATION = array(
1078    //russian cyrillic
1079    'Ð°'=>'a','Ð'=>'A','Ð±'=>'b','Ð‘'=>'B','Ð²'=>'v','Ð’'=>'V','Ð³'=>'g','Ð“'=>'G',
1080    'Ð´'=>'d','Ð”'=>'D','Ðµ'=>'e','Ð•'=>'E','Ñ‘'=>'jo','Ð'=>'Jo','Ð¶'=>'zh','Ð–'=>'Zh',
1081    'Ð·'=>'z','Ð—'=>'Z','Ð¸'=>'i','Ð˜'=>'I','Ð¹'=>'j','Ð™'=>'J','Ðº'=>'k','Ðš'=>'K',
1082    'Ð»'=>'l','Ð›'=>'L','Ð¼'=>'m','Ðœ'=>'M','Ð½'=>'n','Ð'=>'N','Ð¾'=>'o','Ðž'=>'O',
1083    'Ð¿'=>'p','ÐŸ'=>'P','Ñ€'=>'r','Ð '=>'R','Ñ'=>'s','Ð¡'=>'S','Ñ‚'=>'t','Ð¢'=>'T',
1084    'Ñƒ'=>'u','Ð£'=>'U','Ñ„'=>'f','Ð¤'=>'F','Ñ…'=>'x','Ð¥'=>'X','Ñ†'=>'c','Ð¦'=>'C',
1085    'Ñ‡'=>'ch','Ð§'=>'Ch','Ñˆ'=>'sh','Ð¨'=>'Sh','Ñ‰'=>'sch','Ð©'=>'Sch','ÑŠ'=>'',
1086    'Ðª'=>'','Ñ‹'=>'y','Ð«'=>'Y','ÑŒ'=>'','Ð¬'=>'','Ñ'=>'eh','Ð'=>'Eh','ÑŽ'=>'ju',
1087    'Ð®'=>'Ju','Ñ'=>'ja','Ð¯'=>'Ja',
1088    // Ukrainian cyrillic
1089    'Ò'=>'Gh','Ò‘'=>'gh','Ð„'=>'Je','Ñ”'=>'je','Ð†'=>'I','Ñ–'=>'i','Ð‡'=>'Ji','Ñ—'=>'ji',
1090    // Georgian
1091    'áƒ'=>'a','áƒ‘'=>'b','áƒ’'=>'g','áƒ“'=>'d','áƒ”'=>'e','áƒ•'=>'v','áƒ–'=>'z','áƒ—'=>'th',
1092    'áƒ˜'=>'i','áƒ™'=>'p','áƒš'=>'l','áƒ›'=>'m','áƒœ'=>'n','áƒ'=>'o','áƒž'=>'p','áƒŸ'=>'zh',
1093    'áƒ '=>'r','áƒ¡'=>'s','áƒ¢'=>'t','áƒ£'=>'u','áƒ¤'=>'ph','áƒ¥'=>'kh','áƒ¦'=>'gh','áƒ§'=>'q',
1094    'áƒ¨'=>'sh','áƒ©'=>'ch','áƒª'=>'c','áƒ«'=>'dh','áƒ¬'=>'w','áƒ'=>'j','áƒ®'=>'x','áƒ¯'=>'jh',
1095    'áƒ°'=>'xh',
1096    //Sanskrit
1097    'à¤…'=>'a','à¤†'=>'ah','à¤‡'=>'i','à¤ˆ'=>'ih','à¤‰'=>'u','à¤Š'=>'uh','à¤‹'=>'ry',
1098    'à¥ '=>'ryh','à¤Œ'=>'ly','à¥¡'=>'lyh','à¤'=>'e','à¤'=>'ay','à¤“'=>'o','à¤”'=>'aw',
1099    'à¤…à¤‚'=>'amh','à¤…à¤ƒ'=>'aq','à¤•'=>'k','à¤–'=>'kh','à¤—'=>'g','à¤˜'=>'gh','à¤™'=>'nh',
1100    'à¤š'=>'c','à¤›'=>'ch','à¤œ'=>'j','à¤'=>'jh','à¤ž'=>'ny','à¤Ÿ'=>'tq','à¤ '=>'tqh',
1101    'à¤¡'=>'dq','à¤¢'=>'dqh','à¤£'=>'nq','à¤¤'=>'t','à¤¥'=>'th','à¤¦'=>'d','à¤§'=>'dh',
1102    'à¤¨'=>'n','à¤ª'=>'p','à¤«'=>'ph','à¤¬'=>'b','à¤'=>'bh','à¤®'=>'m','à¤¯'=>'z','à¤°'=>'r',
1103    'à¤²'=>'l','à¤µ'=>'v','à¤¶'=>'sh','à¤·'=>'sqh','à¤¸'=>'s','à¤¹'=>'x',
1104    //Hebrew
1105    '×'=>'a', '×‘'=>'b','×’'=>'g','×“'=>'d','×”'=>'h','×•'=>'v','×–'=>'z','×—'=>'kh','×˜'=>'th',
1106    '×™'=>'y','×š'=>'h','×›'=>'k','×œ'=>'l','×'=>'m','×ž'=>'m','×Ÿ'=>'n','× '=>'n',
1107    '×¡'=>'s','×¢'=>'ah','×£'=>'f','×¤'=>'p','×¥'=>'c','×¦'=>'c','×§'=>'q','×¨'=>'r',
1108    '×©'=>'sh','×ª'=>'t',
1109    //Arabic
1110    'Ø§'=>'a','Ø¨'=>'b','Øª'=>'t','Ø«'=>'th','Ø¬'=>'g','Ø'=>'xh','Ø®'=>'x','Ø¯'=>'d',
1111    'Ø°'=>'dh','Ø±'=>'r','Ø²'=>'z','Ø³'=>'s','Ø´'=>'sh','Øµ'=>'s\'','Ø¶'=>'d\'',
1112    'Ø·'=>'t\'','Ø¸'=>'z\'','Ø¹'=>'y','Øº'=>'gh','Ù'=>'f','Ù‚'=>'q','Ùƒ'=>'k',
1113    'Ù„'=>'l','Ù…'=>'m','Ù†'=>'n','Ù‡'=>'x\'','Ùˆ'=>'u','ÙŠ'=>'i',
1114  
1115    // Japanese hiragana
1116    'ã‚'=>'a','ãˆ'=>'e','ã„'=>'i','ãŠ'=>'o','ã†'=>'u','ã°'=>'ba','ã¹'=>'be',
1117    'ã³'=>'bi','ã¼'=>'bo','ã¶'=>'bu','ã—'=>'ci','ã '=>'da','ã§'=>'de','ã¢'=>'di',
1118    'ã©'=>'do','ã¥'=>'du','ãµã'=>'fa','ãµã‡'=>'fe','ãµãƒ'=>'fi','ãµã‰'=>'fo',
1119    'ãµ'=>'fu','ãŒ'=>'ga','ã’'=>'ge','ãŽ'=>'gi','ã”'=>'go','ã'=>'gu','ã¯'=>'ha',
1120    'ã¸'=>'he','ã²'=>'hi','ã»'=>'ho','ãµ'=>'hu','ã˜ã‚ƒ'=>'ja','ã˜ã‡'=>'je',
1121    'ã˜'=>'ji','ã˜ã‚‡'=>'jo','ã˜ã‚…'=>'ju','ã‹'=>'ka','ã‘'=>'ke','ã'=>'ki',
1122    'ã“'=>'ko','ã'=>'ku','ã‚‰'=>'la','ã‚Œ'=>'le','ã‚Š'=>'li','ã‚'=>'lo','ã‚‹'=>'lu',
1123    'ã¾'=>'ma','ã‚'=>'me','ã¿'=>'mi','ã‚‚'=>'mo','ã‚€'=>'mu','ãª'=>'na','ã'=>'ne',
1124    'ã«'=>'ni','ã®'=>'no','ã¬'=>'nu','ã±'=>'pa','ãº'=>'pe','ã´'=>'pi','ã½'=>'po',
1125    'ã·'=>'pu','ã‚‰'=>'ra','ã‚Œ'=>'re','ã‚Š'=>'ri','ã‚'=>'ro','ã‚‹'=>'ru','ã•'=>'sa',
1126    'ã›'=>'se','ã—'=>'si','ã'=>'so','ã™'=>'su','ãŸ'=>'ta','ã¦'=>'te','ã¡'=>'ti',
1127    'ã¨'=>'to','ã¤'=>'tu','ãƒ´ã'=>'va','ãƒ´ã‡'=>'ve','ãƒ´ãƒ'=>'vi','ãƒ´ã‰'=>'vo',
1128    'ãƒ´'=>'vu','ã‚'=>'wa','ã†ã‡'=>'we','ã†ãƒ'=>'wi','ã‚’'=>'wo','ã‚„'=>'ya','ã„ã‡'=>'ye',
1129    'ã„'=>'yi','ã‚ˆ'=>'yo','ã‚†'=>'yu','ã–'=>'za','ãœ'=>'ze','ã˜'=>'zi','ãž'=>'zo',
1130    'ãš'=>'zu','ã³ã‚ƒ'=>'bya','ã³ã‡'=>'bye','ã³ãƒ'=>'byi','ã³ã‚‡'=>'byo','ã³ã‚…'=>'byu',
1131    'ã¡ã‚ƒ'=>'cha','ã¡ã‡'=>'che','ã¡'=>'chi','ã¡ã‚‡'=>'cho','ã¡ã‚…'=>'chu','ã¡ã‚ƒ'=>'cya',
1132    'ã¡ã‡'=>'cye','ã¡ãƒ'=>'cyi','ã¡ã‚‡'=>'cyo','ã¡ã‚…'=>'cyu','ã§ã‚ƒ'=>'dha','ã§ã‡'=>'dhe',
1133    'ã§ãƒ'=>'dhi','ã§ã‚‡'=>'dho','ã§ã‚…'=>'dhu','ã©ã'=>'dwa','ã©ã‡'=>'dwe','ã©ãƒ'=>'dwi',
1134    'ã©ã‰'=>'dwo','ã©ã…'=>'dwu','ã¢ã‚ƒ'=>'dya','ã¢ã‡'=>'dye','ã¢ãƒ'=>'dyi','ã¢ã‚‡'=>'dyo',
1135    'ã¢ã‚…'=>'dyu','ã¢'=>'dzi','ãµã'=>'fwa','ãµã‡'=>'fwe','ãµãƒ'=>'fwi','ãµã‰'=>'fwo',
1136    'ãµã…'=>'fwu','ãµã‚ƒ'=>'fya','ãµã‡'=>'fye','ãµãƒ'=>'fyi','ãµã‚‡'=>'fyo','ãµã‚…'=>'fyu',
1137    'ãŽã‚ƒ'=>'gya','ãŽã‡'=>'gye','ãŽãƒ'=>'gyi','ãŽã‚‡'=>'gyo','ãŽã‚…'=>'gyu','ã²ã‚ƒ'=>'hya',
1138    'ã²ã‡'=>'hye','ã²ãƒ'=>'hyi','ã²ã‚‡'=>'hyo','ã²ã‚…'=>'hyu','ã˜ã‚ƒ'=>'jya','ã˜ã‡'=>'jye',
1139    'ã˜ãƒ'=>'jyi','ã˜ã‚‡'=>'jyo','ã˜ã‚…'=>'jyu','ãã‚ƒ'=>'kya','ãã‡'=>'kye','ããƒ'=>'kyi',
1140    'ãã‚‡'=>'kyo','ãã‚…'=>'kyu','ã‚Šã‚ƒ'=>'lya','ã‚Šã‡'=>'lye','ã‚Šãƒ'=>'lyi','ã‚Šã‚‡'=>'lyo',
1141    'ã‚Šã‚…'=>'lyu','ã¿ã‚ƒ'=>'mya','ã¿ã‡'=>'mye','ã¿ãƒ'=>'myi','ã¿ã‚‡'=>'myo','ã¿ã‚…'=>'myu',
1142    'ã‚“'=>'n','ã«ã‚ƒ'=>'nya','ã«ã‡'=>'nye','ã«ãƒ'=>'nyi','ã«ã‚‡'=>'nyo','ã«ã‚…'=>'nyu',
1143    'ã´ã‚ƒ'=>'pya','ã´ã‡'=>'pye','ã´ãƒ'=>'pyi','ã´ã‚‡'=>'pyo','ã´ã‚…'=>'pyu','ã‚Šã‚ƒ'=>'rya',
1144    'ã‚Šã‡'=>'rye','ã‚Šãƒ'=>'ryi','ã‚Šã‚‡'=>'ryo','ã‚Šã‚…'=>'ryu','ã—ã‚ƒ'=>'sha','ã—ã‡'=>'she',
1145    'ã—'=>'shi','ã—ã‚‡'=>'sho','ã—ã‚…'=>'shu','ã™ã'=>'swa','ã™ã‡'=>'swe','ã™ãƒ'=>'swi',
1146    'ã™ã‰'=>'swo','ã™ã…'=>'swu','ã—ã‚ƒ'=>'sya','ã—ã‡'=>'sye','ã—ãƒ'=>'syi','ã—ã‚‡'=>'syo',
1147    'ã—ã‚…'=>'syu','ã¦ã‚ƒ'=>'tha','ã¦ã‡'=>'the','ã¦ãƒ'=>'thi','ã¦ã‚‡'=>'tho','ã¦ã‚…'=>'thu',
1148    'ã¤ã‚ƒ'=>'tsa','ã¤ã‡'=>'tse','ã¤ãƒ'=>'tsi','ã¤ã‚‡'=>'tso','ã¤'=>'tsu','ã¨ã'=>'twa',
1149    'ã¨ã‡'=>'twe','ã¨ãƒ'=>'twi','ã¨ã‰'=>'two','ã¨ã…'=>'twu','ã¡ã‚ƒ'=>'tya','ã¡ã‡'=>'tye',
1150    'ã¡ãƒ'=>'tyi','ã¡ã‚‡'=>'tyo','ã¡ã‚…'=>'tyu','ãƒ´ã‚ƒ'=>'vya','ãƒ´ã‡'=>'vye','ãƒ´ãƒ'=>'vyi',
1151    'ãƒ´ã‚‡'=>'vyo','ãƒ´ã‚…'=>'vyu','ã†ã'=>'wha','ã†ã‡'=>'whe','ã†ãƒ'=>'whi','ã†ã‰'=>'who',
1152    'ã†ã…'=>'whu','ã‚‘'=>'wye','ã‚'=>'wyi','ã˜ã‚ƒ'=>'zha','ã˜ã‡'=>'zhe','ã˜ãƒ'=>'zhi',
1153    'ã˜ã‚‡'=>'zho','ã˜ã‚…'=>'zhu','ã˜ã‚ƒ'=>'zya','ã˜ã‡'=>'zye','ã˜ãƒ'=>'zyi','ã˜ã‚‡'=>'zyo',
1154    'ã˜ã‚…'=>'zyu',
1155    // Japanese katakana
1156    'ã‚¢'=>'a','ã‚¨'=>'e','ã‚¤'=>'i','ã‚ª'=>'o','ã‚¦'=>'u','ãƒ'=>'ba','ãƒ™'=>'be','ãƒ“'=>'bi',
1157    'ãƒœ'=>'bo','ãƒ–'=>'bu','ã‚·'=>'ci','ãƒ€'=>'da','ãƒ‡'=>'de','ãƒ‚'=>'di','ãƒ‰'=>'do',
1158    'ãƒ…'=>'du','ãƒ•ã‚¡'=>'fa','ãƒ•ã‚§'=>'fe','ãƒ•ã‚£'=>'fi','ãƒ•ã‚©'=>'fo','ãƒ•'=>'fu','ã‚¬'=>'ga',
1159    'ã‚²'=>'ge','ã‚®'=>'gi','ã‚´'=>'go','ã‚°'=>'gu','ãƒ'=>'ha','ãƒ˜'=>'he','ãƒ’'=>'hi','ãƒ›'=>'ho',
1160    'ãƒ•'=>'hu','ã‚¸ãƒ£'=>'ja','ã‚¸ã‚§'=>'je','ã‚¸'=>'ji','ã‚¸ãƒ§'=>'jo','ã‚¸ãƒ¥'=>'ju','ã‚«'=>'ka',
1161    'ã‚±'=>'ke','ã‚'=>'ki','ã‚³'=>'ko','ã‚¯'=>'ku','ãƒ©'=>'la','ãƒ¬'=>'le','ãƒª'=>'li','ãƒ'=>'lo',
1162    'ãƒ«'=>'lu','ãƒž'=>'ma','ãƒ¡'=>'me','ãƒŸ'=>'mi','ãƒ¢'=>'mo','ãƒ '=>'mu','ãƒŠ'=>'na','ãƒ'=>'ne',
1163    'ãƒ‹'=>'ni','ãƒŽ'=>'no','ãƒŒ'=>'nu','ãƒ‘'=>'pa','ãƒš'=>'pe','ãƒ”'=>'pi','ãƒ'=>'po','ãƒ—'=>'pu',
1164    'ãƒ©'=>'ra','ãƒ¬'=>'re','ãƒª'=>'ri','ãƒ'=>'ro','ãƒ«'=>'ru','ã‚µ'=>'sa','ã‚»'=>'se','ã‚·'=>'si',
1165    'ã‚½'=>'so','ã‚¹'=>'su','ã‚¿'=>'ta','ãƒ†'=>'te','ãƒ'=>'ti','ãƒˆ'=>'to','ãƒ„'=>'tu','ãƒ´ã‚¡'=>'va',
1166    'ãƒ´ã‚§'=>'ve','ãƒ´ã‚£'=>'vi','ãƒ´ã‚©'=>'vo','ãƒ´'=>'vu','ãƒ¯'=>'wa','ã‚¦ã‚§'=>'we','ã‚¦ã‚£'=>'wi',
1167    'ãƒ²'=>'wo','ãƒ¤'=>'ya','ã‚¤ã‚§'=>'ye','ã‚¤'=>'yi','ãƒ¨'=>'yo','ãƒ¦'=>'yu','ã‚¶'=>'za','ã‚¼'=>'ze',
1168    'ã‚¸'=>'zi','ã‚¾'=>'zo','ã‚º'=>'zu','ãƒ“ãƒ£'=>'bya','ãƒ“ã‚§'=>'bye','ãƒ“ã‚£'=>'byi','ãƒ“ãƒ§'=>'byo',
1169    'ãƒ“ãƒ¥'=>'byu','ãƒãƒ£'=>'cha','ãƒã‚§'=>'che','ãƒ'=>'chi','ãƒãƒ§'=>'cho','ãƒãƒ¥'=>'chu',
1170    'ãƒãƒ£'=>'cya','ãƒã‚§'=>'cye','ãƒã‚£'=>'cyi','ãƒãƒ§'=>'cyo','ãƒãƒ¥'=>'cyu','ãƒ‡ãƒ£'=>'dha',
1171    'ãƒ‡ã‚§'=>'dhe','ãƒ‡ã‚£'=>'dhi','ãƒ‡ãƒ§'=>'dho','ãƒ‡ãƒ¥'=>'dhu','ãƒ‰ã‚¡'=>'dwa','ãƒ‰ã‚§'=>'dwe',
1172    'ãƒ‰ã‚£'=>'dwi','ãƒ‰ã‚©'=>'dwo','ãƒ‰ã‚¥'=>'dwu','ãƒ‚ãƒ£'=>'dya','ãƒ‚ã‚§'=>'dye','ãƒ‚ã‚£'=>'dyi',
1173    'ãƒ‚ãƒ§'=>'dyo','ãƒ‚ãƒ¥'=>'dyu','ãƒ‚'=>'dzi','ãƒ•ã‚¡'=>'fwa','ãƒ•ã‚§'=>'fwe','ãƒ•ã‚£'=>'fwi',
1174    'ãƒ•ã‚©'=>'fwo','ãƒ•ã‚¥'=>'fwu','ãƒ•ãƒ£'=>'fya','ãƒ•ã‚§'=>'fye','ãƒ•ã‚£'=>'fyi','ãƒ•ãƒ§'=>'fyo',
1175    'ãƒ•ãƒ¥'=>'fyu','ã‚®ãƒ£'=>'gya','ã‚®ã‚§'=>'gye','ã‚®ã‚£'=>'gyi','ã‚®ãƒ§'=>'gyo','ã‚®ãƒ¥'=>'gyu',
1176    'ãƒ’ãƒ£'=>'hya','ãƒ’ã‚§'=>'hye','ãƒ’ã‚£'=>'hyi','ãƒ’ãƒ§'=>'hyo','ãƒ’ãƒ¥'=>'hyu','ã‚¸ãƒ£'=>'jya',
1177    'ã‚¸ã‚§'=>'jye','ã‚¸ã‚£'=>'jyi','ã‚¸ãƒ§'=>'jyo','ã‚¸ãƒ¥'=>'jyu','ã‚ãƒ£'=>'kya','ã‚ã‚§'=>'kye',
1178    'ã‚ã‚£'=>'kyi','ã‚ãƒ§'=>'kyo','ã‚ãƒ¥'=>'kyu','ãƒªãƒ£'=>'lya','ãƒªã‚§'=>'lye','ãƒªã‚£'=>'lyi',
1179    'ãƒªãƒ§'=>'lyo','ãƒªãƒ¥'=>'lyu','ãƒŸãƒ£'=>'mya','ãƒŸã‚§'=>'mye','ãƒŸã‚£'=>'myi','ãƒŸãƒ§'=>'myo',
1180    'ãƒŸãƒ¥'=>'myu','ãƒ³'=>'n','ãƒ‹ãƒ£'=>'nya','ãƒ‹ã‚§'=>'nye','ãƒ‹ã‚£'=>'nyi','ãƒ‹ãƒ§'=>'nyo',
1181    'ãƒ‹ãƒ¥'=>'nyu','ãƒ”ãƒ£'=>'pya','ãƒ”ã‚§'=>'pye','ãƒ”ã‚£'=>'pyi','ãƒ”ãƒ§'=>'pyo','ãƒ”ãƒ¥'=>'pyu',
1182    'ãƒªãƒ£'=>'rya','ãƒªã‚§'=>'rye','ãƒªã‚£'=>'ryi','ãƒªãƒ§'=>'ryo','ãƒªãƒ¥'=>'ryu','ã‚·ãƒ£'=>'sha',
1183    'ã‚·ã‚§'=>'she','ã‚·'=>'shi','ã‚·ãƒ§'=>'sho','ã‚·ãƒ¥'=>'shu','ã‚¹ã‚¡'=>'swa','ã‚¹ã‚§'=>'swe',
1184    'ã‚¹ã‚£'=>'swi','ã‚¹ã‚©'=>'swo','ã‚¹ã‚¥'=>'swu','ã‚·ãƒ£'=>'sya','ã‚·ã‚§'=>'sye','ã‚·ã‚£'=>'syi',
1185    'ã‚·ãƒ§'=>'syo','ã‚·ãƒ¥'=>'syu','ãƒ†ãƒ£'=>'tha','ãƒ†ã‚§'=>'the','ãƒ†ã‚£'=>'thi','ãƒ†ãƒ§'=>'tho',
1186    'ãƒ†ãƒ¥'=>'thu','ãƒ„ãƒ£'=>'tsa','ãƒ„ã‚§'=>'tse','ãƒ„ã‚£'=>'tsi','ãƒ„ãƒ§'=>'tso','ãƒ„'=>'tsu',
1187    'ãƒˆã‚¡'=>'twa','ãƒˆã‚§'=>'twe','ãƒˆã‚£'=>'twi','ãƒˆã‚©'=>'two','ãƒˆã‚¥'=>'twu','ãƒãƒ£'=>'tya',
1188    'ãƒã‚§'=>'tye','ãƒã‚£'=>'tyi','ãƒãƒ§'=>'tyo','ãƒãƒ¥'=>'tyu','ãƒ´ãƒ£'=>'vya','ãƒ´ã‚§'=>'vye',
1189    'ãƒ´ã‚£'=>'vyi','ãƒ´ãƒ§'=>'vyo','ãƒ´ãƒ¥'=>'vyu','ã‚¦ã‚¡'=>'wha','ã‚¦ã‚§'=>'whe','ã‚¦ã‚£'=>'whi',
1190    'ã‚¦ã‚©'=>'who','ã‚¦ã‚¥'=>'whu','ãƒ±'=>'wye','ãƒ°'=>'wyi','ã‚¸ãƒ£'=>'zha','ã‚¸ã‚§'=>'zhe',
1191    'ã‚¸ã‚£'=>'zhi','ã‚¸ãƒ§'=>'zho','ã‚¸ãƒ¥'=>'zhu','ã‚¸ãƒ£'=>'zya','ã‚¸ã‚§'=>'zye','ã‚¸ã‚£'=>'zyi',
1192    'ã‚¸ãƒ§'=>'zyo','ã‚¸ãƒ¥'=>'zyu',
1193  
1194    // "Greeklish"
1195    'Î“'=>'G','Î”'=>'E','Î˜'=>'Th','Î›'=>'L','Îž'=>'X','Î '=>'P','Î£'=>'S','Î¦'=>'F','Î¨'=>'Ps',
1196    'Î³'=>'g','Î´'=>'e','Î¸'=>'th','Î»'=>'l','Î¾'=>'x','Ï€'=>'p','Ïƒ'=>'s','Ï†'=>'f','Ïˆ'=>'ps',
1197  
1198    // Thai
1199    'à¸'=>'k','à¸‚'=>'kh','à¸ƒ'=>'kh','à¸„'=>'kh','à¸…'=>'kh','à¸†'=>'kh','à¸‡'=>'ng','à¸ˆ'=>'ch',
1200    'à¸‰'=>'ch','à¸Š'=>'ch','à¸‹'=>'s','à¸Œ'=>'ch','à¸'=>'y','à¸Ž'=>'d','à¸'=>'t','à¸'=>'th',
1201    'à¸‘'=>'d','à¸’'=>'th','à¸“'=>'n','à¸”'=>'d','à¸•'=>'t','à¸–'=>'th','à¸—'=>'th','à¸˜'=>'th',
1202    'à¸™'=>'n','à¸š'=>'b','à¸›'=>'p','à¸œ'=>'ph','à¸'=>'f','à¸ž'=>'ph','à¸Ÿ'=>'f','à¸ '=>'ph',
1203    'à¸¡'=>'m','à¸¢'=>'y','à¸£'=>'r','à¸¤'=>'rue','à¸¤à¹…'=>'rue','à¸¥'=>'l','à¸¦'=>'lue',
1204    'à¸¦à¹…'=>'lue','à¸§'=>'w','à¸¨'=>'s','à¸©'=>'s','à¸ª'=>'s','à¸«'=>'h','à¸¬'=>'l','à¸®'=>'h',
1205    'à¸°'=>'a','â€“à¸±'=>'a','à¸£à¸£'=>'a','à¸²'=>'a','à¸£à¸£'=>'an','à¸³'=>'am','â€“à¸´'=>'i','â€“à¸µ'=>'i',
1206    'â€“à¸¶'=>'ue','â€“à¸·'=>'ue','â€“à¸¸'=>'u','â€“à¸¹'=>'u','à¹€à¸°'=>'e','à¹€â€“à¹‡'=>'e','à¹€'=>'e','à¹à¸°'=>'ae',
1207    'à¹'=>'ae','à¹‚à¸°'=>'o','à¹‚'=>'o','à¹€à¸²à¸°'=>'o','à¸'=>'o','à¹€à¸à¸°'=>'oe','à¹€â€“à¸´'=>'oe',
1208    'à¹€à¸'=>'oe','à¹€â€“à¸µà¸¢à¸°'=>'ia','à¹€â€“à¸µà¸¢'=>'ia','à¹€â€“à¸·à¸à¸°'=>'uea','à¹€â€“à¸·à¸'=>'uea','â€“à¸±à¸§à¸°'=>'ua',
1209    'â€“à¸±à¸§'=>'ua','à¸§'=>'ua','à¹ƒ'=>'ai','à¹„'=>'ai','â€“à¸±à¸¢'=>'ai','à¹„à¸¢'=>'ai','à¸²à¸¢'=>'ai',
1210    'à¹€à¸²'=>'ao','à¸²à¸§'=>'ao','â€“à¸¸à¸¢'=>'ui','à¹‚à¸¢'=>'oi','à¸à¸¢'=>'oi','à¹€à¸¢'=>'oei','à¹€â€“à¸·à¸à¸¢'=>'ueai',
1211    'à¸§à¸¢'=>'uai','â€“à¸´à¸§'=>'io','à¹€â€“à¹‡à¸§'=>'eo','à¹€à¸§'=>'eo','à¹â€“à¹‡à¸§'=>'aeo','à¹à¸§'=>'aeo',
1212    'à¹€â€“à¸µà¸¢à¸§'=>'iao',
1213  
1214    // Korean
1215    'ã„±'=>'k','ã…‹'=>'kh','ã„²'=>'kk','ã„·'=>'t','ã…Œ'=>'th','ã„¸'=>'tt','ã…‚'=>'p',
1216    'ã…'=>'ph','ã…ƒ'=>'pp','ã…ˆ'=>'c','ã…Š'=>'ch','ã…‰'=>'cc','ã……'=>'s','ã…†'=>'ss',
1217    'ã…Ž'=>'h','ã…‡'=>'ng','ã„´'=>'n','ã„¹'=>'l','ã…'=>'m', 'ã…'=>'a','ã…“'=>'e','ã…—'=>'o',
1218    'ã…œ'=>'wu','ã…¡'=>'u','ã…£'=>'i','ã…'=>'ay','ã…”'=>'ey','ã…š'=>'oy','ã…˜'=>'wa','ã…'=>'we',
1219    'ã…Ÿ'=>'wi','ã…™'=>'way','ã…ž'=>'wey','ã…¢'=>'uy','ã…‘'=>'ya','ã…•'=>'ye','ã…›'=>'oy',
1220    'ã… '=>'yu','ã…’'=>'yay','ã…–'=>'yey',
1221  );
1222  
1223  //Setup VIM: ex: et ts=2 enc=utf-8 :
1224
Code source de DokuWiki 2006-11-06

/inc/ -> utf8.php (source)