b2evolution 2.1.0-beta : /blogs/inc/_ext/idna/_idna

[Sommaire] [Imprimer]
   1  <?php
   2  /**

   3   * @package libs

   4   */
   5  /* ------------------------------------------------------------------------- */

   6  /* idna_convert.class.php - Encode / Decode Internationalized Domain Names   */

   7  /* (c) 2004-2007 phlyLabs, Berlin (http://phlylabs.de)                       */

   8  /* All rights reserved                                                       */

   9  /* v0.5.0                                                                    */

  10  /* ------------------------------------------------------------------------- */

  11  
  12  // {{{ license

  13  
  14  /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4 foldmethod=marker: */

  15  //

  16  // +----------------------------------------------------------------------+

  17  // | This library is free software; you can redistribute it and/or modify |

  18  // | it under the terms of the GNU Lesser General Public License as       |

  19  // | published by the Free Software Foundation; either version 2.1 of the |

  20  // | License, or (at your option) any later version.                      |

  21  // |                                                                      |

  22  // | This library is distributed in the hope that it will be useful, but  |

  23  // | WITHOUT ANY WARRANTY; without even the implied warranty of           |

  24  // | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU    |

  25  // | Lesser General Public License for more details.                      |

  26  // |                                                                      |

  27  // | You should have received a copy of the GNU Lesser General Public     |

  28  // | License along with this library; if not, write to the Free Software  |

  29  // | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 |

  30  // | USA.                                                                 |

  31  // +----------------------------------------------------------------------+

  32  //

  33  
  34  // }}}

  35  
  36  /**

  37   * Encode/decode Internationalized Domain Names.

  38   *

  39   * The class allows to convert internationalized domain names

  40   * (see RFC 3490 for details) as they can be used with various registries worldwide

  41   * to be translated between their original (localized) form and their encoded form

  42   * as it will be used in the DNS (Domain Name System).

  43   *

  44   * The class provides two public methods, encode() and decode(), which do exactly

  45   * what you would expect them to do. You are allowed to use complete domain names,

  46   * simple strings and complete email addresses as well. That means, that you might

  47   * use any of the following notations:

  48   *

  49   * - www.nörgler.com

  50   * - xn--nrgler-wxa

  51   * - xn--brse-5qa.xn--knrz-1ra.info

  52   *

  53   * Unicode input might be given as either UTF-8 string, UCS-4 string or UCS-4

  54   * array. Unicode output is available in the same formats.

  55   * You can select your preferred format via {@link set_parameter()}.

  56   *

  57   * ACE input and output is always expected to be ASCII.

  58   *

  59   * @author  Matthias Sommerfeld <mso@phlylabs.de>

  60   * @version 0.5.0

  61   *

  62   */
  63  if( !defined('EVO_MAIN_INIT') ) die( 'Please, do not access this page directly.' );
  64  
  65  class idna_convert
  66  {
  67      /**

  68       * Holds all relevant mapping tables, loaded from a separate file on construct

  69       * See RFC3454 for details

  70       *

  71       * @var array

  72       * @access private

  73       */
  74      var $NP = array();
  75  
  76      // Internal settings, do not mess with them

  77      var $_punycode_prefix = 'xn--';
  78      var $_invalid_ucs =     0x80000000;
  79      var $_max_ucs =         0x10FFFF;
  80      var $_base =            36;
  81      var $_tmin =            1;
  82      var $_tmax =            26;
  83      var $_skew =            38;
  84      var $_damp =            700;
  85      var $_initial_bias =    72;
  86      var $_initial_n =       0x80;
  87      var $_sbase =           0xAC00;
  88      var $_lbase =           0x1100;
  89      var $_vbase =           0x1161;
  90      var $_tbase =           0x11A7;
  91      var $_lcount =          19;
  92      var $_vcount =          21;
  93      var $_tcount =          28;
  94      var $_ncount =          588;   // _vcount * _tcount

  95      var $_scount =          11172; // _lcount * _tcount * _vcount

  96      var $_error =           false;
  97  
  98      // See {@link set_parameter()} for details of how to change the following

  99      // settings from within your script / application

 100      var $_api_encoding   =  'utf8'; // Default input charset is UTF-8

 101      var $_allow_overlong =  false;  // Overlong UTF-8 encodings are forbidden

 102      var $_strict_mode    =  false;  // Behave strict or not

 103  
 104      // The constructor

 105      function idna_convert($options = false)
 106      {
 107          $this->slast = $this->_sbase + $this->_lcount * $this->_vcount * $this->_tcount;
 108          if (function_exists('file_get_contents')) {
 109              $this->NP = unserialize(file_get_contents(dirname(__FILE__).'/_idna_convert_npdata.ser.inc'));
 110          } else {
 111              $this->NP = unserialize(join('', file(dirname(__FILE__).'/_idna_convert_npdata.ser.inc')));
 112          }
 113          // If parameters are given, pass these to the respective method

 114          if (is_array($options)) {
 115              return $this->set_parameter($options);
 116          }
 117          return true;
 118      }
 119  
 120      /**

 121       * Sets a new option value. Available options and values:

 122       * [encoding - Use either UTF-8, UCS4 as array or UCS4 as string as input ('utf8' for UTF-8,

 123       *         'ucs4_string' and 'ucs4_array' respectively for UCS4); The output is always UTF-8]

 124       * [overlong - Unicode does not allow unnecessarily long encodings of chars,

 125       *             to allow this, set this parameter to true, else to false;

 126       *             default is false.]

 127       * [strict - true: strict mode, good for registration purposes - Causes errors

 128       *           on failures; false: loose mode, ideal for "wildlife" applications

 129       *           by silently ignoring errors and returning the original input instead

 130       *

 131       * @param    mixed     Parameter to set (string: single parameter; array of Parameter => Value pairs)

 132       * @param    string    Value to use (if parameter 1 is a string)

 133       * @return   boolean   true on success, false otherwise

 134       * @access   public

 135       */
 136      function set_parameter($option, $value = false)
 137      {
 138          if (!is_array($option)) {
 139              $option = array($option => $value);
 140          }
 141          foreach ($option as $k => $v) {
 142              switch ($k) {
 143              case 'encoding':
 144                  switch ($v) {
 145                  case 'utf8':
 146                  case 'ucs4_string':
 147                  case 'ucs4_array':
 148                      $this->_api_encoding = $v;
 149                      break;
 150                  default:
 151                      $this->_error('Set Parameter: Unknown parameter '.$v.' for option '.$k);
 152                      return false;
 153                  }
 154                  break;
 155              case 'overlong':
 156                  $this->_allow_overlong = ($v) ? true : false;
 157                  break;
 158              case 'strict':
 159                  $this->_strict_mode = ($v) ? true : false;
 160                  break;
 161              default:
 162                  $this->_error('Set Parameter: Unknown option '.$k);
 163                  return false;
 164              }
 165          }
 166          return true;
 167      }
 168  
 169      /**

 170       * Decode a given ACE domain name

 171       * @param    string   Domain name (ACE string)

 172       * [@param    string   Desired output encoding, see {@link set_parameter}]

 173       * @return   string   Decoded Domain name (UTF-8 or UCS-4)

 174       * @access   public

 175       */
 176      function decode($input, $one_time_encoding = false)
 177      {
 178          // Optionally set

 179          if ($one_time_encoding) {
 180              switch ($one_time_encoding) {
 181              case 'utf8':
 182              case 'ucs4_string':
 183              case 'ucs4_array':
 184                  break;
 185              default:
 186                  $this->_error('Unknown encoding '.$one_time_encoding);
 187                  return false;
 188              }
 189          }
 190          // Make sure to drop any newline characters around

 191          $input = trim($input);
 192  
 193          // Negotiate input and try to determine, whether it is a plain string,

 194          // an email address or something like a complete URL

 195          if (strpos($input, '@')) { // Maybe it is an email address
 196              // No no in strict mode

 197              if ($this->_strict_mode) {
 198                  $this->_error('Only simple domain name parts can be handled in strict mode');
 199                  return false;
 200              }
 201              list ($email_pref, $input) = explode('@', $input, 2);
 202              $arr = explode('.', $input);
 203              foreach ($arr as $k => $v) {
 204                  if (preg_match('!^'.preg_quote($this->_punycode_prefix, '!').'!', $v)) {
 205                      $conv = $this->_decode($v);
 206                      if ($conv) $arr[$k] = $conv;
 207                  }
 208              }
 209              $input = join('.', $arr);
 210              $arr = explode('.', $email_pref);
 211              foreach ($arr as $k => $v) {
 212                  if (preg_match('!^'.preg_quote($this->_punycode_prefix, '!').'!', $v)) {
 213                      $conv = $this->_decode($v);
 214                      if ($conv) $arr[$k] = $conv;
 215                  }
 216              }
 217              $email_pref = join('.', $arr);
 218              $return = $email_pref . '@' . $input;
 219          } elseif (preg_match('![:\./]!', $input)) { // Or a complete domain name (with or without paths / parameters)
 220              // No no in strict mode

 221              if ($this->_strict_mode) {
 222                  $this->_error('Only simple domain name parts can be handled in strict mode');
 223                  return false;
 224              }
 225              $parsed = parse_url($input);
 226              if (isset($parsed['host'])) {
 227                  $arr = explode('.', $parsed['host']);
 228                  foreach ($arr as $k => $v) {
 229                      $conv = $this->_decode($v);
 230                      if ($conv) $arr[$k] = $conv;
 231                  }
 232                  $parsed['host'] = join('.', $arr);
 233                  $return =
 234                          (empty($parsed['scheme']) ? '' : $parsed['scheme'].(strtolower($parsed['scheme']) == 'mailto' ? ':' : '://'))
 235                          .(empty($parsed['user']) ? '' : $parsed['user'].(empty($parsed['pass']) ? '' : ':'.$parsed['pass']).'@')
 236                          .$parsed['host']
 237                          .(empty($parsed['port']) ? '' : ':'.$parsed['port'])
 238                          .(empty($parsed['path']) ? '' : $parsed['path'])
 239                          .(empty($parsed['query']) ? '' : '?'.$parsed['query'])
 240                          .(empty($parsed['fragment']) ? '' : '#'.$parsed['fragment']);
 241              } else { // parse_url seems to have failed, try without it
 242                  $arr = explode('.', $input);
 243                  foreach ($arr as $k => $v) {
 244                      $conv = $this->_decode($v);
 245                      $arr[$k] = ($conv) ? $conv : $v;
 246                  }
 247                  $return = join('.', $arr);
 248              }
 249          } else { // Otherwise we consider it being a pure domain name string
 250              $return = $this->_decode($input);
 251              if (!$return) $return = $input;
 252          }
 253          // The output is UTF-8 by default, other output formats need conversion here

 254          // If one time encoding is given, use this, else the objects property

 255          switch (($one_time_encoding) ? $one_time_encoding : $this->_api_encoding) {
 256          case 'utf8':
 257              return $return;
 258              break;
 259          case 'ucs4_string':
 260             return $this->_ucs4_to_ucs4_string($this->_utf8_to_ucs4($return));
 261             break;
 262          case 'ucs4_array':
 263              return $this->_utf8_to_ucs4($return);
 264              break;
 265          default:
 266              $this->_error('Unsupported output format');
 267              return false;
 268          }
 269      }
 270  
 271      /**

 272       * Encode a given UTF-8 domain name

 273       * @param    string   Domain name (UTF-8 or UCS-4)

 274       * [@param    string   Desired input encoding, see {@link set_parameter}]

 275       * @return   string   Encoded Domain name (ACE string)

 276       * @access   public

 277       */
 278      function encode($decoded, $one_time_encoding = false)
 279      {
 280          // Forcing conversion of input to UCS4 array

 281          // If one time encoding is given, use this, else the objects property

 282          switch ($one_time_encoding ? $one_time_encoding : $this->_api_encoding) {
 283          case 'utf8':
 284              $decoded = $this->_utf8_to_ucs4($decoded);
 285              break;
 286          case 'ucs4_string':
 287             $decoded = $this->_ucs4_string_to_ucs4($decoded);
 288          case 'ucs4_array':
 289             break;
 290          default:
 291              $this->_error('Unsupported input format: '.($one_time_encoding ? $one_time_encoding : $this->_api_encoding));
 292              return false;
 293          }
 294  
 295          // No input, no output, what else did you expect?

 296          if (empty($decoded)) return '';
 297  
 298          // Anchors for iteration

 299          $last_begin = 0;
 300          // Output string

 301          $output = '';
 302          foreach ($decoded as $k => $v) {
 303              // Make sure to use just the plain dot

 304              switch($v) {
 305              case 0x3002:
 306              case 0xFF0E:
 307              case 0xFF61:
 308                  $decoded[$k] = 0x2E;
 309                  // Right, no break here, the above are converted to dots anyway

 310              // Stumbling across an anchoring character

 311              case 0x2E:
 312              case 0x2F:
 313              case 0x3A:
 314              case 0x3F:
 315              case 0x40:
 316                  // Neither email addresses nor URLs allowed in strict mode

 317                  if ($this->_strict_mode) {
 318                     $this->_error('Neither email addresses nor URLs are allowed in strict mode.');
 319                     return false;
 320                  } else {
 321                      // Skip first char

 322                      if ($k) {
 323                          $encoded = '';
 324                          $encoded = $this->_encode(array_slice($decoded, $last_begin, (($k)-$last_begin)));
 325                          if ($encoded) {
 326                              $output .= $encoded;
 327                          } else {
 328                              $output .= $this->_ucs4_to_utf8(array_slice($decoded, $last_begin, (($k)-$last_begin)));
 329                          }
 330                          $output .= chr($decoded[$k]);
 331                      }
 332                      $last_begin = $k + 1;
 333                  }
 334              }
 335          }
 336          // Catch the rest of the string

 337          if ($last_begin) {
 338              $inp_len = sizeof($decoded);
 339              $encoded = '';
 340              $encoded = $this->_encode(array_slice($decoded, $last_begin, (($inp_len)-$last_begin)));
 341              if ($encoded) {
 342                  $output .= $encoded;
 343              } else {
 344                  $output .= $this->_ucs4_to_utf8(array_slice($decoded, $last_begin, (($inp_len)-$last_begin)));
 345              }
 346              return $output;
 347          } else {
 348              if ($output = $this->_encode($decoded)) {
 349                  return $output;
 350              } else {
 351                  return $this->_ucs4_to_utf8($decoded);
 352              }
 353          }
 354      }
 355  
 356      /**

 357       * Use this method to get the last error that occurred

 358       * @param    void

 359       * @return   string   The last error that occurred

 360       * @access   public

 361       */
 362      function get_last_error()
 363      {
 364          return $this->_error;
 365      }
 366  
 367      /**

 368       * The actual decoding algorithm

 369       * @access   private

 370       */
 371      function _decode($encoded)
 372      {
 373          // We do need to find the Punycode prefix

 374          if (!preg_match('!^'.preg_quote($this->_punycode_prefix, '!').'!', $encoded)) {
 375              $this->_error('This is not a punycode string');
 376              return false;
 377          }
 378          $encode_test = preg_replace('!^'.preg_quote($this->_punycode_prefix, '!').'!', '', $encoded);
 379          // If nothing left after removing the prefix, it is hopeless

 380          if (!$encode_test) {
 381              $this->_error('The given encoded string was empty');
 382              return false;
 383          }
 384          // Find last occurence of the delimiter

 385          $delim_pos = strrpos($encoded, '-');
 386          if ($delim_pos > strlen($this->_punycode_prefix)) {
 387              for ($k = strlen($this->_punycode_prefix); $k < $delim_pos; ++$k) {
 388                  $decoded[] = ord($encoded{$k});
 389              }
 390          } else {
 391              $decoded = array();
 392          }
 393          $deco_len = count($decoded);
 394          $enco_len = strlen($encoded);
 395  
 396          // Wandering through the strings; init

 397          $is_first = true;
 398          $bias     = $this->_initial_bias;
 399          $idx      = 0;
 400          $char     = $this->_initial_n;
 401  
 402          for ($enco_idx = ($delim_pos) ? ($delim_pos + 1) : 0; $enco_idx < $enco_len; ++$deco_len) {
 403              for ($old_idx = $idx, $w = 1, $k = $this->_base; 1 ; $k += $this->_base) {
 404                  $digit = $this->_decode_digit($encoded{$enco_idx++});
 405                  $idx += $digit * $w;
 406                  $t = ($k <= $bias) ? $this->_tmin :
 407                          (($k >= $bias + $this->_tmax) ? $this->_tmax : ($k - $bias));
 408                  if ($digit < $t) break;
 409                  $w = (int) ($w * ($this->_base - $t));
 410              }
 411              $bias = $this->_adapt($idx - $old_idx, $deco_len + 1, $is_first);
 412              $is_first = false;
 413              $char += (int) ($idx / ($deco_len + 1));
 414              $idx %= ($deco_len + 1);
 415              if ($deco_len > 0) {
 416                  // Make room for the decoded char

 417                  for ($i = $deco_len; $i > $idx; $i--) {
 418                      $decoded[$i] = $decoded[($i - 1)];
 419                  }
 420              }
 421              $decoded[$idx++] = $char;
 422          }
 423          return $this->_ucs4_to_utf8($decoded);
 424      }
 425  
 426      /**

 427       * The actual encoding algorithm

 428       * @access   private

 429       */
 430      function _encode($decoded)
 431      {
 432          // We cannot encode a domain name containing the Punycode prefix

 433          $extract = strlen($this->_punycode_prefix);
 434          $check_pref = $this->_utf8_to_ucs4($this->_punycode_prefix);
 435          $check_deco = array_slice($decoded, 0, $extract);
 436  
 437          if ($check_pref == $check_deco) {
 438              $this->_error('This is already a punycode string');
 439              return false;
 440          }
 441          // We will not try to encode strings consisting of basic code points only

 442          $encodable = false;
 443          foreach ($decoded as $k => $v) {
 444              if ($v > 0x7a) {
 445                  $encodable = true;
 446                  break;
 447              }
 448          }
 449          if (!$encodable) {
 450              $this->_error('The given string does not contain encodable chars');
 451              return false;
 452          }
 453  
 454          // Do NAMEPREP

 455          $decoded = $this->_nameprep($decoded);
 456          if (!$decoded || !is_array($decoded)) return false; // NAMEPREP failed

 457  
 458          $deco_len  = count($decoded);
 459          if (!$deco_len) return false; // Empty array

 460  
 461          $codecount = 0; // How many chars have been consumed

 462  
 463          $encoded = '';
 464          // Copy all basic code points to output

 465          for ($i = 0; $i < $deco_len; ++$i) {
 466              $test = $decoded[$i];
 467              // Will match [-0-9a-zA-Z]

 468              if ((0x2F < $test && $test < 0x40) || (0x40 < $test && $test < 0x5B)
 469                      || (0x60 < $test && $test <= 0x7B) || (0x2D == $test)) {
 470                  $encoded .= chr($decoded[$i]);
 471                  $codecount++;
 472              }
 473          }
 474          if ($codecount == $deco_len) return $encoded; // All codepoints were basic ones

 475  
 476          // Start with the prefix; copy it to output

 477          $encoded = $this->_punycode_prefix.$encoded;
 478  
 479          // If we have basic code points in output, add an hyphen to the end

 480          if ($codecount) $encoded .= '-';
 481  
 482          // Now find and encode all non-basic code points

 483          $is_first  = true;
 484          $cur_code  = $this->_initial_n;
 485          $bias      = $this->_initial_bias;
 486          $delta     = 0;
 487          while ($codecount < $deco_len) {
 488              // Find the smallest code point >= the current code point and

 489              // remember the last ouccrence of it in the input

 490              for ($i = 0, $next_code = $this->_max_ucs; $i < $deco_len; $i++) {
 491                  if ($decoded[$i] >= $cur_code && $decoded[$i] <= $next_code) {
 492                      $next_code = $decoded[$i];
 493                  }
 494              }
 495  
 496              $delta += ($next_code - $cur_code) * ($codecount + 1);
 497              $cur_code = $next_code;
 498  
 499              // Scan input again and encode all characters whose code point is $cur_code

 500              for ($i = 0; $i < $deco_len; $i++) {
 501                  if ($decoded[$i] < $cur_code) {
 502                      $delta++;
 503                  } elseif ($decoded[$i] == $cur_code) {
 504                      for ($q = $delta, $k = $this->_base; 1; $k += $this->_base) {
 505                          $t = ($k <= $bias) ? $this->_tmin :
 506                                  (($k >= $bias + $this->_tmax) ? $this->_tmax : $k - $bias);
 507                          if ($q < $t) break;
 508                          $encoded .= $this->_encode_digit(intval($t + (($q - $t) % ($this->_base - $t)))); //v0.4.5 Changed from ceil() to intval()

 509                          $q = (int) (($q - $t) / ($this->_base - $t));
 510                      }
 511                      $encoded .= $this->_encode_digit($q);
 512                      $bias = $this->_adapt($delta, $codecount+1, $is_first);
 513                      $codecount++;
 514                      $delta = 0;
 515                      $is_first = false;
 516                  }
 517              }
 518              $delta++;
 519              $cur_code++;
 520          }
 521          return $encoded;
 522      }
 523  
 524      /**

 525       * Adapt the bias according to the current code point and position

 526       * @access   private

 527       */
 528      function _adapt($delta, $npoints, $is_first)
 529      {
 530          $delta = intval($is_first ? ($delta / $this->_damp) : ($delta / 2));
 531          $delta += intval($delta / $npoints);
 532          for ($k = 0; $delta > (($this->_base - $this->_tmin) * $this->_tmax) / 2; $k += $this->_base) {
 533              $delta = intval($delta / ($this->_base - $this->_tmin));
 534          }
 535          return intval($k + ($this->_base - $this->_tmin + 1) * $delta / ($delta + $this->_skew));
 536      }
 537  
 538      /**

 539       * Encoding a certain digit

 540       * @access   private

 541       */
 542      function _encode_digit($d)
 543      {
 544          return chr($d + 22 + 75 * ($d < 26));
 545      }
 546  
 547      /**

 548       * Decode a certain digit

 549       * @access   private

 550       */
 551      function _decode_digit($cp)
 552      {
 553          $cp = ord($cp);
 554          return ($cp - 48 < 10) ? $cp - 22 : (($cp - 65 < 26) ? $cp - 65 : (($cp - 97 < 26) ? $cp - 97 : $this->_base));
 555      }
 556  
 557      /**

 558       * Internal error handling method

 559       * @access   private

 560       */
 561      function _error($error = '')
 562      {
 563          $this->_error = $error;
 564      }
 565  
 566      /**

 567       * Do Nameprep according to RFC3491 and RFC3454

 568       * @param    array    Unicode Characters

 569       * @return   string   Unicode Characters, Nameprep'd

 570       * @access   private

 571       */
 572      function _nameprep($input)
 573      {
 574          $output = array();
 575          $error = false;
 576          //

 577          // Mapping

 578          // Walking through the input array, performing the required steps on each of

 579          // the input chars and putting the result into the output array

 580          // While mapping required chars we apply the cannonical ordering

 581          foreach ($input as $v) {
 582              // Map to nothing == skip that code point

 583              if (in_array($v, $this->NP['map_nothing'])) continue;
 584  
 585              // Try to find prohibited input

 586              if (in_array($v, $this->NP['prohibit']) || in_array($v, $this->NP['general_prohibited'])) {
 587                  $this->_error('NAMEPREP: Prohibited input U+'.sprintf('%08X', $v));
 588                  return false;
 589              }
 590              foreach ($this->NP['prohibit_ranges'] as $range) {
 591                  if ($range[0] <= $v && $v <= $range[1]) {
 592                      $this->_error('NAMEPREP: Prohibited input U+'.sprintf('%08X', $v));
 593                      return false;
 594                  }
 595              }
 596              //

 597              // Hangul syllable decomposition

 598              if (0xAC00 <= $v && $v <= 0xD7AF) {
 599                  foreach ($this->_hangul_decompose($v) as $out) {
 600                      $output[] = (int) $out;
 601                  }
 602              // There's a decomposition mapping for that code point

 603              } elseif (isset($this->NP['replacemaps'][$v])) {
 604                  foreach ($this->_apply_cannonical_ordering($this->NP['replacemaps'][$v]) as $out) {
 605                      $output[] = (int) $out;
 606                  }
 607              } else {
 608                  $output[] = (int) $v;
 609              }
 610          }
 611          // Before applying any Combining, try to rearrange any Hangul syllables

 612          $output = $this->_hangul_compose($output);
 613          //

 614          // Combine code points

 615          //

 616          $last_class   = 0;
 617          $last_starter = 0;
 618          $out_len      = count($output);
 619          for ($i = 0; $i < $out_len; ++$i) {
 620              $class = $this->_get_combining_class($output[$i]);
 621              if ((!$last_class || $last_class > $class) && $class) {
 622                  // Try to match

 623                  $seq_len = $i - $last_starter;
 624                  $out = $this->_combine(array_slice($output, $last_starter, $seq_len));
 625                  // On match: Replace the last starter with the composed character and remove

 626                  // the now redundant non-starter(s)

 627                  if ($out) {
 628                      $output[$last_starter] = $out;
 629                      if (count($out) != $seq_len) {
 630                          for ($j = $i+1; $j < $out_len; ++$j) {
 631                              $output[$j-1] = $output[$j];
 632                          }
 633                          unset($output[$out_len]);
 634                      }
 635                      // Rewind the for loop by one, since there can be more possible compositions

 636                      $i--;
 637                      $out_len--;
 638                      $last_class = ($i == $last_starter) ? 0 : $this->_get_combining_class($output[$i-1]);
 639                      continue;
 640                  }
 641              }
 642              // The current class is 0

 643              if (!$class) $last_starter = $i;
 644              $last_class = $class;
 645          }
 646          return $output;
 647      }
 648  
 649      /**

 650       * Decomposes a Hangul syllable

 651       * (see http://www.unicode.org/unicode/reports/tr15/#Hangul

 652       * @param    integer  32bit UCS4 code point

 653       * @return   array    Either Hangul Syllable decomposed or original 32bit value as one value array

 654       * @access   private

 655       */
 656      function _hangul_decompose($char)
 657      {
 658          $sindex = (int) $char - $this->_sbase;
 659          if ($sindex < 0 || $sindex >= $this->_scount) {
 660              return array($char);
 661          }
 662          $result = array();
 663          $result[] = (int) $this->_lbase + $sindex / $this->_ncount;
 664          $result[] = (int) $this->_vbase + ($sindex % $this->_ncount) / $this->_tcount;
 665          $T = intval($this->_tbase + $sindex % $this->_tcount);
 666          if ($T != $this->_tbase) $result[] = $T;
 667          return $result;
 668      }
 669      /**

 670       * Composes a Hangul syllable

 671       * (see http://www.unicode.org/unicode/reports/tr15/#Hangul

 672       * @param    array    Decomposed UCS4 sequence

 673       * @return   array    UCS4 sequence with syllables composed

 674       * @access   private

 675       */
 676      function _hangul_compose($input)
 677      {
 678          $inp_len = count($input);
 679          if (!$inp_len) return array();
 680          $result = array();
 681          $last = (int) $input[0];
 682          $result[] = $last; // copy first char from input to output

 683  
 684          for ($i = 1; $i < $inp_len; ++$i) {
 685              $char = (int) $input[$i];
 686              $sindex = $last - $this->_sbase;
 687              $lindex = $last - $this->_lbase;
 688              $vindex = $char - $this->_vbase;
 689              $tindex = $char - $this->_tbase;
 690              // Find out, whether two current characters are LV and T

 691              if (0 <= $sindex && $sindex < $this->_scount && ($sindex % $this->_tcount == 0)
 692                      && 0 <= $tindex && $tindex <= $this->_tcount) {
 693                  // create syllable of form LVT

 694                  $last += $tindex;
 695                  $result[(count($result) - 1)] = $last; // reset last

 696                  continue; // discard char

 697              }
 698              // Find out, whether two current characters form L and V

 699              if (0 <= $lindex && $lindex < $this->_lcount && 0 <= $vindex && $vindex < $this->_vcount) {
 700                  // create syllable of form LV

 701                  $last = (int) $this->_sbase + ($lindex * $this->_vcount + $vindex) * $this->_tcount;
 702                  $result[(count($result) - 1)] = $last; // reset last

 703                  continue; // discard char

 704              }
 705              // if neither case was true, just add the character

 706              $last = $char;
 707              $result[] = $char;
 708          }
 709          return $result;
 710      }
 711  
 712      /**

 713       * Returns the combining class of a certain wide char

 714       * @param    integer    Wide char to check (32bit integer)

 715       * @return   integer    Combining class if found, else 0

 716       * @access   private

 717       */
 718      function _get_combining_class($char)
 719      {
 720          return isset($this->NP['norm_combcls'][$char]) ? $this->NP['norm_combcls'][$char] : 0;
 721      }
 722  
 723      /**

 724       * Apllies the cannonical ordering of a decomposed UCS4 sequence

 725       * @param    array      Decomposed UCS4 sequence

 726       * @return   array      Ordered USC4 sequence

 727       * @access   private

 728       */
 729      function _apply_cannonical_ordering($input)
 730      {
 731          $swap = true;
 732          $size = count($input);
 733          while ($swap) {
 734              $swap = false;
 735              $last = $this->_get_combining_class(intval($input[0]));
 736              for ($i = 0; $i < $size-1; ++$i) {
 737                  $next = $this->_get_combining_class(intval($input[$i+1]));
 738                  if ($next != 0 && $last > $next) {
 739                      // Move item leftward until it fits

 740                      for ($j = $i + 1; $j > 0; --$j) {
 741                          if ($this->_get_combining_class(intval($input[$j-1])) <= $next) break;
 742                          $t = intval($input[$j]);
 743                          $input[$j] = intval($input[$j-1]);
 744                          $input[$j-1] = $t;
 745                          $swap = true;
 746                      }
 747                      // Reentering the loop looking at the old character again

 748                      $next = $last;
 749                  }
 750                  $last = $next;
 751              }
 752          }
 753          return $input;
 754      }
 755  
 756      /**

 757       * Do composition of a sequence of starter and non-starter

 758       * @param    array      UCS4 Decomposed sequence

 759       * @return   array      Ordered USC4 sequence

 760       * @access   private

 761       */
 762      function _combine($input)
 763      {
 764          $inp_len = count($input);
 765          foreach ($this->NP['replacemaps'] as $np_src => $np_target) {
 766              if ($np_target[0] != $input[0]) continue;
 767              if (count($np_target) != $inp_len) continue;
 768              $hit = false;
 769              foreach ($input as $k2 => $v2) {
 770                  if ($v2 == $np_target[$k2]) {
 771                      $hit = true;
 772                  } else {
 773                      $hit = false;
 774                      break;
 775                  }
 776              }
 777              if ($hit) return $np_src;
 778          }
 779          return false;
 780      }
 781  
 782      /**

 783       * This converts an UTF-8 encoded string to its UCS-4 representation

 784       * By talking about UCS-4 "strings" we mean arrays of 32bit integers representing

 785       * each of the "chars". This is due to PHP not being able to handle strings with

 786       * bit depth different from 8. This apllies to the reverse method _ucs4_to_utf8(), too.

 787       * The following UTF-8 encodings are supported:

 788       * bytes bits  representation

 789       * 1        7  0xxxxxxx

 790       * 2       11  110xxxxx 10xxxxxx

 791       * 3       16  1110xxxx 10xxxxxx 10xxxxxx

 792       * 4       21  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

 793       * 5       26  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

 794       * 6       31  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

 795       * Each x represents a bit that can be used to store character data.

 796       * The five and six byte sequences are part of Annex D of ISO/IEC 10646-1:2000

 797       * @access   private

 798       */
 799      function _utf8_to_ucs4($input)
 800      {
 801          $output = array();
 802          $out_len = 0;
 803          $inp_len = strlen($input);
 804          $mode = 'next';
 805          $test = 'none';
 806          for ($k = 0; $k < $inp_len; ++$k) {
 807              $v = ord($input{$k}); // Extract byte from input string

 808  
 809              if ($v < 128) { // We found an ASCII char - put into stirng as is
 810                  $output[$out_len] = $v;
 811                  ++$out_len;
 812                  if ('add' == $mode) {
 813                      $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
 814                      return false;
 815                  }
 816                  continue;
 817              }
 818              if ('next' == $mode) { // Try to find the next start byte; determine the width of the Unicode char
 819                  $start_byte = $v;
 820                  $mode = 'add';
 821                  $test = 'range';
 822                  if ($v >> 5 == 6) { // &110xxxxx 10xxxxx
 823                      $next_byte = 0; // Tells, how many times subsequent bitmasks must rotate 6bits to the left

 824                      $v = ($v - 192) << 6;
 825                  } elseif ($v >> 4 == 14) { // &1110xxxx 10xxxxxx 10xxxxxx
 826                      $next_byte = 1;
 827                      $v = ($v - 224) << 12;
 828                  } elseif ($v >> 3 == 30) { // &11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 829                      $next_byte = 2;
 830                      $v = ($v - 240) << 18;
 831                  } elseif ($v >> 2 == 62) { // &111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 832                      $next_byte = 3;
 833                      $v = ($v - 248) << 24;
 834                  } elseif ($v >> 1 == 126) { // &1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 835                      $next_byte = 4;
 836                      $v = ($v - 252) << 30;
 837                  } else {
 838                      $this->_error('This might be UTF-8, but I don\'t understand it at byte '.$k);
 839                      return false;
 840                  }
 841                  if ('add' == $mode) {
 842                      $output[$out_len] = (int) $v;
 843                      ++$out_len;
 844                      continue;
 845                  }
 846              }
 847              if ('add' == $mode) {
 848                  if (!$this->_allow_overlong && $test == 'range') {
 849                      $test = 'none';
 850                      if (($v < 0xA0 && $start_byte == 0xE0) || ($v < 0x90 && $start_byte == 0xF0) || ($v > 0x8F && $start_byte == 0xF4)) {
 851                          $this->_error('Bogus UTF-8 character detected (out of legal range) at byte '.$k);
 852                          return false;
 853                      }
 854                  }
 855                  if ($v >> 6 == 2) { // Bit mask must be 10xxxxxx
 856                      $v = ($v - 128) << ($next_byte * 6);
 857                      $output[($out_len - 1)] += $v;
 858                      --$next_byte;
 859                  } else {
 860                      $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
 861                      return false;
 862                  }
 863                  if ($next_byte < 0) {
 864                      $mode = 'next';
 865                  }
 866              }
 867          } // for

 868          return $output;
 869      }
 870  
 871      /**

 872       * Convert UCS-4 string into UTF-8 string

 873       * See _utf8_to_ucs4() for details

 874       * @access   private

 875       */
 876      function _ucs4_to_utf8($input)
 877      {
 878          $output = '';
 879          $k = 0;
 880          foreach ($input as $v) {
 881              ++$k;
 882              // $v = ord($v);

 883              if ($v < 128) { // 7bit are transferred literally
 884                  $output .= chr($v);
 885              } elseif ($v < (1 << 11)) { // 2 bytes
 886                  $output .= chr(192 + ($v >> 6)) . chr(128 + ($v & 63));
 887              } elseif ($v < (1 << 16)) { // 3 bytes
 888                  $output .= chr(224 + ($v >> 12)) . chr(128 + (($v >> 6) & 63)) . chr(128 + ($v & 63));
 889              } elseif ($v < (1 << 21)) { // 4 bytes
 890                  $output .= chr(240 + ($v >> 18)) . chr(128 + (($v >> 12) & 63))
 891                           . chr(128 + (($v >> 6) & 63)) . chr(128 + ($v & 63));
 892              } elseif ($v < (1 << 26)) { // 5 bytes
 893                  $output .= chr(248 + ($v >> 24)) . chr(128 + (($v >> 18) & 63))
 894                           . chr(128 + (($v >> 12) & 63)) . chr(128 + (($v >> 6) & 63))
 895                           . chr(128 + ($v & 63));
 896              } elseif ($v < (1 << 31)) { // 6 bytes
 897                  $output .= chr(252 + ($v >> 30)) . chr(128 + (($v >> 24) & 63))
 898                           . chr(128 + (($v >> 18) & 63)) . chr(128 + (($v >> 12) & 63))
 899                           . chr(128 + (($v >> 6) & 63)) . chr(128 + ($v & 63));
 900              } else {
 901                  $this->_error('Conversion from UCS-4 to UTF-8 failed: malformed input at byte '.$k);
 902                  return false;
 903              }
 904          }
 905          return $output;
 906      }
 907  
 908      /**

 909        * Convert UCS-4 array into UCS-4 string

 910        *

 911        * @access   private

 912        */
 913      function _ucs4_to_ucs4_string($input)
 914      {
 915          $output = '';
 916          // Take array values and split output to 4 bytes per value

 917          // The bit mask is 255, which reads &11111111

 918          foreach ($input as $v) {
 919              $output .= chr(($v >> 24) & 255).chr(($v >> 16) & 255).chr(($v >> 8) & 255).chr($v & 255);
 920          }
 921          return $output;
 922      }
 923  
 924      /**

 925        * Convert UCS-4 strin into UCS-4 garray

 926        *

 927        * @access   private

 928        */
 929      function _ucs4_string_to_ucs4($input)
 930      {
 931          $output = array();
 932          $inp_len = strlen($input);
 933          // Input length must be dividable by 4

 934          if ($inp_len % 4) {
 935              $this->_error('Input UCS4 string is broken');
 936              return false;
 937          }
 938          // Empty input - return empty output

 939          if (!$inp_len) return $output;
 940          for ($i = 0, $out_len = -1; $i < $inp_len; ++$i) {
 941              // Increment output position every 4 input bytes

 942              if (!($i % 4)) {
 943                  $out_len++;
 944                  $output[$out_len] = 0;
 945              }
 946              $output[$out_len] += ord($input{$i}) << (8 * (3 - ($i % 4) ) );
 947          }
 948          return $output;
 949      }
 950  }
 951  
/**
* Adapter class for aligning the API of idna_convert with that of Net_IDNA
* @author  Matthias Sommerfeld <mso@phlylabs.de>
*/
class Net_IDNA_php4 extends idna_convert
{
    /**
     * Sets a new option value. Available options and values:
     * [encoding - Use either UTF-8, UCS4 as array or UCS4 as string as input ('utf8' for UTF-8,
     *         'ucs4_string' and 'ucs4_array' respectively for UCS4); The output is always UTF-8]
     * [overlong - Unicode does not allow unnecessarily long encodings of chars,
     *             to allow this, set this parameter to true, else to false;
     *             default is false.]
     * [strict - true: strict mode, good for registration purposes - Causes errors
     *           on failures; false: loose mode, ideal for "wildlife" applications
     *           by silently ignoring errors and returning the original input instead]
     *
     * The call is forwarded unchanged to set_parameters() on the object held
     * in $this->IC.
     * NOTE(review): neither an IC property nor a set_parameters() method is
     * declared in this class - confirm that the parent class idna_convert
     * provides both, otherwise this call fails at runtime.
     *
     * @param    mixed     Parameter to set (string: single parameter; array of Parameter => Value pairs)
     * @param    string    Value to use (if parameter 1 is a string)
     * @return   boolean   true on success, false otherwise
     * @access   public
     */
    function setParams($option, $param = false)
    {
        return $this->IC->set_parameters($option, $param);
    }
}
 979  
 980  ?>
Code source de b2evolution 2.1.0-beta

/blogs/inc/_ext/idna/ -> _idna_convert.class.php (source)

Généré le : Thu Nov 29 23:58:50 2007	par Balluche grâce à PHPXref 0.7