[ Index ] |
|
Code source de b2evolution 2.1.0-beta |
<?php
/**
 * @package libs
 */
/* ------------------------------------------------------------------------- */
/* idna_convert.class.php - Encode / Decode Internationalized Domain Names   */
/* (c) 2004-2007 phlyLabs, Berlin (http://phlylabs.de)                       */
/* All rights reserved                                                       */
/* v0.5.0                                                                    */
/* ------------------------------------------------------------------------- */

// {{{ license

/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4 foldmethod=marker: */
//
// +----------------------------------------------------------------------+
// | This library is free software; you can redistribute it and/or modify |
// | it under the terms of the GNU Lesser General Public License as       |
// | published by the Free Software Foundation; either version 2.1 of the |
// | License, or (at your option) any later version.                      |
// |                                                                      |
// | This library is distributed in the hope that it will be useful, but  |
// | WITHOUT ANY WARRANTY; without even the implied warranty of           |
// | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU     |
// | Lesser General Public License for more details.                      |
// |                                                                      |
// | You should have received a copy of the GNU Lesser General Public     |
// | License along with this library; if not, write to the Free Software  |
// | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  |
// | USA.                                                                 |
// +----------------------------------------------------------------------+
//

// }}}

/**
 * Encode/decode Internationalized Domain Names.
 *
 * The class allows to convert internationalized domain names
 * (see RFC 3490 for details) as they can be used with various registries worldwide
 * to be translated between their original (localized) form and their encoded form
 * as it will be used in the DNS (Domain Name System).
 *
 * The class provides two public methods, encode() and decode(), which do exactly
 * what you would expect them to do. You are allowed to use complete domain names,
 * simple strings and complete email addresses as well. That means, that you might
 * use any of the following notations:
 *
 * - www.nörgler.com
 * - xn--nrgler-wxa
 * - xn--brse-5qa.xn--knrz-1ra.info
 *
 * Unicode input might be given as either UTF-8 string, UCS-4 string or UCS-4
 * array. Unicode output is available in the same formats.
 * You can select your preferred format via {@link set_parameter()}.
 *
 * ACE input and output is always expected to be ASCII.
 *
 * @author  Matthias Sommerfeld <mso@phlylabs.de>
 * @version 0.5.0
 *
 */
if( !defined('EVO_MAIN_INIT') ) die( 'Please, do not access this page directly.' );

class idna_convert
{
    /**
     * Holds all relevant mapping tables, loaded from a separate file on construct
     * See RFC3454 for details
     *
     * @var array
     * @access private
     */
    var $NP = array();

    // Internal settings, do not mess with them
    var $_punycode_prefix = 'xn--';
    var $_invalid_ucs =     0x80000000;
    var $_max_ucs =         0x10FFFF;
    var $_base =            36;
    var $_tmin =            1;
    var $_tmax =            26;
    var $_skew =            38;
    var $_damp =            700;
    var $_initial_bias =    72;
    var $_initial_n =       0x80;
    var $_sbase =           0xAC00;
    var $_lbase =           0x1100;
    var $_vbase =           0x1161;
    var $_tbase =           0x11A7;
    var $_lcount =          19;
    var $_vcount =          21;
    var $_tcount =          28;
    var $_ncount =          588;   // _vcount * _tcount
    var $_scount =          11172; // _lcount * _tcount * _vcount
    var $_error =           false;

    // Computed in the constructor as _sbase + _lcount * _vcount * _tcount.
    // Declared here so that assigning it does not create a dynamic property.
    var $slast =            0;

    // See {@link set_parameter()} for details of how to change the following
    // settings from within your script / application
    var $_api_encoding   =  'utf8'; // Default input charset is UTF-8
    var $_allow_overlong =  false;  // Overlong UTF-8 encodings are forbidden
    var $_strict_mode    =  false;  // Behave strict or not

    /**
     * The constructor: loads the NAMEPREP tables and applies optional settings.
     *
     * The tables are read from '_idna_convert_npdata.ser.inc' next to this file.
     * If they cannot be loaded, the failure is recorded (see get_last_error())
     * and NP stays an empty array, which makes _nameprep() fail cleanly.
     *
     * @param    mixed   Array of Parameter => Value pairs for set_parameter(), or false
     * @access   public
     */
    function __construct($options = false)
    {
        $this->slast = $this->_sbase + $this->_lcount * $this->_vcount * $this->_tcount;

        $np_file = dirname(__FILE__).'/_idna_convert_npdata.ser.inc';
        if (function_exists('file_get_contents')) {
            $np_raw = file_get_contents($np_file);
        } else {
            $np_raw = join('', file($np_file));
        }
        $np_data = is_string($np_raw) ? unserialize($np_raw) : false;
        if (is_array($np_data)) {
            $this->NP = $np_data;
        } else {
            $this->_error('Could not load the NAMEPREP mapping tables');
        }

        // If parameters are given, pass these to the respective method
        if (is_array($options)) {
            return $this->set_parameter($options);
        }
        return true;
    }

    /**
     * Old style (PHP 4) constructor, kept so that PHP 4 still initialises the
     * object and so that existing explicit calls keep working. Newer PHP
     * versions use __construct() instead.
     *
     * @param    mixed   Array of Parameter => Value pairs for set_parameter(), or false
     * @access   public
     */
    function idna_convert($options = false)
    {
        return $this->__construct($options);
    }

    /**
     * Sets a new option value. Available options and values:
     * [encoding - Use either UTF-8, UCS4 as array or UCS4 as string as input ('utf8' for UTF-8,
     *         'ucs4_string' and 'ucs4_array' respectively for UCS4); The output is always UTF-8]
     * [overlong - Unicode does not allow unnecessarily long encodings of chars,
     *             to allow this, set this parameter to true, else to false;
     *             default is false.]
     * [strict - true: strict mode, good for registration purposes - Causes errors
     *           on failures; false: loose mode, ideal for "wildlife" applications
     *           by silently ignoring errors and returning the original input instead]
     *
     * @param    mixed     Parameter to set (string: single parameter; array of Parameter => Value pairs)
     * @param    string    Value to use (if parameter 1 is a string)
     * @return   boolean   true on success, false otherwise
     * @access   public
     */
    function set_parameter($option, $value = false)
    {
        if (!is_array($option)) {
            $option = array($option => $value);
        }
        foreach ($option as $k => $v) {
            switch ($k) {
            case 'encoding':
                switch ($v) {
                case 'utf8':
                case 'ucs4_string':
                case 'ucs4_array':
                    $this->_api_encoding = $v;
                    break;
                default:
                    $this->_error('Set Parameter: Unknown parameter '.$v.' for option '.$k);
                    return false;
                }
                break;
            case 'overlong':
                $this->_allow_overlong = ($v) ? true : false;
                break;
            case 'strict':
                $this->_strict_mode = ($v) ? true : false;
                break;
            default:
                $this->_error('Set Parameter: Unknown option '.$k);
                return false;
            }
        }
        return true;
    }

    /**
     * Decode a given ACE domain name
     *
     * Labels that cannot be decoded are left untouched in the result.
     *
     * @param    string   Domain name (ACE string)
     * [@param    string   Desired output encoding, see {@link set_parameter}]
     * @return   mixed    Decoded Domain name (UTF-8 string, UCS-4 string or UCS-4 array);
     *                    false on an unknown encoding or on URL / email input in strict mode
     * @access   public
     */
    function decode($input, $one_time_encoding = false)
    {
        // Optionally set
        if ($one_time_encoding) {
            switch ($one_time_encoding) {
            case 'utf8':
            case 'ucs4_string':
            case 'ucs4_array':
                break;
            default:
                $this->_error('Unknown encoding '.$one_time_encoding);
                return false;
            }
        }
        // Make sure to drop any newline characters around
        $input = trim($input);

        // Negotiate input and try to determine, whether it is a plain string,
        // an email address or something like a complete URL
        if (strpos($input, '@')) { // Maybe it is an email address
            // No no in strict mode
            if ($this->_strict_mode) {
                $this->_error('Only simple domain name parts can be handled in strict mode');
                return false;
            }
            list ($email_pref, $input) = explode('@', $input, 2);
            $arr = explode('.', $input);
            foreach ($arr as $k => $v) {
                if (preg_match('!^'.preg_quote($this->_punycode_prefix, '!').'!', $v)) {
                    $conv = $this->_decode($v);
                    if ($conv) $arr[$k] = $conv;
                }
            }
            $input = join('.', $arr);
            $arr = explode('.', $email_pref);
            foreach ($arr as $k => $v) {
                if (preg_match('!^'.preg_quote($this->_punycode_prefix, '!').'!', $v)) {
                    $conv = $this->_decode($v);
                    if ($conv) $arr[$k] = $conv;
                }
            }
            $email_pref = join('.', $arr);
            $return = $email_pref . '@' . $input;
        } elseif (preg_match('![:\./]!', $input)) { // Or a complete domain name (with or without paths / parameters)
            // No no in strict mode
            if ($this->_strict_mode) {
                $this->_error('Only simple domain name parts can be handled in strict mode');
                return false;
            }
            $parsed = parse_url($input);
            if (isset($parsed['host'])) {
                $arr = explode('.', $parsed['host']);
                foreach ($arr as $k => $v) {
                    $conv = $this->_decode($v);
                    if ($conv) $arr[$k] = $conv;
                }
                $parsed['host'] = join('.', $arr);
                $return =
                        (empty($parsed['scheme']) ? '' : $parsed['scheme'].(strtolower($parsed['scheme']) == 'mailto' ? ':' : '://'))
                        .(empty($parsed['user']) ? '' : $parsed['user'].(empty($parsed['pass']) ? '' : ':'.$parsed['pass']).'@')
                        .$parsed['host']
                        .(empty($parsed['port']) ? '' : ':'.$parsed['port'])
                        .(empty($parsed['path']) ? '' : $parsed['path'])
                        .(empty($parsed['query']) ? '' : '?'.$parsed['query'])
                        .(empty($parsed['fragment']) ? '' : '#'.$parsed['fragment']);
            } else { // parse_url seems to have failed, try without it
                $arr = explode('.', $input);
                foreach ($arr as $k => $v) {
                    $conv = $this->_decode($v);
                    $arr[$k] = ($conv) ? $conv : $v;
                }
                $return = join('.', $arr);
            }
        } else { // Otherwise we consider it being a pure domain name string
            $return = $this->_decode($input);
            if (!$return) $return = $input;
        }
        // The output is UTF-8 by default, other output formats need conversion here
        // If one time encoding is given, use this, else the objects property
        switch (($one_time_encoding) ? $one_time_encoding : $this->_api_encoding) {
        case 'utf8':
            return $return;
        case 'ucs4_string':
            return $this->_ucs4_to_ucs4_string($this->_utf8_to_ucs4($return));
        case 'ucs4_array':
            return $this->_utf8_to_ucs4($return);
        default:
            $this->_error('Unsupported output format');
            return false;
        }
    }

    /**
     * Encode a given UTF-8 domain name
     *
     * The input is split at '.', '/', ':', '?' and '@' (ideographic full stops
     * are turned into plain dots first); each part is encoded on its own and
     * parts that cannot be encoded are copied to the output as UTF-8.
     *
     * @param    string   Domain name (UTF-8 or UCS-4)
     * [@param    string   Desired input encoding, see {@link set_parameter}]
     * @return   mixed    Encoded Domain name (ACE string); false on an unsupported
     *                    input format or on URL / email input in strict mode
     * @access   public
     */
    function encode($decoded, $one_time_encoding = false)
    {
        // Forcing conversion of input to UCS4 array
        // If one time encoding is given, use this, else the objects property
        switch ($one_time_encoding ? $one_time_encoding : $this->_api_encoding) {
        case 'utf8':
            $decoded = $this->_utf8_to_ucs4($decoded);
            break;
        case 'ucs4_string':
            $decoded = $this->_ucs4_string_to_ucs4($decoded);
            break;
        case 'ucs4_array':
            break;
        default:
            $this->_error('Unsupported input format: '.($one_time_encoding ? $one_time_encoding : $this->_api_encoding));
            return false;
        }

        // No input, no output, what else did you expect?
        if (empty($decoded)) return '';

        // Anchors for iteration
        $last_begin = 0;
        // Output string
        $output = '';
        foreach ($decoded as $k => $v) {
            // Make sure to use just the plain dot
            switch($v) {
            case 0x3002:
            case 0xFF0E:
            case 0xFF61:
                $decoded[$k] = 0x2E;
                // Right, no break here, the above are converted to dots anyway
            // Stumbling across an anchoring character
            case 0x2E:
            case 0x2F:
            case 0x3A:
            case 0x3F:
            case 0x40:
                // Neither email addresses nor URLs allowed in strict mode
                if ($this->_strict_mode) {
                    $this->_error('Neither email addresses nor URLs are allowed in strict mode.');
                    return false;
                } else {
                    // Skip first char
                    if ($k) {
                        $encoded = $this->_encode(array_slice($decoded, $last_begin, (($k)-$last_begin)));
                        if ($encoded) {
                            $output .= $encoded;
                        } else {
                            $output .= $this->_ucs4_to_utf8(array_slice($decoded, $last_begin, (($k)-$last_begin)));
                        }
                        $output .= chr($decoded[$k]);
                    }
                    $last_begin = $k + 1;
                }
            }
        }
        // Catch the rest of the string
        if ($last_begin) {
            $inp_len = sizeof($decoded);
            $encoded = $this->_encode(array_slice($decoded, $last_begin, (($inp_len)-$last_begin)));
            if ($encoded) {
                $output .= $encoded;
            } else {
                $output .= $this->_ucs4_to_utf8(array_slice($decoded, $last_begin, (($inp_len)-$last_begin)));
            }
            return $output;
        } else {
            if ($output = $this->_encode($decoded)) {
                return $output;
            } else {
                return $this->_ucs4_to_utf8($decoded);
            }
        }
    }

    /**
     * Use this method to get the last error occurred
     * @param    void
     * @return   string   The last error, that occurred (false if none was recorded)
     * @access   public
     */
    function get_last_error()
    {
        return $this->_error;
    }

    /**
     * The actual decoding algorithm
     *
     * @param    string   A single label carrying the Punycode prefix
     * @return   mixed    UTF-8 string on success; false if the label has no prefix,
     *                    is empty after the prefix, contains a character that is not
     *                    a Punycode digit or ends in the middle of a number
     * @access   private
     */
    function _decode($encoded)
    {
        $prefix_len = strlen($this->_punycode_prefix);
        // We do need to find the Punycode prefix
        if (!preg_match('!^'.preg_quote($this->_punycode_prefix, '!').'!', $encoded)) {
            $this->_error('This is not a punycode string');
            return false;
        }
        $encode_test = preg_replace('!^'.preg_quote($this->_punycode_prefix, '!').'!', '', $encoded);
        // If nothing left after removing the prefix, it is hopeless
        if (!$encode_test) {
            $this->_error('The given encoded string was empty');
            return false;
        }
        // Find last occurrence of the delimiter
        $delim_pos = strrpos($encoded, '-');
        $decoded = array();
        if ($delim_pos > $prefix_len) {
            // Everything between prefix and last delimiter is literal (basic) input
            for ($k = $prefix_len; $k < $delim_pos; ++$k) {
                $decoded[] = ord($encoded[$k]);
            }
        }
        $deco_len = count($decoded);
        $enco_len = strlen($encoded);

        // Wandering through the strings; init
        $is_first = true;
        $bias     = $this->_initial_bias;
        $idx      = 0;
        $char     = $this->_initial_n;

        for ($enco_idx = ($delim_pos) ? ($delim_pos + 1) : 0; $enco_idx < $enco_len; ++$deco_len) {
            for ($old_idx = $idx, $w = 1, $k = $this->_base; 1 ; $k += $this->_base) {
                // A variable length integer must not run past the end of the input
                if ($enco_idx >= $enco_len) {
                    $this->_error('The given encoded string is incomplete');
                    return false;
                }
                $digit = $this->_decode_digit($encoded[$enco_idx++]);
                // _decode_digit() answers with _base for anything that is not a digit
                if ($digit >= $this->_base) {
                    $this->_error('The given encoded string contains an invalid character');
                    return false;
                }
                $idx += $digit * $w;
                $t = ($k <= $bias) ? $this->_tmin :
                        (($k >= $bias + $this->_tmax) ? $this->_tmax : ($k - $bias));
                if ($digit < $t) break;
                $w = (int) ($w * ($this->_base - $t));
            }
            $bias     = $this->_adapt($idx - $old_idx, $deco_len + 1, $is_first);
            $is_first = false;
            $char    += (int) ($idx / ($deco_len + 1));
            $idx     %= ($deco_len + 1);
            if ($deco_len > 0) {
                // Make room for the decoded char
                for ($i = $deco_len; $i > $idx; $i--) {
                    $decoded[$i] = $decoded[($i - 1)];
                }
            }
            $decoded[$idx++] = $char;
        }
        return $this->_ucs4_to_utf8($decoded);
    }

    /**
     * The actual encoding algorithm
     *
     * @param    array    A single label as UCS-4 array
     * @return   mixed    ACE string on success; false if the label already carries
     *                    the Punycode prefix, holds nothing to encode or fails NAMEPREP
     * @access   private
     */
    function _encode($decoded)
    {
        // We cannot encode a domain name containing the Punycode prefix
        $extract = strlen($this->_punycode_prefix);
        $check_pref = $this->_utf8_to_ucs4($this->_punycode_prefix);
        $check_deco = array_slice($decoded, 0, $extract);

        if ($check_pref == $check_deco) {
            $this->_error('This is already a punycode string');
            return false;
        }
        // We will not try to encode strings consisting of basic code points only
        $encodable = false;
        foreach ($decoded as $k => $v) {
            if ($v > 0x7a) {
                $encodable = true;
                break;
            }
        }
        if (!$encodable) {
            $this->_error('The given string does not contain encodable chars');
            return false;
        }

        // Do NAMEPREP
        $decoded = $this->_nameprep($decoded);
        if (!$decoded || !is_array($decoded)) return false; // NAMEPREP failed

        $deco_len  = count($decoded);
        if (!$deco_len) return false; // Empty array

        $codecount = 0; // How many chars have been consumed

        $encoded = '';
        // Copy all basic code points to output.
        // Every code point below the initial n (0x80) has to be consumed here:
        // the loop below only ever handles code points >= 0x80, so a basic code
        // point left behind would keep $codecount below $deco_len forever.
        for ($i = 0; $i < $deco_len; ++$i) {
            if ($decoded[$i] < $this->_initial_n) {
                $encoded .= chr($decoded[$i]);
                $codecount++;
            }
        }
        if ($codecount == $deco_len) return $encoded; // All codepoints were basic ones

        // Start with the prefix; copy it to output
        $encoded = $this->_punycode_prefix.$encoded;

        // If we have basic code points in output, add an hyphen to the end
        if ($codecount) $encoded .= '-';

        // Now find and encode all non-basic code points
        $is_first  = true;
        $cur_code  = $this->_initial_n;
        $bias      = $this->_initial_bias;
        $delta     = 0;
        while ($codecount < $deco_len) {
            // Find the smallest code point >= the current code point and
            // remember the last occurrence of it in the input
            for ($i = 0, $next_code = $this->_max_ucs; $i < $deco_len; $i++) {
                if ($decoded[$i] >= $cur_code && $decoded[$i] <= $next_code) {
                    $next_code = $decoded[$i];
                }
            }

            $delta += ($next_code - $cur_code) * ($codecount + 1);
            $cur_code = $next_code;

            // Scan input again and encode all characters whose code point is $cur_code
            for ($i = 0; $i < $deco_len; $i++) {
                if ($decoded[$i] < $cur_code) {
                    $delta++;
                } elseif ($decoded[$i] == $cur_code) {
                    for ($q = $delta, $k = $this->_base; 1; $k += $this->_base) {
                        $t = ($k <= $bias) ? $this->_tmin :
                                (($k >= $bias + $this->_tmax) ? $this->_tmax : $k - $bias);
                        if ($q < $t) break;
                        $encoded .= $this->_encode_digit(intval($t + (($q - $t) % ($this->_base - $t)))); //v0.4.5 Changed from ceil() to intval()
                        $q = (int) (($q - $t) / ($this->_base - $t));
                    }
                    $encoded .= $this->_encode_digit($q);
                    $bias = $this->_adapt($delta, $codecount+1, $is_first);
                    $codecount++;
                    $delta = 0;
                    $is_first = false;
                }
            }
            $delta++;
            $cur_code++;
        }
        return $encoded;
    }

    /**
     * Adapt the bias according to the current code point and position
     *
     * @param    integer  Delta of the code point just processed
     * @param    integer  Number of code points handled so far, including this one
     * @param    boolean  Whether this is the first adaption for the label
     * @return   integer  The new bias
     * @access   private
     */
    function _adapt($delta, $npoints, $is_first)
    {
        $delta = intval($is_first ? ($delta / $this->_damp) : ($delta / 2));
        $delta += intval($delta / $npoints);
        for ($k = 0; $delta > (($this->_base - $this->_tmin) * $this->_tmax) / 2; $k += $this->_base) {
            $delta = intval($delta / ($this->_base - $this->_tmin));
        }
        return intval($k + ($this->_base - $this->_tmin + 1) * $delta / ($delta + $this->_skew));
    }

    /**
     * Encoding a certain digit
     *
     * @param    integer  Digit value; 0..25 yield 'a'..'z', 26..35 yield '0'..'9'
     * @return   string   The character for that digit
     * @access   private
     */
    function _encode_digit($d)
    {
        return chr($d + 22 + 75 * ($d < 26));
    }

    /**
     * Decode a certain digit
     *
     * @param    string   Single character
     * @return   integer  0..25 for a letter (either case), 26..35 for '0'..'9',
     *                    _base for any other character
     * @access   private
     */
    function _decode_digit($cp)
    {
        $cp = ord($cp);
        // Explicit range checks: PHP integers are signed, so a lone
        // "$cp - 48 < 10" would also be true for every byte below '0'
        if ($cp >= 48 && $cp <= 57)  return $cp - 22; // '0'..'9'
        if ($cp >= 65 && $cp <= 90)  return $cp - 65; // 'A'..'Z'
        if ($cp >= 97 && $cp <= 122) return $cp - 97; // 'a'..'z'
        return $this->_base;
    }

    /**
     * Internal error handling method; remembers the message for get_last_error()
     *
     * @param    string   Error message
     * @access   private
     */
    function _error($error = '')
    {
        $this->_error = $error;
    }

    /**
     * Do Nameprep according to RFC3491 and RFC3454
     * @param    array    Unicode Characters
     * @return   mixed    Unicode Characters, Nameprep'd (array); false on prohibited
     *                    input or if the mapping tables are not available
     * @access   private
     */
    function _nameprep($input)
    {
        // Without the tables loaded by the constructor nothing below can work
        if (!is_array($this->NP) || !isset($this->NP['map_nothing'])) {
            $this->_error('NAMEPREP: Mapping tables are not available');
            return false;
        }
        $output = array();
        //
        // Mapping
        // Walking through the input array, performing the required steps on each of
        // the input chars and putting the result into the output array
        // While mapping required chars we apply the canonical ordering
        foreach ($input as $v) {
            // Map to nothing == skip that code point
            if (in_array($v, $this->NP['map_nothing'])) continue;

            // Try to find prohibited input
            if (in_array($v, $this->NP['prohibit']) || in_array($v, $this->NP['general_prohibited'])) {
                $this->_error('NAMEPREP: Prohibited input U+'.sprintf('%08X', $v));
                return false;
            }
            foreach ($this->NP['prohibit_ranges'] as $range) {
                if ($range[0] <= $v && $v <= $range[1]) {
                    $this->_error('NAMEPREP: Prohibited input U+'.sprintf('%08X', $v));
                    return false;
                }
            }
            //
            // Hangul syllable decomposition
            if (0xAC00 <= $v && $v <= 0xD7AF) {
                foreach ($this->_hangul_decompose($v) as $out) {
                    $output[] = (int) $out;
                }
            // There's a decomposition mapping for that code point
            } elseif (isset($this->NP['replacemaps'][$v])) {
                foreach ($this->_apply_cannonical_ordering($this->NP['replacemaps'][$v]) as $out) {
                    $output[] = (int) $out;
                }
            } else {
                $output[] = (int) $v;
            }
        }
        // Before applying any Combining, try to rearrange any Hangul syllables
        $output = $this->_hangul_compose($output);
        //
        // Combine code points
        //
        $last_class   = 0;
        $last_starter = 0;
        $out_len      = count($output);
        for ($i = 0; $i < $out_len; ++$i) {
            $class = $this->_get_combining_class($output[$i]);
            if ((!$last_class || $last_class > $class) && $class) {
                // Try to match
                // NOTE(review): the slice ends right before position $i, so the
                // current non-starter is not part of the sequence handed to
                // _combine() - confirm that this is intended
                $seq_len = $i - $last_starter;
                $out = $this->_combine(array_slice($output, $last_starter, $seq_len));
                // On match: Replace the last starter with the composed character and remove
                // the now redundant non-starter(s)
                if ($out) {
                    $output[$last_starter] = $out;
                    // _combine() hands back a single code point, so the composed
                    // length is always 1 (this used to be count($out), which is
                    // not allowed on an integer in current PHP versions)
                    if (1 != $seq_len) {
                        for ($j = $i+1; $j < $out_len; ++$j) {
                            $output[$j-1] = $output[$j];
                        }
                        // NOTE(review): the last valid index is $out_len - 1, so this
                        // unset looks like a no-op - confirm before changing it
                        unset($output[$out_len]);
                    }
                    // Rewind the for loop by one, since there can be more possible compositions
                    $i--;
                    $out_len--;
                    $last_class = ($i == $last_starter) ? 0 : $this->_get_combining_class($output[$i-1]);
                    continue;
                }
            }
            // The current class is 0
            if (!$class) $last_starter = $i;
            $last_class = $class;
        }
        return $output;
    }

    /**
     * Decomposes a Hangul syllable
     * (see http://www.unicode.org/unicode/reports/tr15/#Hangul)
     * @param    integer  32bit UCS4 code point
     * @return   array    Either Hangul Syllable decomposed or original 32bit value as one value array
     * @access   private
     */
    function _hangul_decompose($char)
    {
        $sindex = (int) $char - $this->_sbase;
        if ($sindex < 0 || $sindex >= $this->_scount) {
            return array($char);
        }
        $result = array();
        // Cast the whole expression: "(int) $a + $b / $c" would cast $a only
        $result[] = (int) ($this->_lbase + $sindex / $this->_ncount);
        $result[] = (int) ($this->_vbase + ($sindex % $this->_ncount) / $this->_tcount);
        $T = intval($this->_tbase + $sindex % $this->_tcount);
        if ($T != $this->_tbase) $result[] = $T;
        return $result;
    }

    /**
     * Composes a Hangul syllable
     * (see http://www.unicode.org/unicode/reports/tr15/#Hangul)
     * @param    array    Decomposed UCS4 sequence
     * @return   array    UCS4 sequence with syllables composed
     * @access   private
     */
    function _hangul_compose($input)
    {
        $inp_len = count($input);
        if (!$inp_len) return array();
        $result = array();
        $last = (int) $input[0];
        $result[] = $last; // copy first char from input to output

        for ($i = 1; $i < $inp_len; ++$i) {
            $char = (int) $input[$i];
            $sindex = $last - $this->_sbase;
            $lindex = $last - $this->_lbase;
            $vindex = $char - $this->_vbase;
            $tindex = $char - $this->_tbase;
            // Find out, whether two current characters are LV and T
            if (0 <= $sindex && $sindex < $this->_scount && ($sindex % $this->_tcount == 0)
                    && 0 <= $tindex && $tindex <= $this->_tcount) {
                // create syllable of form LVT
                $last += $tindex;
                $result[(count($result) - 1)] = $last; // reset last
                continue; // discard char
            }
            // Find out, whether two current characters form L and V
            if (0 <= $lindex && $lindex < $this->_lcount && 0 <= $vindex && $vindex < $this->_vcount) {
                // create syllable of form LV
                $last = (int) $this->_sbase + ($lindex * $this->_vcount + $vindex) * $this->_tcount;
                $result[(count($result) - 1)] = $last; // reset last
                continue; // discard char
            }
            // if neither case was true, just add the character
            $last = $char;
            $result[] = $char;
        }
        return $result;
    }

    /**
     * Returns the combining class of a certain wide char
     * @param    integer  Wide char to check (32bit integer)
     * @return   integer  Combining class if found, else 0
     * @access   private
     */
    function _get_combining_class($char)
    {
        return isset($this->NP['norm_combcls'][$char]) ? $this->NP['norm_combcls'][$char] : 0;
    }

    /**
     * Applies the canonical ordering of a decomposed UCS4 sequence
     * @param    array    Decomposed UCS4 sequence
     * @return   array    Ordered UCS4 sequence
     * @access   private
     */
    function _apply_cannonical_ordering($input)
    {
        $swap = true;
        $size = count($input);
        while ($swap) {
            $swap = false;
            $last = $this->_get_combining_class(intval($input[0]));
            for ($i = 0; $i < $size-1; ++$i) {
                $next = $this->_get_combining_class(intval($input[$i+1]));
                if ($next != 0 && $last > $next) {
                    // Move item leftward until it fits
                    for ($j = $i + 1; $j > 0; --$j) {
                        if ($this->_get_combining_class(intval($input[$j-1])) <= $next) break;
                        $t = intval($input[$j]);
                        $input[$j] = intval($input[$j-1]);
                        $input[$j-1] = $t;
                        $swap = true;
                    }
                    // Reentering the loop looking at the old character again
                    $next = $last;
                }
                $last = $next;
            }
        }
        return $input;
    }

    /**
     * Do composition of a sequence of starter and non-starter
     *
     * Looks for an entry of the replacement maps whose target equals the
     * given sequence.
     *
     * @param    array    UCS4 Decomposed sequence
     * @return   mixed    Key (code point) of the matching entry, false if there is none
     * @access   private
     */
    function _combine($input)
    {
        $inp_len = count($input);
        // Nothing to compare against; also keeps $input[0] below defined
        if (!$inp_len) return false;
        foreach ($this->NP['replacemaps'] as $np_src => $np_target) {
            if ($np_target[0] != $input[0]) continue;
            if (count($np_target) != $inp_len) continue;
            $hit = false;
            foreach ($input as $k2 => $v2) {
                if ($v2 == $np_target[$k2]) {
                    $hit = true;
                } else {
                    $hit = false;
                    break;
                }
            }
            if ($hit) return $np_src;
        }
        return false;
    }

    /**
     * This converts an UTF-8 encoded string to its UCS-4 representation
     * By talking about UCS-4 "strings" we mean arrays of 32bit integers representing
     * each of the "chars". This is due to PHP not being able to handle strings with
     * bit depth different from 8. This applies to the reverse method _ucs4_to_utf8(), too.
     * The following UTF-8 encodings are supported:
     * bytes bits  representation
     * 1        7  0xxxxxxx
     * 2       11  110xxxxx 10xxxxxx
     * 3       16  1110xxxx 10xxxxxx 10xxxxxx
     * 4       21  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 5       26  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 6       31  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     * Each x represents a bit that can be used to store character data.
     * The five and six byte sequences are part of Annex D of ISO/IEC 10646-1:2000
     *
     * @param    string   UTF-8 encoded string
     * @return   mixed    Array of code points; false on malformed input
     * @access   private
     */
    function _utf8_to_ucs4($input)
    {
        $output = array();
        $out_len = 0;
        $inp_len = strlen($input);
        $mode = 'next';
        $test = 'none';
        for ($k = 0; $k < $inp_len; ++$k) {
            $v = ord($input[$k]); // Extract byte from input string

            if ($v < 128) { // We found an ASCII char - put into string as is
                $output[$out_len] = $v;
                ++$out_len;
                if ('add' == $mode) {
                    $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
                    return false;
                }
                continue;
            }
            if ('next' == $mode) { // Try to find the next start byte; determine the width of the Unicode char
                $start_byte = $v;
                $mode = 'add';
                $test = 'range';
                if ($v >> 5 == 6) { // &110xxxxx 10xxxxx
                    $next_byte = 0; // Tells, how many times subsequent bitmasks must rotate 6bits to the left
                    $v = ($v - 192) << 6;
                } elseif ($v >> 4 == 14) { // &1110xxxx 10xxxxxx 10xxxxxx
                    $next_byte = 1;
                    $v = ($v - 224) << 12;
                } elseif ($v >> 3 == 30) { // &11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                    $next_byte = 2;
                    $v = ($v - 240) << 18;
                } elseif ($v >> 2 == 62) { // &111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                    $next_byte = 3;
                    $v = ($v - 248) << 24;
                } elseif ($v >> 1 == 126) { // &1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                    $next_byte = 4;
                    $v = ($v - 252) << 30;
                } else {
                    $this->_error('This might be UTF-8, but I don\'t understand it at byte '.$k);
                    return false;
                }
                if ('add' == $mode) {
                    $output[$out_len] = (int) $v;
                    ++$out_len;
                    continue;
                }
            }
            if ('add' == $mode) {
                if (!$this->_allow_overlong && $test == 'range') {
                    $test = 'none';
                    if (($v < 0xA0 && $start_byte == 0xE0) || ($v < 0x90 && $start_byte == 0xF0) || ($v > 0x8F && $start_byte == 0xF4)) {
                        $this->_error('Bogus UTF-8 character detected (out of legal range) at byte '.$k);
                        return false;
                    }
                }
                if ($v >> 6 == 2) { // Bit mask must be 10xxxxxx
                    $v = ($v - 128) << ($next_byte * 6);
                    $output[($out_len - 1)] += $v;
                    --$next_byte;
                } else {
                    $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
                    return false;
                }
                if ($next_byte < 0) {
                    $mode = 'next';
                }
            }
        } // for
        return $output;
    }

    /**
     * Convert UCS-4 string into UTF-8 string
     * See _utf8_to_ucs4() for details
     *
     * @param    array    Array of code points
     * @return   mixed    UTF-8 string; false if a value does not fit into six bytes
     * @access   private
     */
    function _ucs4_to_utf8($input)
    {
        $output = '';
        $k = 0;
        foreach ($input as $v) {
            ++$k;
            // $v = ord($v);
            if ($v < 128) { // 7bit are transferred literally
                $output .= chr($v);
            } elseif ($v < (1 << 11)) { // 2 bytes
                $output .= chr(192 + ($v >> 6)) . chr(128 + ($v & 63));
            } elseif ($v < (1 << 16)) { // 3 bytes
                $output .= chr(224 + ($v >> 12)) . chr(128 + (($v >> 6) & 63)) . chr(128 + ($v & 63));
            } elseif ($v < (1 << 21)) { // 4 bytes
                $output .= chr(240 + ($v >> 18)) . chr(128 + (($v >> 12) & 63))
                         . chr(128 + (($v >> 6) & 63)) . chr(128 + ($v & 63));
            } elseif ($v < (1 << 26)) { // 5 bytes
                $output .= chr(248 + ($v >> 24)) . chr(128 + (($v >> 18) & 63))
                         . chr(128 + (($v >> 12) & 63)) . chr(128 + (($v >> 6) & 63))
                         . chr(128 + ($v & 63));
            } elseif ($v < (1 << 31)) { // 6 bytes
                $output .= chr(252 + ($v >> 30)) . chr(128 + (($v >> 24) & 63))
                         . chr(128 + (($v >> 18) & 63)) . chr(128 + (($v >> 12) & 63))
                         . chr(128 + (($v >> 6) & 63)) . chr(128 + ($v & 63));
            } else {
                $this->_error('Conversion from UCS-4 to UTF-8 failed: malformed input at byte '.$k);
                return false;
            }
        }
        return $output;
    }

    /**
     * Convert UCS-4 array into UCS-4 string
     *
     * @param    array    Array of code points
     * @return   string   Four bytes per code point, most significant byte first
     * @access   private
     */
    function _ucs4_to_ucs4_string($input)
    {
        $output = '';
        // Take array values and split output to 4 bytes per value
        // The bit mask is 255, which reads &11111111
        foreach ($input as $v) {
            $output .= chr(($v >> 24) & 255).chr(($v >> 16) & 255).chr(($v >> 8) & 255).chr($v & 255);
        }
        return $output;
    }

    /**
     * Convert UCS-4 string into UCS-4 array
     *
     * @param    string   Four bytes per code point, most significant byte first
     * @return   mixed    Array of code points; false if the length is not a multiple of 4
     * @access   private
     */
    function _ucs4_string_to_ucs4($input)
    {
        $output = array();
        $inp_len = strlen($input);
        // Input length must be dividable by 4
        if ($inp_len % 4) {
            $this->_error('Input UCS4 string is broken');
            return false;
        }
        // Empty input - return empty output
        if (!$inp_len) return $output;
        for ($i = 0, $out_len = -1; $i < $inp_len; ++$i) {
            // Increment output position every 4 input bytes
            if (!($i % 4)) {
                $out_len++;
                $output[$out_len] = 0;
            }
            $output[$out_len] += ord($input[$i]) << (8 * (3 - ($i % 4) ) );
        }
        return $output;
    }
}

/**
 * Adapter class for aligning the API of idna_convert with that of Net_IDNA
 * @author  Matthias Sommerfeld <mso@phlylabs.de>
 */
class Net_IDNA_php4 extends idna_convert
{
    /**
     * Sets a new option value. Available options and values:
     * [encoding - Use either UTF-8, UCS4 as array or UCS4 as string as input ('utf8' for UTF-8,
     *         'ucs4_string' and 'ucs4_array' respectively for UCS4); The output is always UTF-8]
     * [overlong - Unicode does not allow unnecessarily long encodings of chars,
     *             to allow this, set this parameter to true, else to false;
     *             default is false.]
     * [strict - true: strict mode, good for registration purposes - Causes errors
     *           on failures; false: loose mode, ideal for "wildlife" applications
     *           by silently ignoring errors and returning the original input instead]
     *
     * @param    mixed     Parameter to set (string: single parameter; array of Parameter => Value pairs)
     * @param    string    Value to use (if parameter 1 is a string)
     * @return   boolean   true on success, false otherwise
     * @access   public
     */
    function setParams($option, $param = false)
    {
        // Plain delegation to the inherited setter
        return $this->set_parameter($option, $param);
    }
}

?>
titre
Description
Corps
titre
Description
Corps
titre
Description
Corps
titre
Corps
Généré le : Thu Nov 29 23:58:50 2007 | par Balluche grâce à PHPXref 0.7 |
![]() |