Drupal 5.3 : /includes/unicode.inc source

[Sommaire] [Imprimer]
   1  <?php
   2  // $Id: unicode.inc,v 1.23.2.2 2007/05/21 01:09:21 drumm Exp $
   3  
/**
 * Status code: the check for PHP Unicode support failed.
 *
 * Returned by _unicode_check() as the first array element when the PCRE
 * library or the mbstring configuration is unusable.
 */
define('UNICODE_ERROR', -1);

/**
 * Status code: standard PHP (emulated) Unicode support is being used.
 *
 * Returned by _unicode_check() when the mbstring extension is not available.
 */
define('UNICODE_SINGLEBYTE', 0);

/**
 * Status code: full Unicode support with the PHP mbstring extension is being
 * used.
 */
define('UNICODE_MULTIBYTE', 1);
  19  
  20  /**
  21   * Wrapper around _unicode_check().
  22   */
  23  function unicode_check() {
  24    list($GLOBALS['multibyte']) = _unicode_check();
  25  }
  26  
  27  /**
  28   * Perform checks about Unicode support in PHP, and set the right settings if
  29   * needed.
  30   *
  31   * Because Drupal needs to be able to handle text in various encodings, we do
  32   * not support mbstring function overloading. HTTP input/output conversion must
  33   * be disabled for similar reasons.
  34   *
  35   * @param $errors
  36   *   Whether to report any fatal errors with form_set_error().
  37   */
  38  function _unicode_check() {
  39    // Ensure translations don't break at install time
  40    $t = get_t();
  41  
  42    // Set the standard C locale to ensure consistent, ASCII-only string handling.
  43    setlocale(LC_CTYPE, 'C');
  44  
  45    // Check for outdated PCRE library
  46    // Note: we check if U+E2 is in the range U+E0 - U+E1. This test returns TRUE on old PCRE versions.
  47    if (preg_match('/[Ã -Ã¡]/u', 'Ã¢')) {
  48      return array(UNICODE_ERROR, $t('The PCRE library in your PHP installation is outdated. This will cause problems when handling Unicode text. If you are running PHP 4.3.3 or higher, make sure you are using the PCRE library supplied by PHP. Please refer to the <a href="@url">PHP PCRE documentation</a> for more information.', array('@url' => 'http://www.php.net/pcre')));
  49    }
  50  
  51    // Check for mbstring extension
  52    if (!function_exists('mb_strlen')) {
  53      return array(UNICODE_SINGLEBYTE, $t('Operations on Unicode strings are emulated on a best-effort basis. Install the <a href="@url">PHP mbstring extension</a> for improved Unicode support.', array('@url' => 'http://www.php.net/mbstring')));
  54    }
  55  
  56    // Check mbstring configuration
  57    if (ini_get('mbstring.func_overload') != 0) {
  58      return array(UNICODE_ERROR, $t('Multibyte string function overloading in PHP is active and must be disabled. Check the php.ini <em>mbstring.func_overload</em> setting. Please refer to the <a href="@url">PHP mbstring documentation</a> for more information.', array('@url' => 'http://www.php.net/mbstring')));
  59    }
  60    if (ini_get('mbstring.encoding_translation') != 0) {
  61      return array(UNICODE_ERROR, $t('Multibyte string input conversion in PHP is active and must be disabled. Check the php.ini <em>mbstring.encoding_translation</em> setting. Please refer to the <a href="@url">PHP mbstring documentation</a> for more information.', array('@url' => 'http://www.php.net/mbstring')));
  62    }
  63    if (ini_get('mbstring.http_input') != 'pass') {
  64      return array(UNICODE_ERROR, $t('Multibyte string input conversion in PHP is active and must be disabled. Check the php.ini <em>mbstring.http_input</em> setting. Please refer to the <a href="@url">PHP mbstring documentation</a> for more information.', array('@url' => 'http://www.php.net/mbstring')));
  65    }
  66    if (ini_get('mbstring.http_output') != 'pass') {
  67      return array(UNICODE_ERROR, $t('Multibyte string output conversion in PHP is active and must be disabled. Check the php.ini <em>mbstring.http_output</em> setting. Please refer to the <a href="@url">PHP mbstring documentation</a> for more information.', array('@url' => 'http://www.php.net/mbstring')));
  68    }
  69  
  70    // Set appropriate configuration
  71    mb_internal_encoding('utf-8');
  72    mb_language('uni');
  73    return array(UNICODE_MULTIBYTE, '');
  74  }
  75  
  76  /**
  77   * Return Unicode library status and errors.
  78   */
  79  function unicode_requirements() {
  80    // Ensure translations don't break at install time
  81    $t = get_t();
  82  
  83    $libraries = array(
  84      UNICODE_SINGLEBYTE => $t('Standard PHP'),
  85      UNICODE_MULTIBYTE => $t('PHP Mbstring Extension'),
  86      UNICODE_ERROR => $t('Error'),
  87    );
  88    $severities = array(
  89      UNICODE_SINGLEBYTE => REQUIREMENT_WARNING,
  90      UNICODE_MULTIBYTE => REQUIREMENT_OK,
  91      UNICODE_ERROR => REQUIREMENT_ERROR,
  92    );
  93    list($library, $description) = _unicode_check();
  94  
  95    $requirements['unicode'] = array(
  96      'title' => $t('Unicode library'),
  97      'value' => $libraries[$library],
  98    );
  99    if ($description) {
 100      $requirements['unicode']['description'] = $description;
 101    }
 102  
 103    $requirements['unicode']['severity'] = $severities[$library];
 104  
 105    return $requirements;
 106  }
 107   
 108  /**
 109   * Prepare a new XML parser.
 110   *
 111   * This is a wrapper around xml_parser_create() which extracts the encoding from
 112   * the XML data first and sets the output encoding to UTF-8. This function should
 113   * be used instead of xml_parser_create(), because PHP 4's XML parser doesn't
 114   * check the input encoding itself. "Starting from PHP 5, the input encoding is
 115   * automatically detected, so that the encoding parameter specifies only the
 116   * output encoding."
 117   *
 118   * This is also where unsupported encodings will be converted. Callers should
 119   * take this into account: $data might have been changed after the call.
 120   *
 121   * @param &$data
 122   *   The XML data which will be parsed later.
 123   * @return
 124   *   An XML parser object.
 125   */
 126  function drupal_xml_parser_create(&$data) {
 127    // Default XML encoding is UTF-8
 128    $encoding = 'utf-8';
 129    $bom = FALSE;
 130  
 131    // Check for UTF-8 byte order mark (PHP5's XML parser doesn't handle it).
 132    if (!strncmp($data, "\xEF\xBB\xBF", 3)) {
 133      $bom = TRUE;
 134      $data = substr($data, 3);
 135    }
 136  
 137    // Check for an encoding declaration in the XML prolog if no BOM was found.
 138    if (!$bom && ereg('^<\?xml[^>]+encoding="([^"]+)"', $data, $match)) {
 139      $encoding = $match[1];
 140    }
 141  
 142    // Unsupported encodings are converted here into UTF-8.
 143    $php_supported = array('utf-8', 'iso-8859-1', 'us-ascii');
 144    if (!in_array(strtolower($encoding), $php_supported)) {
 145      $out = drupal_convert_to_utf8($data, $encoding);
 146      if ($out !== FALSE) {
 147        $encoding = 'utf-8';
 148        $data = ereg_replace('^(<\?xml[^>]+encoding)="([^"]+)"', '\\1="utf-8"', $out);
 149      }
 150      else {
 151        watchdog('php', t("Could not convert XML encoding %s to UTF-8.", array('%s' => $encoding)), WATCHDOG_WARNING);
 152        return 0;
 153      }
 154    }
 155  
 156    $xml_parser = xml_parser_create($encoding);
 157    xml_parser_set_option($xml_parser, XML_OPTION_TARGET_ENCODING, 'utf-8');
 158    return $xml_parser;
 159  }
 160  
 161  /**
 162   * Convert data to UTF-8
 163   *
 164   * Requires the iconv, GNU recode or mbstring PHP extension.
 165   *
 166   * @param $data
 167   *   The data to be converted.
 168   * @param $encoding
 169   *   The encoding that the data is in
 170   * @return
 171   *   Converted data or FALSE.
 172   */
 173  function drupal_convert_to_utf8($data, $encoding) {
 174    if (function_exists('iconv')) {
 175      $out = @iconv($encoding, 'utf-8', $data);
 176    }
 177    else if (function_exists('mb_convert_encoding')) {
 178      $out = @mb_convert_encoding($data, 'utf-8', $encoding);
 179    }
 180    else if (function_exists('recode_string')) {
 181      $out = @recode_string($encoding .'..utf-8', $data);
 182    }
 183    else {
 184      watchdog('php', t("Unsupported encoding %s. Please install iconv, GNU recode or mbstring for PHP.", array('%s' => $encoding)), WATCHDOG_ERROR);
 185      return FALSE;
 186    }
 187  
 188    return $out;
 189  }
 190  
 191  /**
 192   * Truncate a UTF-8-encoded string safely to a number of bytes.
 193   *
 194   * If the end position is in the middle of a UTF-8 sequence, it scans backwards
 195   * until the beginning of the byte sequence.
 196   *
 197   * Use this function whenever you want to chop off a string at an unsure
 198   * location. On the other hand, if you're sure that you're splitting on a
 199   * character boundary (e.g. after using strpos() or similar), you can safely use
 200   * substr() instead.
 201   *
 202   * @param $string
 203   *   The string to truncate.
 204   * @param $len
 205   *   An upper limit on the returned string length.
 206   * @param $wordsafe
 207   *   Flag to truncate at nearest space. Defaults to FALSE.
 208   * @return
 209   *   The truncated string.
 210   */
 211  function truncate_utf8($string, $len, $wordsafe = FALSE, $dots = FALSE) {
 212    $slen = strlen($string);
 213    if ($slen <= $len) {
 214      return $string;
 215    }
 216    if ($wordsafe) {
 217      $end = $len;
 218      while (($string[--$len] != ' ') && ($len > 0)) {};
 219      if ($len == 0) {
 220        $len = $end;
 221      }
 222    }
 223    if ((ord($string[$len]) < 0x80) || (ord($string[$len]) >= 0xC0)) {
 224      return substr($string, 0, $len) . ($dots ? ' ...' : '');
 225    }
 226    while (--$len >= 0 && ord($string[$len]) >= 0x80 && ord($string[$len]) < 0xC0) {};
 227    return substr($string, 0, $len) . ($dots ? ' ...' : '');
 228  }
 229  
 230  /**
 231   * Encodes MIME/HTTP header values that contain non-ASCII, UTF-8 encoded
 232   * characters.
 233   *
 234   * For example, mime_header_encode('tÃ©st.txt') returns "=?UTF-8?B?dMOpc3QudHh0?=".
 235   *
 236   * See http://www.rfc-editor.org/rfc/rfc2047.txt for more information.
 237   *
 238   * Notes:
 239   * - Only encode strings that contain non-ASCII characters.
 240   * - We progressively cut-off a chunk with truncate_utf8(). This is to ensure
 241   *   each chunk starts and ends on a character boundary.
 242   * - Using \n as the chunk separator may cause problems on some systems and may
 243   *   have to be changed to \r\n or \r.
 244   */
 245  function mime_header_encode($string) {
 246    if (preg_match('/[^\x20-\x7E]/', $string)) {
 247      $chunk_size = 47; // floor((75 - strlen("=?UTF-8?B??=")) * 0.75);
 248      $len = strlen($string);
 249      $output = '';
 250      while ($len > 0) {
 251        $chunk = truncate_utf8($string, $chunk_size);
 252        $output .= ' =?UTF-8?B?'. base64_encode($chunk) ."?=\n";
 253        $c = strlen($chunk);
 254        $string = substr($string, $c);
 255        $len -= $c;
 256      }
 257      return trim($output);
 258    }
 259    return $string;
 260  }
 261  
 262  /**
 263   * Complement to mime_header_encode
 264   */
 265  function mime_header_decode($header) {
 266    // First step: encoded chunks followed by other encoded chunks (need to collapse whitespace)
 267    $header = preg_replace_callback('/=\?([^?]+)\?(Q|B)\?([^?]+|\?(?!=))\?=\s+(?==\?)/', '_mime_header_decode', $header);
 268    // Second step: remaining chunks (do not collapse whitespace)
 269    return preg_replace_callback('/=\?([^?]+)\?(Q|B)\?([^?]+|\?(?!=))\?=/', '_mime_header_decode', $header);
 270  }
 271  
 272  /**
 273   * Helper function to mime_header_decode
 274   */
 275  function _mime_header_decode($matches) {
 276    // Regexp groups:
 277    // 1: Character set name
 278    // 2: Escaping method (Q or B)
 279    // 3: Encoded data
 280    $data = ($matches[2] == 'B') ? base64_decode($matches[3]) : str_replace('_', ' ', quoted_printable_decode($matches[3]));
 281    if (strtolower($matches[1]) != 'utf-8') {
 282      $data = drupal_convert_to_utf8($data, $matches[1]);
 283    }
 284    return $data;
 285  }
 286  
 287  /**
 288   * Decode all HTML entities (including numerical ones) to regular UTF-8 bytes.
 289   * Double-escaped entities will only be decoded once ("&amp;lt;" becomes "&lt;", not "<").
 290   *
 291   * @param $text
 292   *   The text to decode entities in.
 293   * @param $exclude
 294   *   An array of characters which should not be decoded. For example,
 295   *   array('<', '&', '"'). This affects both named and numerical entities.
 296   */
 297  function decode_entities($text, $exclude = array()) {
 298    static $table;
 299    // We store named entities in a table for quick processing.
 300    if (!isset($table)) {
 301      // Get all named HTML entities.
 302      $table = array_flip(get_html_translation_table(HTML_ENTITIES));
 303      // PHP gives us ISO-8859-1 data, we need UTF-8.
 304      $table = array_map('utf8_encode', $table);
 305      // Add apostrophe (XML)
 306      $table['&apos;'] = "'";
 307    }
 308    $newtable = array_diff($table, $exclude);
 309  
 310    // Use a regexp to select all entities in one pass, to avoid decoding double-escaped entities twice.
 311    return preg_replace('/&(#x?)?([A-Za-z0-9]+);/e', '_decode_entities("$1", "$2", "$0", $newtable, $exclude)', $text);
 312  }
 313  
 314  /**
 315   * Helper function for decode_entities
 316   */
 317  function _decode_entities($prefix, $codepoint, $original, &$table, &$exclude) {
 318    // Named entity
 319    if (!$prefix) {
 320      if (isset($table[$original])) {
 321        return $table[$original];
 322      }
 323      else {
 324        return $original;
 325      }
 326    }
 327    // Hexadecimal numerical entity
 328    if ($prefix == '#x') {
 329      $codepoint = base_convert($codepoint, 16, 10);
 330    }
 331    // Decimal numerical entity (strip leading zeros to avoid PHP octal notation)
 332    else {
 333      $codepoint = preg_replace('/^0+/', '', $codepoint);
 334    }
 335    // Encode codepoint as UTF-8 bytes
 336    if ($codepoint < 0x80) {
 337      $str = chr($codepoint);
 338    }
 339    else if ($codepoint < 0x800) {
 340      $str = chr(0xC0 | ($codepoint >> 6))
 341           . chr(0x80 | ($codepoint & 0x3F));
 342    }
 343    else if ($codepoint < 0x10000) {
 344      $str = chr(0xE0 | ( $codepoint >> 12))
 345           . chr(0x80 | (($codepoint >> 6) & 0x3F))
 346           . chr(0x80 | ( $codepoint       & 0x3F));
 347    }
 348    else if ($codepoint < 0x200000) {
 349      $str = chr(0xF0 | ( $codepoint >> 18))
 350           . chr(0x80 | (($codepoint >> 12) & 0x3F))
 351           . chr(0x80 | (($codepoint >> 6)  & 0x3F))
 352           . chr(0x80 | ( $codepoint        & 0x3F));
 353    }
 354    // Check for excluded characters
 355    if (in_array($str, $exclude)) {
 356      return $original;
 357    }
 358    else {
 359      return $str;
 360    }
 361  }
 362  
 363  /**
 364   * Count the amount of characters in a UTF-8 string. This is less than or
 365   * equal to the byte count.
 366   */
 367  function drupal_strlen($text) {
 368    global $multibyte;
 369    if ($multibyte == UNICODE_MULTIBYTE) {
 370      return mb_strlen($text);
 371    }
 372    else {
 373      // Do not count UTF-8 continuation bytes.
 374      return strlen(preg_replace("/[\x80-\xBF]/", '', $text));
 375    }
 376  }
 377  
 378  /**
 379   * Uppercase a UTF-8 string.
 380   */
 381  function drupal_strtoupper($text) {
 382    global $multibyte;
 383    if ($multibyte == UNICODE_MULTIBYTE) {
 384      return mb_strtoupper($text);
 385    }
 386    else {
 387      // Use C-locale for ASCII-only uppercase
 388      $text = strtoupper($text);
 389      // Case flip Latin-1 accented letters
 390      $text = preg_replace_callback('/\xC3[\xA0-\xB6\xB8-\xBE]/', '_unicode_caseflip', $text);
 391      return $text;
 392    }
 393  }
 394  
 395  /**
 396   * Lowercase a UTF-8 string.
 397   */
 398  function drupal_strtolower($text) {
 399    global $multibyte;
 400    if ($multibyte == UNICODE_MULTIBYTE) {
 401      return mb_strtolower($text);
 402    }
 403    else {
 404      // Use C-locale for ASCII-only lowercase
 405      $text = strtolower($text);
 406      // Case flip Latin-1 accented letters
 407      $text = preg_replace_callback('/\xC3[\x80-\x96\x98-\x9E]/', '_unicode_caseflip', $text);
 408      return $text;
 409    }
 410  }
 411  
 412  /**
 413   * Helper function for case conversion of Latin-1.
 414   * Used for flipping U+C0-U+DE to U+E0-U+FD and back.
 415   */
 416  function _unicode_caseflip($matches) {
 417    return $matches[0][0] . chr(ord($matches[0][1]) ^ 32);
 418  }
 419  
 420  /**
 421   * Capitalize the first letter of a UTF-8 string.
 422   */
 423  function drupal_ucfirst($text) {
 424    // Note: no mbstring equivalent!
 425    return drupal_strtoupper(drupal_substr($text, 0, 1)) . drupal_substr($text, 1);
 426  }
 427  
 428  /**
 429   * Cut off a piece of a string based on character indices and counts. Follows
 430   * the same behaviour as PHP's own substr() function.
 431   *
 432   * Note that for cutting off a string at a known character/substring
 433   * location, the usage of PHP's normal strpos/substr is safe and
 434   * much faster.
 435   */
 436  function drupal_substr($text, $start, $length = NULL) {
 437    global $multibyte;
 438    if ($multibyte == UNICODE_MULTIBYTE) {
 439      return $length === NULL ? mb_substr($text, $start) : mb_substr($text, $start, $length);
 440    }
 441    else {
 442      $strlen = strlen($text);
 443      // Find the starting byte offset
 444      $bytes = 0;
 445      if ($start > 0) {
 446        // Count all the continuation bytes from the start until we have found
 447        // $start characters
 448        $bytes = -1; $chars = -1;
 449        while ($bytes < $strlen && $chars < $start) {
 450          $bytes++;
 451          $c = ord($text[$bytes]);
 452          if ($c < 0x80 || $c >= 0xC0) {
 453            $chars++;
 454          }
 455        }
 456      }
 457      else if ($start < 0) {
 458        // Count all the continuation bytes from the end until we have found
 459        // abs($start) characters
 460        $start = abs($start);
 461        $bytes = $strlen; $chars = 0;
 462        while ($bytes > 0 && $chars < $start) {
 463          $bytes--;
 464          $c = ord($text[$bytes]);
 465          if ($c < 0x80 || $c >= 0xC0) {
 466            $chars++;
 467          }
 468        }
 469      }
 470      $istart = $bytes;
 471  
 472      // Find the ending byte offset
 473      if ($length === NULL) {
 474        $bytes = $strlen - 1;
 475      }
 476      else if ($length > 0) {
 477        // Count all the continuation bytes from the starting index until we have
 478        // found $length + 1 characters. Then backtrack one byte.
 479        $bytes = $istart; $chars = 0;
 480        while ($bytes < $strlen && $chars < $length) {
 481          $bytes++;
 482          $c = ord($text[$bytes]);
 483          if ($c < 0x80 || $c >= 0xC0) {
 484            $chars++;
 485          }
 486        }
 487        $bytes--;
 488      }
 489      else if ($length < 0) {
 490        // Count all the continuation bytes from the end until we have found
 491        // abs($length) characters
 492        $length = abs($length);
 493        $bytes = $strlen - 1; $chars = 0;
 494        while ($bytes >= 0 && $chars < $length) {
 495          $c = ord($text[$bytes]);
 496          if ($c < 0x80 || $c >= 0xC0) {
 497            $chars++;
 498          }
 499          $bytes--;
 500        }
 501      }
 502      $iend = $bytes;
 503  
 504      return substr($text, $istart, max(0, $iend - $istart + 1));
 505    }
 506  }
 507  
 508
Code source de Drupal 5.3

/includes/ -> unicode.inc (source)

Généré le : Fri Nov 30 16:20:15 2007	par Balluche grâce à PHPXref 0.7