eZ Publish 3.9.0 : /lib/ezi18n/classes/ezutf8codec.php source

[Sommaire] [Imprimer]
   1  <?php
   2  //
   3  // Definition of eZUTF8Codec class
   4  //
   5  // SOFTWARE NAME: eZ publish
   6  // SOFTWARE RELEASE: 3.9.0
   7  // BUILD VERSION: 17785
   8  // COPYRIGHT NOTICE: Copyright (C) 1999-2006 eZ systems AS
   9  // SOFTWARE LICENSE: GNU General Public License v2.0
  10  // NOTICE: >
  11  //   This program is free software; you can redistribute it and/or
  12  //   modify it under the terms of version 2.0  of the GNU General
  13  //   Public License as published by the Free Software Foundation.
  14  //
  15  //   This program is distributed in the hope that it will be useful,
  16  //   but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  //   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  //   GNU General Public License for more details.
  19  //
  20  //   You should have received a copy of version 2.0 of the GNU General
  21  //   Public License along with this program; if not, write to the Free
  22  //   Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  23  //   MA 02110-1301, USA.
  24  //
  25  //
  26  
  27  /*!
  28    \class eZUTF8Codec ezutf8codec.php
  29    \ingroup eZI18N
  30    \brief Converter for utf8 and 32bit unicode
  31  
  32    Allows for conversion from utf8 characters to 32bit unicode values,
  33    and vice versa.
  34  
  35  */
  36  
  class eZUTF8Codec
  {
      /*!
       Initializes utf8 codec.
       The codec keeps no per-object state, so there is nothing to set up.
      */
      function eZUTF8Codec()
      {
      }

      /*!
       Converts an UTF8 string into Unicode values and returns an array with the values.
       Byte sequences that cannot be decoded (invalid lead byte, broken or truncated
       continuation bytes, overlong encodings) are skipped and produce no entry.
      */
      function convertStringToUnicode( $str )
      {
          $unicodeValues = array();
          $strLen = strlen( $str );
          for ( $offset = 0; $offset < $strLen; )
          {
              // Default step of one byte; fromUTF8() overwrites it when it
              // recognizes the lead byte of a sequence.
              $charLen = 1;
              $unicodeValue = eZUTF8Codec::fromUTF8( $str, $offset, $charLen );
              if ( $unicodeValue !== false )
                  $unicodeValues[] = $unicodeValue;
              $offset += $charLen;
          }
          return $unicodeValues;
      }

      /*!
       Converts an array with Unicode values into an UTF8 string and returns it.
       \return the UTF8 string, or \c false if \a $unicodeValues is not an array.
      */
      function convertUnicodeToString( $unicodeValues )
      {
          if ( !is_array( $unicodeValues ) )
              return false;
          $text = '';
          foreach ( $unicodeValues as $unicodeValue )
          {
              $text .= eZUTF8Codec::toUTF8( $unicodeValue );
          }
          return $text;
      }

      /*!
       \static
       Converts the 32 bit integer $char_code to a utf8 string representing the Unicode character.

       The shortest encoding that can hold the value is used, from 1 byte (7 bit)
       up to 6 bytes (31 bit). Values that do not fit in 26 bits, including
       negative values, are encoded with the 6 byte form.
      */
      function &toUTF8( $char_code )
      {
          // Each test checks that no bit above the payload size of the given
          // encoding length is set. ~mask works regardless of integer width.
          if ( !( $char_code & ~0x7f ) ) // 7 bit, 1 byte
          {
              $char = chr( $char_code );
          }
          else if ( !( $char_code & ~0x7ff ) ) // 11 bit, 2 bytes
          {
              $char = ( chr( 0xc0 | ( ( $char_code >> 6 ) & 0x1f ) ) .
                        chr( 0x80 | ( $char_code & 0x3f ) ) );
          }
          else if ( !( $char_code & ~0xffff ) ) // 16 bit, 3 bytes
          {
              $char = ( chr( 0xe0 | ( ( $char_code >> 12 ) & 0x0f ) ) .
                        chr( 0x80 | ( ( $char_code >> 6 ) & 0x3f ) ) .
                        chr( 0x80 | ( $char_code & 0x3f ) ) );
          }
          else if ( !( $char_code & ~0x1fffff ) ) // 21 bit, 4 bytes
          {
              $char = ( chr( 0xf0 | ( ( $char_code >> 18 ) & 0x07 ) ) .
                        chr( 0x80 | ( ( $char_code >> 12 ) & 0x3f ) ) .
                        chr( 0x80 | ( ( $char_code >> 6 ) & 0x3f ) ) .
                        chr( 0x80 | ( $char_code & 0x3f ) ) );
          }
          else if ( !( $char_code & ~0x3ffffff ) ) // 26 bit, 5 bytes
          {
              $char = ( chr( 0xf8 | ( ( $char_code >> 24 ) & 0x03 ) ) .
                        chr( 0x80 | ( ( $char_code >> 18 ) & 0x3f ) ) .
                        chr( 0x80 | ( ( $char_code >> 12 ) & 0x3f ) ) .
                        chr( 0x80 | ( ( $char_code >> 6 ) & 0x3f ) ) .
                        chr( 0x80 | ( $char_code & 0x3f ) ) );
          }
          else // 31 bit, 6 bytes
          {
              $char = ( chr( 0xfc | ( ( $char_code >> 30 ) & 0x01 ) ) .
                        chr( 0x80 | ( ( $char_code >> 24 ) & 0x3f ) ) .
                        chr( 0x80 | ( ( $char_code >> 18 ) & 0x3f ) ) .
                        chr( 0x80 | ( ( $char_code >> 12 ) & 0x3f ) ) .
                        chr( 0x80 | ( ( $char_code >> 6 ) & 0x3f ) ) .
                        chr( 0x80 | ( $char_code & 0x3f ) ) );
          }
          return $char;
      }

      /*!
       \static
       Converts the first utf8 char in the string $multi_char to a 32 bit integer.
       $offs is the offset in the string.
       $len will contain the length of utf8 char in the string which can be used to
       find the next char.

       \return the character code, or \c false when the bytes at \a $offs do not
               form a valid sequence (broken or missing continuation bytes, or an
               overlong encoding).

       \note When the lead byte announces a multi byte sequence, \a $len is set to
             the announced length even if decoding fails. When \a $offs is outside
             the string, \a $len is set to 1. When the lead byte itself is invalid
             (a continuation byte, 0xfe or 0xff), \a $len is left untouched.
      */
      function &fromUtf8( $multi_char, $offs, &$len )
      {
          $char_code = false;
          $byteCount = strlen( $multi_char );
          if ( $offs < 0 or $offs >= $byteCount )
          {
              // Nothing to decode; still report a step so callers advance.
              $len = 1;
              return $char_code;
          }

          $lead = ord( $multi_char[$offs] );
          if ( ( $lead & 0x80 ) == 0x00 ) // 7 bit, 1 char
          {
              $len = 1;
              $char_code = $lead;
              return $char_code;
          }

          // Classify the lead byte: sequence length, payload bits carried by
          // the lead byte, and the smallest value that legitimately needs
          // this many bytes (anything lower is an overlong encoding).
          if ( ( $lead & 0xe0 ) == 0xc0 ) // 11 bit, 2 chars
          {
              $sequenceLength = 2;
              $value = $lead & 0x1f;
              $minimum = 128;
          }
          else if ( ( $lead & 0xf0 ) == 0xe0 ) // 16 bit, 3 chars
          {
              $sequenceLength = 3;
              $value = $lead & 0x0f;
              $minimum = 2048;
          }
          else if ( ( $lead & 0xf8 ) == 0xf0 ) // 21 bit, 4 chars
          {
              $sequenceLength = 4;
              $value = $lead & 0x07;
              $minimum = 65536;
          }
          else if ( ( $lead & 0xfc ) == 0xf8 ) // 26 bit, 5 chars
          {
              $sequenceLength = 5;
              $value = $lead & 0x03;
              $minimum = 2097152;
          }
          else if ( ( $lead & 0xfe ) == 0xfc ) // 31 bit, 6 chars
          {
              $sequenceLength = 6;
              $value = $lead & 0x01;
              $minimum = 67108864;
          }
          else
          {
              // Stray continuation byte or 0xfe/0xff: not a valid lead byte.
              return $char_code;
          }

          $len = $sequenceLength;

          // Truncated sequence: do not read past the end of the string.
          if ( $offs + $sequenceLength > $byteCount )
              return $char_code;

          for ( $i = 1; $i < $sequenceLength; ++$i )
          {
              $byte = ord( $multi_char[$offs + $i] );
              if ( ( $byte & 0xc0 ) != 0x80 ) // Not a continuation byte
                  return $char_code;
              $value = ( $value << 6 ) + ( $byte & 0x3f );
          }

          // Illegal multibyte (overlong) sequences are rejected.
          if ( $value >= $minimum )
              $char_code = $value;
          return $char_code;
      }

      /*!
       \return a reference to the shared lookup table, stored in
               $GLOBALS["eZUTF8LengthTable"], that maps the upper six bits of a
               byte to the length of the utf8 sequence that byte starts.
               The entry is 0 for continuation bytes (0x80-0xbf).
      */
      function &utf8LengthTable()
      {
          $table =& $GLOBALS["eZUTF8LengthTable"];
          if ( !is_array( $table ) )
              $table = array( 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00 - 0x3f
                              1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40 - 0x7f
                              0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80 - 0xbf
                              2, 2, 2, 2, 2, 2, 2, 2,                         // 0xc0 - 0xdf
                              3, 3, 3, 3,                                     // 0xe0 - 0xef
                              4, 4,                                           // 0xf0 - 0xf7
                              5,                                              // 0xf8 - 0xfb
                              6 );                                            // 0xfc - 0xff
          return $table;
      }

      /*!
       \return the length in bytes of the utf8 sequence announced by the byte at
               position \a $pos in \a $str, looked up in utf8LengthTable().
               The result is 0 when that byte is a continuation byte.
      */
      function characterByteLength( &$str, $pos )
      {
          $table =& eZUTF8Codec::utf8LengthTable();
          $char = ord( $str[$pos] );
          return $table[($char >> 2) & 0x3f];
      }

      /*!
       \return the number of utf8 characters in \a $str.
       Only lead bytes are inspected; continuation bytes are not validated.
       A stray continuation byte is counted as one character.
      */
      function strlen( &$str )
      {
          $table =& eZUTF8Codec::utf8LengthTable();
          $len = strlen( $str );
          $strlen = 0;
          for ( $i = 0; $i < $len; )
          {
              $char = ord( $str[$i] );
              $char_len = $table[($char >> 2) & 0x3f];
              // The table holds 0 for continuation bytes; always advance by at
              // least one byte so malformed input cannot loop forever.
              if ( $char_len < 1 )
                  $char_len = 1;
              $i += $char_len;
              ++$strlen;
          }
          return $strlen;
      }

      /*!
       \return a unique instance of the UTF8 codec.
       The instance is kept in $GLOBALS["eZUTF8CodecInstance"] and created on
       first use.
      */
      function &instance()
      {
          $instance =& $GLOBALS["eZUTF8CodecInstance"];
          // get_class() returns lowercase or declared case depending on the PHP
          // version, so compare case-insensitively, and only for real objects.
          if ( !is_object( $instance ) or
               strtolower( get_class( $instance ) ) != "ezutf8codec" )
          {
              $instance = new eZUTF8Codec();
          }
          return $instance;
      }
  }
 268  
 269  ?>
Code source de eZ Publish 3.9.0

/lib/ezi18n/classes/ -> ezutf8codec.php (source)