LifeType 1.2.4 : /class/data/utf8/utf8

[Sommaire] [Imprimer]
   1  <?php
   2  
   3  /**
   4   * UTF-8 related functions
   5   * @author CB
   6   * @email cb.utblog@gmail.com
   7   * @homepage http://www.utblog.com/plog/CB
   8   * @date 25 Jul 2005
   9   */
  10  
  11  /**
  12   * int utf8_isValidChar(string $inputStr, $start = 0)
  13   * Is it a valid utf8 character
  14   * @param $inputStr input string
  15   * @param $start start index
  16   * @return the ascii bytes of the utf8 char if it is a valid utf8 char. 0 if input array is empty, or -1 if it's invalid 
  17   * @note don't use pass-by-reference for $inArr here, otherwise efficiency will decreased significantly 
  18   * @note change param $inArr from char array to string ($inputStr), for porformance purpose.
  19   * @note preg_split consumes too much memory and cpu when split a big string to char array
  20   */
  21  function utf8_isValidChar($inputStr, $start = 0)
  22  {
  23      $size = strlen($inputStr);
  24      if($size <=0 || $start < 0 || $size <= $start) return 0;
  25  
  26      $inOrd = ord($inputStr{$start});
  27      $us = 0;
  28      if($inOrd <= 0x7F) { //0xxxxxxx
  29          return 1;
  30      } else if($inOrd >= 0xC0 && $inOrd <= 0xDF ) { //110xxxxx 10xxxxxx
  31          $us = 2;
  32      } else if($inOrd >= 0xE0 && $inOrd <= 0xEF ) { //1110xxxx 10xxxxxx 10xxxxxx
  33          $us = 3;
  34      } else if($inOrd >= 0xF0 && $inOrd <= 0xF7 ) { //11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  35          $us = 4;
  36      } else if($inOrd >= 0xF8 && $inOrd <= 0xFB ) { //111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  37          $us = 5;
  38      } else if($inOrd >= 0xFC && $inOrd <= 0xFD ) { //1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  39          $us = 6;
  40      } else
  41          return -1;
  42  
  43      if($size - $start < $us)
  44          return -1;
  45  
  46      for($i=1; $i<$us; $i++)
  47      {
  48          $od = ord($inputStr{$start+$i}); 
  49          if($od <0x80 || $od > 0xBF)
  50              return -1;
  51      }
  52      return $us;
  53  }
  54  
  55  /**
  56   * mix utf8_substr(string $inputString, int $start_index, int $length = -1, bool $ignore_invalid_utf8_char = true)
  57   * @param $inputStr
  58   * @param $start start index, must be large than 0
  59   * @param $length. if $length <0, return all text begin from $start
  60   * @param $ignore_error whether ignore the invalid characters (in return string, these invalid chars will be replaced with '?') or not. default is true (ignore)
  61   * @return the substring, or false (empty string '')
  62   */
  63  function utf8_substr($inputStr, $start, $length = -1, $ignore_error = true)
  64  {
  65      if($start<0 || $length == 0)
  66          return false;
  67      //discard preg_split function. it consumes too much system resource when it tries to split a big string to pieces
  68      //$rawArr = preg_split('//',$inputStr,-1, PREG_SPLIT_NO_EMPTY); 
  69      //find start
  70      $si = 0;
  71      $si_single = 0;
  72      while($si < $start)
  73      {
  74          $hm = utf8_isValidChar($inputStr, $si_single);
  75          if($hm == -1)
  76          {
  77              //ignore invalid character?
  78              if(!$ignore_error)
  79                  return false;
  80              //array_shift is very slow
  81              //array_shift($rawArr); 
  82              $si++;
  83              $si_single++;
  84          }
  85          else if($hm == 0)
  86          {
  87              //$start is bigger than the utf8_length of inputString
  88              return false;
  89          }
  90          else
  91          {
  92              //for($i=0; $i<$hm; $i++) array_shift($rawArr);
  93              $si++;
  94              $si_single += $hm;
  95          }
  96      }
  97      if($length < 0)
  98          //return implode('', $rawArr);
  99          return substr($inputStr, $si_single);
 100      $retArr = array();
 101      $li = 0;
 102      while($li < $length)
 103      {
 104          $hm = utf8_isValidChar($inputStr, $si_single);
 105          if($hm == -1)
 106          {
 107              if(!$ignore_error)
 108                  return false;
 109              $retArr[] = '?'; 
 110              //array_shift($rawArr);
 111              $li++;
 112              $si_single++;
 113          }
 114          else if($hm == 0)
 115          {
 116              //end of string
 117              return implode('', $retArr);
 118          }
 119          else
 120          {
 121              //for($i=0; $i<$hm; $i++) $retArr[] = array_shift($rawArr);
 122              for($i=0; $i<$hm; $i++) $retArr[] = $inputStr{$si_single++};
 123              $li++;
 124          }
 125      }
 126      return implode('', $retArr);
 127  }
 128  
 129  /**
 130   * int utf8_strlen(string $inputString, bool $ignore_invalid_utf8_char = true)
 131   * @return length of string encoded as utf8 ( how many utf8 characters )
 132   * -1 if given $ignore_error is false and there's invalid utf8 char in the inputString
 133   * @note if $ignore_error is true (the default value), every invalid utf8 character will be count as ONE utf8 char
 134   */
 135  function utf8_strlen($inputStr, $ignore_error = true)
 136  {
 137      //$rawArr = preg_split('//',$inputStr,-1, PREG_SPLIT_NO_EMPTY); 
 138      $len = 0;
 139      $si_single = 0;
 140      while(($hm = utf8_isValidChar($inputStr, $si_single)) != 0)
 141      {
 142          if($hm == -1)
 143          {
 144              if(!$ignore_error)
 145                  return -1;
 146              //array_shift($rawArr);
 147              $si_single++;
 148          }
 149          else
 150              //for($i=0; $i<$hm; $i++) array_shift($rawArr);
 151              $si_single += $hm;
 152          $len++;
 153      }
 154      return $len;
 155  }
 156  
 157  /**
 158   * int utf8_proportion(string $inputString)
 159   * @param $inputString
 160   * @return percentage of valid utf8 chars of $inputString
 161   * @see http://www.utblog.com/plog/1/article/292
 162   */ 
 163  function utf8_proportion($inputStr)
 164  {
 165      //$rawArr = preg_split('//',$inputStr,-1, PREG_SPLIT_NO_EMPTY); 
 166      //$rawLen = count($rawArr);
 167      $rawLen = strlen($inputStr);
 168      if($rawLen == 0)
 169          return 100;
 170      $validChars = 0;
 171      $si_single = 0;
 172      while(($hm = utf8_isValidChar($inputStr, $si_single)) != 0)
 173      {
 174          if($hm == -1)
 175          {
 176              //array_shift($rawArr);
 177              $si_single++;
 178              continue;
 179          }
 180          //for($i=0; $i<$hm; $i++) array_shift($rawArr);
 181          $validChars += $hm;
 182          $si_single += $hm;
 183      }
 184      if($validChars == $rawLen)
 185          return 100;
 186      else
 187          return (int)($validChars*100.0/$rawLen);
 188  }
 189  
 190  ?>
Code source de LifeType 1.2.4

/class/data/utf8/ -> utf8_funcs.php (source)

Généré le : Mon Nov 26 21:04:15 2007	par Balluche grâce à PHPXref 0.7