[ Index ] |
|
Code source de LifeType 1.2.4 |
<?php

/**
 * UTF-8 helper functions (byte-oriented, no mbstring dependency).
 *
 * @author CB
 * @email cb.utblog@gmail.com
 * @homepage http://www.utblog.com/plog/CB
 * @date 25 Jul 2005
 */

/**
 * int utf8_isValidChar(string $inputStr, int $start = 0)
 *
 * Checks whether a well-formed UTF-8 sequence begins at byte offset $start.
 *
 * The check is structural only: the lead byte selects an expected sequence
 * length (1 to 6 bytes, i.e. the 5- and 6-byte forms are accepted), and every
 * following byte must be a continuation byte (0x80-0xBF). Overlong encodings
 * and surrogate code points are NOT rejected.
 *
 * @param $inputStr input string (raw bytes)
 * @param $start byte offset at which the character should begin
 * @return the number of bytes occupied by the character if it is well formed,
 *         0 if the string is empty or $start is outside the string,
 *         -1 if the bytes at $start do not form a well-formed sequence
 *         (including a sequence truncated by the end of the string)
 * @note the string is passed by value on purpose; the original author measured
 *       pass-by-reference as significantly slower here
 * @note the input is a string rather than a char array because splitting a big
 *       string with preg_split costs too much memory and cpu
 */
function utf8_isValidChar($inputStr, $start = 0)
{
    $size = strlen($inputStr);
    if ($size <= 0 || $start < 0 || $size <= $start) {
        return 0;
    }

    // Square-bracket offsets: the curly-brace form ($str{$i}) is a parse
    // error since PHP 8.0.
    $lead = ord($inputStr[$start]);

    if ($lead <= 0x7F) {                          // 0xxxxxxx
        return 1;
    } else if ($lead >= 0xC0 && $lead <= 0xDF) {  // 110xxxxx 10xxxxxx
        $bytes = 2;
    } else if ($lead >= 0xE0 && $lead <= 0xEF) {  // 1110xxxx 10xxxxxx 10xxxxxx
        $bytes = 3;
    } else if ($lead >= 0xF0 && $lead <= 0xF7) {  // 11110xxx + 3 continuation bytes
        $bytes = 4;
    } else if ($lead >= 0xF8 && $lead <= 0xFB) {  // 111110xx + 4 continuation bytes
        $bytes = 5;
    } else if ($lead >= 0xFC && $lead <= 0xFD) {  // 1111110x + 5 continuation bytes
        $bytes = 6;
    } else {
        // 0x80-0xBF (stray continuation byte), 0xFE or 0xFF
        return -1;
    }

    // The sequence would run past the end of the string.
    if ($size - $start < $bytes) {
        return -1;
    }

    for ($i = 1; $i < $bytes; $i++) {
        $next = ord($inputStr[$start + $i]);
        if ($next < 0x80 || $next > 0xBF) {
            return -1;
        }
    }

    return $bytes;
}

/**
 * mixed utf8_substr(string $inputStr, int $start, int $length = -1, bool $ignore_error = true)
 *
 * Character-based substring for UTF-8 strings.
 *
 * @param $inputStr input string
 * @param $start index of the first character to return, counted in UTF-8
 *        characters; must not be negative
 * @param $length number of characters to return; if $length < 0, everything
 *        from $start to the end of the string is returned as-is
 * @param $ignore_error whether invalid bytes are tolerated. When true (the
 *        default) every invalid byte counts as one character and, inside the
 *        returned range of a non-negative $length, is replaced with '?'.
 *        When false, the first invalid byte encountered makes the function
 *        return false.
 * @return the substring, or false when $start < 0, $length == 0, $start lies
 *         beyond the end of the string, or an invalid byte is met while
 *         $ignore_error is false
 */
function utf8_substr($inputStr, $start, $length = -1, $ignore_error = true)
{
    if ($start < 0 || $length == 0) {
        return false;
    }

    // Walk forward character by character to translate the character index
    // $start into a byte offset.
    $charIndex = 0;
    $bytePos = 0;
    while ($charIndex < $start) {
        $hm = utf8_isValidChar($inputStr, $bytePos);
        if ($hm === 0) {
            // $start is bigger than the utf8 length of the input string
            return false;
        }
        if ($hm === -1) {
            if (!$ignore_error) {
                return false;
            }
            // an invalid byte is skipped as a single one-byte character
            $hm = 1;
        }
        $charIndex++;
        $bytePos += $hm;
    }

    if ($length < 0) {
        return substr($inputStr, $bytePos);
    }

    // Append whole characters instead of collecting single bytes in an array.
    $result = '';
    $taken = 0;
    while ($taken < $length) {
        $hm = utf8_isValidChar($inputStr, $bytePos);
        if ($hm === 0) {
            // end of string reached before $length characters were collected
            break;
        }
        if ($hm === -1) {
            if (!$ignore_error) {
                return false;
            }
            $result .= '?';
            $bytePos++;
        } else {
            $result .= substr($inputStr, $bytePos, $hm);
            $bytePos += $hm;
        }
        $taken++;
    }

    return $result;
}

/**
 * int utf8_strlen(string $inputStr, bool $ignore_error = true)
 *
 * @param $inputStr input string
 * @param $ignore_error whether invalid bytes are tolerated
 * @return length of the string in UTF-8 characters, or -1 if $ignore_error is
 *         false and the string contains an invalid byte
 * @note if $ignore_error is true (the default value), every invalid byte is
 *       counted as ONE character
 */
function utf8_strlen($inputStr, $ignore_error = true)
{
    $len = 0;
    $bytePos = 0;
    while (($hm = utf8_isValidChar($inputStr, $bytePos)) !== 0) {
        if ($hm === -1) {
            if (!$ignore_error) {
                return -1;
            }
            $bytePos++;
        } else {
            $bytePos += $hm;
        }
        $len++;
    }

    return $len;
}

/**
 * int utf8_proportion(string $inputStr)
 *
 * @param $inputStr input string
 * @return percentage (0-100, truncated to an integer) of the bytes of
 *         $inputStr that belong to well-formed UTF-8 characters; 100 for an
 *         empty string
 * @see http://www.utblog.com/plog/1/article/292
 */
function utf8_proportion($inputStr)
{
    $rawLen = strlen($inputStr);
    if ($rawLen == 0) {
        return 100;
    }

    $validBytes = 0;
    $bytePos = 0;
    while (($hm = utf8_isValidChar($inputStr, $bytePos)) !== 0) {
        if ($hm === -1) {
            // invalid byte: step over it without counting it
            $bytePos++;
            continue;
        }
        $validBytes += $hm;
        $bytePos += $hm;
    }

    if ($validBytes == $rawLen) {
        return 100;
    }

    return (int)($validBytes * 100.0 / $rawLen);
}

?>
titre
Description
Corps
titre
Description
Corps
titre
Description
Corps
titre
Corps
Généré le : Mon Nov 26 21:04:15 2007 | par Balluche grâce à PHPXref 0.7 |
![]() |