[ Index ] |
|
Code source de LifeType 1.2.4 |
<?php

    lt_include( PLOG_CLASS_PATH."class/dao/bayesiantoken.class.php" );
    lt_include( PLOG_CLASS_PATH."class/bayesian/tokenizer.class.php" );

    /**
     * Body of the regular expression (without delimiters) matching every run of
     * characters that separates two words. The value is kept unchanged; this file
     * wraps it in "/" delimiters before handing it to the PCRE functions.
     */
    define( "SPLIT_REG_EXP", "[^a-zA-Z0-9àáèéíïòóúüÀÁÈÉÍÏÒÓÚÜ'$!,.^-]+");

    /**
     * \ingroup Bayesian
     *
     * This class takes care of splitting a valid html source in the different words that
     * make it up, taking tags into account. The main public method is BayesianTokenizer::tokenize()
     *
     * All pattern matching is done with the PCRE functions (preg_*); the POSIX
     * functions split(), ereg() and eregi() were removed from PHP in version 7.
     */
    class BayesianTokenizer extends Tokenizer
    {

        /**
         * HTML tags collected by _stripHtmlTags() during the current tokenize() call.
         */
        var $_htmlTags = array();

        /**
         * constructor, it only calls the parent constructor.
         * @see Tokenizer
         */
        function BayesianTokenizer()
        {
            $this->Tokenizer();
        }

        /**
         * given an input text, possibly containing HTML tags, it will split it into
         * all the different words that make it up. Words found in the text come first,
         * followed by the words extracted from the attributes of the HTML tags.
         *
         * @param text The text to split
         * @param unique Whether the return array should contain unique items or if the same
         * word is allowed more than once.
         * @return An array where each item is a word from the text, indexed from 0 without gaps
         */
        function tokenize($text, $unique = false)
        {
            $this->_htmlTags = array();
            $text = $this->_stripHtmlTags($text);
            $tokensFromHtml = $this->_tokenizeHtmlTags($this->_htmlTags);
            $tokensText = $this->_tokenize($text);
            $tokens = array_merge($tokensText, $tokensFromHtml);

            if ($unique)
            {
                // array_unique() keeps the key of the first occurrence and drops the
                // others, which leaves holes in the index sequence; reindex so that
                // callers can safely walk the result with a counter
                $tokens = array_values(array_unique($tokens));
            }

            return $tokens;
        }

        /**
         * Splits a piece of plain text into tokens.
         *
         * Tokens shaped like a price range ("$10-20") are split at the dashes and
         * the first two pieces are kept, the second one prefixed with "$". Tokens
         * containing a number with a separator ("1,000", "3.14") are kept whole.
         * Anything else is further split on ",", "." and "^".
         *
         * @private
         * @param text The text to split, without HTML tags
         * @return An array of tokens
         */
        function _tokenize($text)
        {
            $tokens = array();
            $candidates = preg_split("/" . SPLIT_REG_EXP . "/", $text);

            // preg_split() returns false when the regular expression engine fails
            if ($candidates === false)
            {
                return $tokens;
            }

            foreach ($candidates as $token)
            {
                if (strlen($token) > 0 && BayesianToken::isValid($token))
                {
                    if (preg_match('/\$[0-9]+-[0-9]+/', $token))
                    {
                        // a matching token always contains at least one dash, so
                        // explode() yields at least two pieces
                        $temp = explode("-", $token);

                        if (BayesianToken::isValid($temp[0]))
                        {
                            array_push($tokens, $temp[0]);
                        }

                        if (BayesianToken::isValid('$' . $temp[1]))
                        {
                            array_push($tokens, '$' . $temp[1]);
                        }
                    }
                    else if (!preg_match('/[0-9]+[,.^][0-9]+/', $token))
                    {
                        $pieces = preg_split('/[,.^]/', $token);

                        foreach ($pieces as $piece)
                        {
                            if (BayesianToken::isValid($piece))
                            {
                                array_push($tokens, $piece);
                            }
                        }
                    }
                    else
                    {
                        array_push($tokens, $token);
                    }
                }
            }

            return $tokens;
        }

        /**
         * Filters a list of HTML tags, keeping only the opening <a>, <img> and
         * <font> tags that are followed by a space (case-insensitive).
         *
         * @private
         * @param tags An array of HTML tags
         * @return An array with the tags that were kept
         */
        function _getValidHtmlTags($tags)
        {
            $validTags = array();

            foreach ($tags as $tag)
            {
                if (preg_match('/^<(a|img|font) /i', $tag))
                {
                    array_push($validTags, $tag);
                }
            }

            return $validTags;
        }

        /**
         * Removes every HTML tag from the text. As a side effect the tags found are
         * appended to $this->_htmlTags, which is then reduced to the tags accepted
         * by _getValidHtmlTags().
         *
         * @private
         * @param text The text, possibly containing HTML tags
         * @return The text without HTML tags
         */
        function _stripHtmlTags($text)
        {
            preg_match_all("/(<[^>]+>)/", $text, $regs);

            foreach ($regs[1] as $tag)
            {
                array_push($this->_htmlTags, $tag);
            }

            $this->_htmlTags = $this->_getValidHtmlTags($this->_htmlTags);

            return preg_replace("/<[^>]+>/", "", $text);
        }

        /**
         * Tokenizes every tag of the given list with _tokenizeHtmlTag().
         *
         * @private
         * @param tags An array of HTML tags
         * @return An array with the tokens of all the tags
         */
        function _tokenizeHtmlTags($tags)
        {
            $tokens = array();

            foreach ($tags as $tag)
            {
                $tokens = array_merge($tokens, $this->_tokenizeHtmlTag($tag));
            }

            return $tokens;
        }

        /**
         * Extracts tokens from the attribute values of one HTML tag. Tokens coming
         * from an attribute whose name contains "href" or "src" are prefixed with
         * TOKEN_URL_MARK.
         *
         * @private
         * @param tag An HTML tag
         * @return An array of tokens
         */
        function _tokenizeHtmlTag($tag)
        {
            $tokens = array();

            // $regs[1] holds everything before each "=", $regs[2] the value after it
            // up to the next whitespace or ">"
            preg_match_all("/([^=]+)=\s*([^\s>]+)/", $tag, $regs);
            $count = count($regs[1]);

            for ($i = 0; $i < $count; $i++)
            {
                $value = $regs[2][$i];
                $prefix = "";

                if (preg_match('/(href|src)/i', $regs[1][$i]))
                {
                    $prefix = TOKEN_URL_MARK;
                }

                $token = $this->_unquoteToken($value);
                $tokensTemp = $this->_tokenize($token);

                foreach ($tokensTemp as $tokenTemp)
                {
                    if (BayesianToken::isValid($tokenTemp))
                    {
                        array_push($tokens, $prefix . $tokenTemp);
                    }
                }
            }

            return $tokens;
        }

        /**
         * Removes one leading and one trailing quote (single or double) from a
         * token when both are present.
         *
         * @private
         * @param token The token to unquote
         * @return The token without the surrounding quotes, or the token unchanged
         */
        function _unquoteToken($token)
        {
            // "s" lets "." match newlines and "D" makes "$" match only at the very
            // end of the string, which is how the POSIX expression behaved
            if (preg_match('/^[\'"](.+)[\'"]$/sD', $token, $regs))
            {
                return $regs[1];
            }
            else
            {
                return $token;
            }
        }

        /**
         * Prefixes every token with the given mark, except the tokens that already
         * start with TOKEN_URL_MARK (compared case-insensitively). Array keys are
         * preserved, so arrays with gaps in their keys are handled correctly.
         *
         * @private
         * @param tokens An array of tokens
         * @param mark The mark to prepend
         * @return The array of tokens with the mark applied
         */
        function addContextMark($tokens, $mark)
        {
            $markLength = strlen(TOKEN_URL_MARK);

            foreach ($tokens as $key => $token)
            {
                // literal prefix comparison: the mark must not be interpreted as a
                // regular expression, since it may contain metacharacters
                if (strncasecmp($token, TOKEN_URL_MARK, $markLength) != 0)
                {
                    $tokens[$key] = $mark . $token;
                }
            }

            return $tokens;
        }
    }
?>
titre
Description
Corps
titre
Description
Corps
titre
Description
Corps
titre
Corps
Généré le : Mon Nov 26 21:04:15 2007 | par Balluche grâce à PHPXref 0.7 |
![]() |