[ Index ] |
|
Code source de DokuWiki 2006-11-06 |
<?php
/**
 * Fulltext search index functions (word, page and occurrence index)
 *
 * Three line based files in $conf['cachedir'] are maintained here:
 *   - word.idx   one word per line, the line number is the word id
 *   - page.idx   one page id per line, the line number is the document id
 *   - index.idx  one line per word id, holding "doc*count" pairs joined by ':'
 *
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */

if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
require_once(DOKU_CONF.'dokuwiki.php');
require_once(DOKU_INC.'inc/io.php');
require_once(DOKU_INC.'inc/utf8.php');
require_once(DOKU_INC.'inc/parserutils.php');

// Asian characters are handled as words. The following regexp defines the
// Unicode-Ranges for Asian characters
// Ranges taken from http://en.wikipedia.org/wiki/Unicode_block
// I'm no language expert. If you think some ranges are wrongly chosen or
// a range is missing, please contact me
define('IDX_ASIAN','['.
                   '\x{0E00}-\x{0E7F}'.  // Thai
                   '\x{2E80}-\x{D7AF}'.  // CJK -> Hangul
                   '\x{F900}-\x{FAFF}'.  // CJK Compatibility Ideographs
                   '\x{FE30}-\x{FE4F}'.  // CJK Compatibility Forms
                   ']');


/**
 * Split a page into words
 *
 * Reads the raw wiki text of the page, counts its words and maps every
 * word to its id in word.idx. Words that are not yet known are appended
 * to word.idx, which is written back on every call.
 *
 * @param  string $page  id of the page to analyse
 * @return array|false   array(word id => frequency), false if word.idx
 *                       could not be written
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Christopher Smith <chris@jalakai.co.uk>
 */
function idx_getPageWords($page){
    global $conf;

    // file() returns false when the file can't be read - work on an
    // empty list in that case
    $word_idx = file($conf['cachedir'].'/word.idx');
    if(!is_array($word_idx)) $word_idx = array();

    $swfile   = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    if(@file_exists($swfile)){
        $stopwords = file($swfile);
        if(!is_array($stopwords)) $stopwords = array();
    }else{
        $stopwords = array();
    }

    $body   = rawWiki($page);
    // strtr() ignores the surplus characters of the longer argument, so
    // one blank is needed for each of the three characters to replace
    $body   = strtr($body, "\r\n\t", '   ');
    $tokens = explode(' ', $body);
    $tokens = array_count_values($tokens);   // count the frequency of each token

    // ensure the deaccented or romanised page names of internal links are added to the token array
    // (this is necessary for the backlink function -- there maybe a better way!)
    if ($conf['deaccent']) {
        $links = p_get_metadata($page,'relation references');

        if (is_array($links) && count($links)) {
            $tmp = join(' ',array_keys($links));             // make a single string
            $tmp = strtr($tmp, ':', ' ');                    // replace namespace separator with a space
            $link_tokens = array_unique(explode(' ', $tmp)); // break into tokens

            foreach ($link_tokens as $link_token) {
                if (isset($tokens[$link_token])) continue;
                $tokens[$link_token] = 1;
            }
        }
    }

    $words = array();
    foreach ($tokens as $word => $count) {
        // simple filter to restrict use of utf8_stripspecials
        if (preg_match('/[^0-9A-Za-z]/u', $word)) {
            // handle asian chars as single words (may fail on older PHP version)
            $asia = @preg_replace('/('.IDX_ASIAN.')/u','\1 ',$word);
            if(!is_null($asia)) $word = $asia; //recover from regexp failure
            $arr = explode(' ', utf8_stripspecials($word,' ','._\-:\*'));
            $arr = array_count_values($arr);

            foreach ($arr as $w => $c) {
                // strlen() counts bytes, not characters
                if (!is_numeric($w) && strlen($w) < 3) continue;
                $w = utf8_strtolower($w);
                $words[$w] = $c * $count + (isset($words[$w]) ? $words[$w] : 0);
            }
        } else {
            if (!is_numeric($word) && strlen($word) < 3) continue;
            $word = strtolower($word);
            $words[$word] = $count + (isset($words[$word]) ? $words[$word] : 0);
        }
    }

    // arrive here with $words = array(word => frequency)

    $index = array(); //resulting index
    foreach ($words as $word => $freq) {
        // lines read by file() keep their newline, so compare with it;
        // strict search avoids numeric looking words being conflated
        $needle = "$word\n";
        if (is_int(array_search($needle,$stopwords,true))) continue;
        $wid = array_search($needle,$word_idx,true);
        if(!is_int($wid)){
            $word_idx[] = $needle;
            $wid = count($word_idx)-1;
        }
        $index[$wid] = $freq;
    }

    // save back word index
    $fh = fopen($conf['cachedir'].'/word.idx','w');
    if(!$fh){
        trigger_error("Failed to write word.idx", E_USER_ERROR);
        return false;
    }
    fwrite($fh,join('',$word_idx));
    fclose($fh);

    return $index;
}

/**
 * Adds/updates the search for the given page
 *
 * This is the core function of the indexer which does most
 * of the work. This function needs to be called with proper
 * locking!
 *
 * The index is rewritten line by line into index.tmp, which then
 * replaces index.idx.
 *
 * @param  string $page  id of the page to (re)index
 * @return bool          false if one of the index files could not be
 *                       opened or written, true otherwise
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function idx_addPage($page){
    global $conf;

    // load known documents
    $page_idx = file($conf['cachedir'].'/page.idx');
    if(!is_array($page_idx)) $page_idx = array();

    // get page id (this is the linenumber in page.idx)
    $pid = array_search("$page\n",$page_idx,true);
    if(!is_int($pid)){
        $page_idx[] = "$page\n";
        $pid = count($page_idx)-1;
        // page was new - write back
        $fh = fopen($conf['cachedir'].'/page.idx','w');
        if(!$fh) return false;
        fwrite($fh,join('',$page_idx));
        fclose($fh);
    }

    // get word usage in page
    $words = idx_getPageWords($page);
    if($words === false) return false;
    if(!count($words)) return true;

    // Open index and temp file
    $idx = fopen($conf['cachedir'].'/index.idx','r');
    $tmp = fopen($conf['cachedir'].'/index.tmp','w');
    if(!$idx || !$tmp){
        // don't leak the handle that could be opened
        if($idx) fclose($idx);
        if($tmp) fclose($tmp);
        trigger_error("Failed to open index files", E_USER_ERROR);
        return false;
    }

    // copy from index to temp file, modifying were needed
    $lno  = 0;
    $line = '';
    while (!feof($idx)) {
        // read full line
        $line .= fgets($idx, 4096);
        if(substr($line,-1) != "\n") continue;

        // write a new Line to temp file, only words used in the page
        // have an entry in $words
        $cnt = isset($words[$lno]) ? $words[$lno] : 0;
        idx_writeIndexLine($tmp,$line,$pid,$cnt);

        $line = ''; // reset line buffer
        $lno++;     // increase linecounter
    }
    fclose($idx);

    // a last line without trailing newline is still in the buffer,
    // keep it instead of silently dropping it
    if($line != ''){
        $cnt = isset($words[$lno]) ? $words[$lno] : 0;
        idx_writeIndexLine($tmp,$line,$pid,$cnt);
        $lno++;
    }

    // add missing lines (usually index and word should contain
    // the same number of lines, however if the page contained
    // new words the word file has some more lines which need to
    // be added here
    $word_idx = file($conf['cachedir'].'/word.idx');
    $wcnt = is_array($word_idx) ? count($word_idx) : 0;
    for(; $lno<$wcnt; $lno++){
        $cnt = isset($words[$lno]) ? $words[$lno] : 0;
        idx_writeIndexLine($tmp,'',$pid,$cnt);
    }

    // close the temp file and move it over to be the new one
    fclose($tmp);
    // try rename first (fast) fallback to copy (slow)
    io_rename($conf['cachedir'].'/index.tmp',
              $conf['cachedir'].'/index.idx');
    return true;
}

/**
 * Write a new index line to the filehandle
 *
 * This function writes an line for the index file to the
 * given filehandle. It removes the given document from
 * the given line and readds it when $count is >0.
 *
 * @param resource $fh     handle of the file to write to
 * @param string   $line   existing index line ("doc*count" pairs joined by ':')
 * @param int      $pid    id of the document to update
 * @param int      $count  new count for the document, a false value drops it
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function idx_writeIndexLine($fh,$line,$pid,$count){
    $line = trim($line);

    if($line != ''){
        $parts = explode(':',$line);
        // remove doc from given line
        foreach($parts as $part){
            if($part == '') continue;
            list($doc,$cnt) = explode('*',$part);
            if($doc != $pid){
                fwrite($fh,"$doc*$cnt:");
            }
        }
    }

    // add doc
    if ($count){
        fwrite($fh,"$pid*$count");
    }

    // add newline
    fwrite($fh,"\n");
}

/**
 * Lookup words in index
 *
 * Takes an array of word and will return a list of matching
 * documents for each one. A leading and/or trailing '*' in a
 * word is treated as wildcard.
 *
 * Important: No ACL checking is done here! All results are
 *            returned, regardless of permissions
 *
 * @param  array $words  list of words to look up
 * @return array|false   array(word => array(page id => count)),
 *                       false if the index could not be opened
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function idx_lookup($words){
    global $conf;

    $result = array();

    // load known words and documents
    $page_idx = file($conf['cachedir'].'/page.idx');
    if(!is_array($page_idx)) $page_idx = array();
    $word_idx = file($conf['cachedir'].'/word.idx');
    if(!is_array($word_idx)) $word_idx = array();

    // get word IDs
    $wids = array();
    foreach($words as $word){
        $result[$word] = array();
        $wild  = 0;   // 1 = leading wildcard, 2 = trailing wildcard, 3 = both
        $xword = $word;
        $ptn   = '';

        // check for wildcards
        if(substr($xword,0,1) == '*'){
            $xword = substr($xword,1);
            $wild  = 1;
            $ptn = '/'.preg_quote($xword,'/').'$/';
        }
        if(substr($xword,-1,1) == '*'){
            $xword = substr($xword,0,-1);
            $wild += 2;
        }

        // look for the ID(s) for the given word
        if($wild){  // handle wildcard search
            $cnt = count($word_idx);
            for($wid=0; $wid<$cnt; $wid++){
                $iword = $word_idx[$wid];
                if( (($wild==3) && is_int(strpos($iword,$xword))) ||
                    (($wild==1) && preg_match($ptn,$iword)) ||
                    (($wild==2) && (0 === strpos($iword,$xword)))
                  ){
                    $wids[] = $wid;
                    $result[$word][] = $wid;
                }
            }
        }else{     // handle exact search
            $wid = array_search("$word\n",$word_idx,true);
            if(is_int($wid)){
                $wids[] = $wid;
                $result[$word][] = $wid;
            }else{
                $result[$word] = array();
            }
        }
    }
    sort($wids);
    $wids = array_unique($wids);

    // Open index
    $idx = fopen($conf['cachedir'].'/index.idx','r');
    if(!$idx){
       msg("Failed to open index file",-1);
       return false;
    }

    // Walk the index til the lines are found
    $docs = array();                          // hold docs found
    $lno  = 0;
    $line = '';
    $srch = array_shift($wids);               // which word do we look for?
    // array_shift() returns null when no ids are left - word id 0 is
    // valid, so test for null explicitly
    while (!is_null($srch) && !feof($idx)) {
        // read full line
        $line .= fgets($idx, 4096);
        if(substr($line,-1) != "\n") continue;
        if($lno > $srch) break;   // shouldn't happen

        // do we want this line?
        if($lno == $srch){
            // add docs to list
            $docs[$srch] = idx_parseIndexLine($page_idx,$line);

            $srch = array_shift($wids);        // next word to look up
        }

        $line = ''; // reset line buffer
        $lno++;     // increase linecounter
    }
    fclose($idx);


    // merge found pages into final result array
    $final = array();
    foreach(array_keys($result) as $word){
        $final[$word] = array();
        foreach($result[$word] as $wid){
            // the index may be shorter than the word list
            if(!isset($docs[$wid])) continue;
            foreach ($docs[$wid] as $hitkey => $hitcnt) {
                $sofar = isset($final[$word][$hitkey]) ? $final[$word][$hitkey] : 0;
                $final[$word][$hitkey] = $hitcnt + $sofar;
            }
        }
    }
    return $final;
}

/**
 * Returns a list of documents and counts from a index line
 *
 * It omits docs with a count of 0 and pages that no longer
 * exist.
 *
 * @param  array  $page_idx The list of known pages
 * @param  string $line     A line from the main index
 * @return array            array(page id => count)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function idx_parseIndexLine(&$page_idx,$line){
    $result = array();

    $line = trim($line);
    if($line == '') return $result;

    $parts = explode(':',$line);
    foreach($parts as $part){
        if($part == '') continue;
        list($doc,$cnt) = explode('*',$part);
        if(!$cnt) continue;
        // skip document ids that are not listed in page.idx
        if(!isset($page_idx[$doc])) continue;
        $doc = trim($page_idx[$doc]);
        // compare against the empty string, a page named "0" is falsy
        if($doc === '') continue;
        // make sure the document still exists
        if(!@file_exists(wikiFN($doc,'',false))) continue;

        $result[$doc] = $cnt;
    }
    return $result;
}

/**
 * Tokenizes a string into an array of search words
 *
 * Uses the same algorithm as idx_getPageWords()
 *
 * @param  string   $string     the query as given by the user
 * @param  arrayref $stopwords  array of stopwords (lines including newline)
 * @param  boolean  $wc         are wildcards allowed?
 * @return array                list of lowercased words
 *
 * @todo make combined function to use alone or in getPageWords
 */
function idx_tokenizer($string,&$stopwords,$wc=false){
    $words = array();
    // when wildcards are not allowed, '\*' is appended to the third
    // argument of utf8_stripspecials() below
    $wc = ($wc) ? '' : '\*';
    $usestop = (is_array($stopwords) && count($stopwords));

    if(preg_match('/[^0-9A-Za-z]/u', $string)){
        // handle asian chars as single words (may fail on older PHP version)
        $asia = @preg_replace('/('.IDX_ASIAN.')/u','\1 ',$string);
        if(!is_null($asia)) $string = $asia; //recover from regexp failure

        $arr = explode(' ', utf8_stripspecials($string,' ','\._\-:'.$wc));
        foreach ($arr as $w) {
            if (!is_numeric($w) && strlen($w) < 3) continue;
            $w = utf8_strtolower($w);
            if($usestop && is_int(array_search("$w\n",$stopwords,true))) continue;
            $words[] = $w;
        }
    }else{
        $w = $string;
        if (!is_numeric($w) && strlen($w) < 3) return $words;
        $w = strtolower($w);
        if($usestop && is_int(array_search("$w\n",$stopwords,true))) return $words;
        $words[] = $w;
    }

    return $words;
}

//Setup VIM: ex: et ts=4 enc=utf-8 :
titre
Description
Corps
titre
Description
Corps
titre
Description
Corps
titre
Corps
Généré le : Tue Apr 3 20:47:31 2007 | par Balluche grâce à PHPXref 0.7 |