DokuWiki 2006-11-06 : /inc/indexer.php source

[Sommaire] [Imprimer]
   1  <?php
   2  /**
   3   * DokuWiki fulltext search indexer functions
   4   *
   5   * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
   6   * @author     Andreas Gohr <andi@splitbrain.org>
   7   */
   8  
   9    if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
  10    require_once(DOKU_CONF.'dokuwiki.php');
  11    require_once (DOKU_INC.'inc/io.php');
  12    require_once (DOKU_INC.'inc/utf8.php');
  13    require_once (DOKU_INC.'inc/parserutils.php');
  14  
  // Characters from Asian scripts are indexed as one word per character.
  // IDX_ASIAN is the PCRE character class listing the Unicode ranges that get
  // this treatment; it is embedded in patterns using the /u modifier.
  // Ranges taken from http://en.wikipedia.org/wiki/Unicode_block
  // The original author states he is no language expert and asks to be
  // contacted if a range is wrongly chosen or missing.
  define('IDX_ASIAN', '['.implode('', array(
      '\x{0E00}-\x{0E7F}',  // Thai
      '\x{2E80}-\x{D7AF}',  // CJK -> Hangul
      '\x{F900}-\x{FAFF}',  // CJK Compatibility Ideographs
      '\x{FE30}-\x{FE4F}',  // CJK Compatibility Forms
  )).']');
  26  
  27  
  /**
   * Split a page into words
   *
   * Reads the raw wiki text of the page, counts how often each word occurs,
   * drops stopwords, registers words that are not yet listed in word.idx and
   * writes word.idx back to the cache directory.
   *
   * @author Andreas Gohr <andi@splitbrain.org>
   * @author Christopher Smith <chris@jalakai.co.uk>
   *
   * @param  string $page  id of the page to analyse
   * @return array|false   word id (line number in word.idx) => frequency,
   *                       false if word.idx could not be opened for writing
   */
  function idx_getPageWords($page){
      global $conf;

      // file() keeps the trailing "\n" of every line; all lookups below
      // therefore compare against "$word\n"
      $word_idx = file($conf['cachedir'].'/word.idx');
      if(!is_array($word_idx)) $word_idx = array();   // no readable word list yet

      $swfile    = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
      $stopwords = array();
      if(@file_exists($swfile)){
          $stopwords = file($swfile);
          if(!is_array($stopwords)) $stopwords = array();
      }

      $body   = rawWiki($page);
      $body   = strtr($body, "\r\n\t", '   ');
      $tokens = array_count_values(explode(' ', $body));   // token => frequency

      // ensure the deaccented or romanised page names of internal links are added to the token array
      // (this is necessary for the backlink function -- there maybe a better way!)
      if ($conf['deaccent']) {
          $links = p_get_metadata($page,'relation references');

          // the metadata may be missing for this page; only arrays carry link ids
          if (is_array($links)) {
              $tmp = join(' ',array_keys($links));                // make a single string
              $tmp = strtr($tmp, ':', ' ');                       // replace namespace separator with a space
              $link_tokens = array_unique(explode(' ', $tmp));    // break into tokens

              foreach ($link_tokens as $link_token) {
                  if (isset($tokens[$link_token])) continue;
                  $tokens[$link_token] = 1;
              }
          }
      }

      $words = array();
      foreach ($tokens as $word => $count) {
          // simple filter to restrict use of utf8_stripspecials
          if (preg_match('/[^0-9A-Za-z]/u', $word)) {
              // handle asian chars as single words (may fail on older PHP version)
              $asia = @preg_replace('/('.IDX_ASIAN.')/u','\1 ',$word);
              if(!is_null($asia)) $word = $asia; //recover from regexp failure
              $arr = explode(' ', utf8_stripspecials($word,' ','._\-:\*'));
              $arr = array_count_values($arr);

              foreach ($arr as $w => $c) {
                  // strlen() counts bytes, so non-numeric words shorter than
                  // three bytes are dropped
                  if (!is_numeric($w) && strlen($w) < 3) continue;
                  $w = utf8_strtolower($w);
                  $words[$w] = $c * $count + (isset($words[$w]) ? $words[$w] : 0);
              }
          } else {
              if (!is_numeric($word) && strlen($word) < 3) continue;
              $word = strtolower($word);
              $words[$word] = $count + (isset($words[$word]) ? $words[$word] : 0);
          }
      }

      // arrive here with $words = array(word => frequency)

      // Exact-match lookup tables keyed by the full line (including "\n").
      // A hash lookup replaces the former loose array_search(): loose
      // comparison treats numeric-looking lines such as "1e3\n" and "1000\n"
      // as equal on PHP 8, which would hand out the wrong word id.
      $stop_map = array_flip($stopwords);
      $word_map = array();
      foreach ($word_idx as $wid => $entry) {
          // on duplicate lines the first one wins, like array_search() did
          if (!isset($word_map[$entry])) $word_map[$entry] = $wid;
      }

      $index = array(); //resulting index
      foreach ($words as $word => $freq) {
          $entry = "$word\n";
          if (isset($stop_map[$entry])) continue;

          if (isset($word_map[$entry])) {
              $wid = $word_map[$entry];
          } else {
              // unknown word: append it, its id is the new last line number
              $word_idx[] = $entry;
              $wid = count($word_idx)-1;
              $word_map[$entry] = $wid;
          }
          $index[$wid] = $freq;
      }

      // save back word index
      $fh = fopen($conf['cachedir'].'/word.idx','w');
      if(!$fh){
          trigger_error("Failed to write word.idx", E_USER_ERROR);
          return false;
      }
      fwrite($fh,join('',$word_idx));
      fclose($fh);

      return $index;
  }
 112  
 /**
  * Adds/updates the search index for the given page
  *
  * This is the core function of the indexer which does most
  * of the work. This function needs to be called with proper
  * locking!
  *
  * The page is registered in page.idx if it is new, its words are collected
  * via idx_getPageWords() and index.idx is rewritten through index.tmp with
  * the page's entry updated on every line.
  *
  * @author Andreas Gohr <andi@splitbrain.org>
  *
  * @param  string $page  id of the page to index
  * @return bool  true on success (or when the page yields no words),
  *               false on failure
  */
 function idx_addPage($page){
     global $conf;

     // load known documents
     $page_idx = file($conf['cachedir'].'/page.idx');
     if(!is_array($page_idx)) $page_idx = array();   // no readable page list yet

     // get page id (this is the linenumber in page.idx)
     // strict search: loose comparison would match numeric-looking ids such
     // as "0123\n" and "123\n" on PHP 8
     $pid = array_search("$page\n",$page_idx,true);
     if($pid === false){
         $page_idx[] = "$page\n";
         $pid = count($page_idx)-1;
         // page was new - write back
         $fh = fopen($conf['cachedir'].'/page.idx','w');
         if(!$fh) return false;
         fwrite($fh,join('',$page_idx));
         fclose($fh);
     }

     // get word usage in page
     $words = idx_getPageWords($page);
     if($words === false) return false;
     // NOTE(review): returning here leaves entries from an earlier version of
     // the page in index.idx untouched - confirm this is intended
     if(!count($words)) return true;

     // Open index and temp file
     $idx = fopen($conf['cachedir'].'/index.idx','r');
     $tmp = fopen($conf['cachedir'].'/index.tmp','w');
     if(!$idx || !$tmp){
         // do not leak the handle that did open
         if($idx) fclose($idx);
         if($tmp) fclose($tmp);
         trigger_error("Failed to open index files", E_USER_ERROR);
         return false;
     }

     // copy from index to temp file, modifying where needed; the line number
     // in index.idx is the word id. fgets() without a length returns whole
     // lines, including a final line that lacks its newline.
     $lno = 0;
     while(($line = fgets($idx)) !== false){
         $count = isset($words[$lno]) ? $words[$lno] : 0;
         idx_writeIndexLine($tmp,$line,$pid,$count);
         $lno++;
     }
     fclose($idx);

     // add missing lines (usually index and word should contain
     // the same number of lines, however if the page contained
     // new words the word file has some more lines which need to
     // be added here
     $word_idx = file($conf['cachedir'].'/word.idx');
     $wcnt = is_array($word_idx) ? count($word_idx) : 0;
     for(; $lno<$wcnt; $lno++){
         $count = isset($words[$lno]) ? $words[$lno] : 0;
         idx_writeIndexLine($tmp,'',$pid,$count);
     }

     // close the temp file and move it over to be the new one
     fclose($tmp);
     // Report the outcome of the move instead of an unconditional false.
     // assumes io_rename() returns a truthy value on success - TODO confirm
     return (bool) io_rename($conf['cachedir'].'/index.tmp',
                             $conf['cachedir'].'/index.idx');
 }
 186  
 /**
  * Write a new index line to the filehandle
  *
  * An index line is a list of "doc*count" entries separated by colons.
  * The entry for $pid is dropped from the given line; when $count is
  * non-zero a fresh "$pid*$count" entry is appended at the end. The
  * result is always terminated by a newline.
  *
  * @author Andreas Gohr <andi@splitbrain.org>
  *
  * @param resource $fh     open, writable file handle
  * @param string   $line   existing index line, may be empty
  * @param int      $pid    id of the document being updated
  * @param int      $count  new count for the document, 0 removes it
  */
 function idx_writeIndexLine($fh,$line,$pid,$count){
     $out  = '';
     $line = trim($line);

     if($line != ''){
         // keep every entry that belongs to another document
         foreach(explode(':',$line) as $entry){
             if($entry == '') continue;
             list($doc,$cnt) = explode('*',$entry);
             if($doc == $pid) continue;
             $out .= "$doc*$cnt:";
         }
     }

     // re-add the document with its new count
     if($count) $out .= "$pid*$count";

     fwrite($fh,$out."\n");
 }
 219  
 /**
  * Lookup words in index
  *
  * Takes an array of words and will return a list of matching
  * documents for each one. A leading and/or trailing '*' in a word
  * makes it a wildcard search (suffix, prefix or substring match).
  *
  * Important: No ACL checking is done here! All results are
  *            returned, regardless of permissions
  *
  * @author Andreas Gohr <andi@splitbrain.org>
  *
  * @param  array $words  the words to look up
  * @return array|false   word => array(page id => summed count),
  *                       false if index.idx could not be opened
  */
 function idx_lookup($words){
     global $conf;

     $result = array();

     // load known words and documents (lines keep their trailing "\n")
     $page_idx = file($conf['cachedir'].'/page.idx');
     $word_idx = file($conf['cachedir'].'/word.idx');
     if(!is_array($page_idx)) $page_idx = array();
     if(!is_array($word_idx)) $word_idx = array();

     // get word IDs
     $wids = array();
     $cnt  = count($word_idx);
     foreach($words as $word){
         $result[$word] = array();
         $xword = (string) $word;
         $wild  = 0;   // bit 1: leading '*', bit 2: trailing '*'

         // check for wildcards
         if(substr($xword,0,1) == '*'){
             $xword = (string) substr($xword,1);
             $wild  = 1;
         }
         if(substr($xword,-1,1) == '*'){
             $xword = (string) substr($xword,0,-1);
             $wild += 2;
         }

         if(!$wild){
             // exact search; strict so that numeric-looking words such as
             // "1e3" and "1000" are not treated as equal
             $wid = array_search("$word\n",$word_idx,true);
             if($wid !== false){
                 $wids[] = $wid;
                 $result[$word][] = $wid;
             }
             continue;
         }

         // a wildcard without any remaining characters matches nothing
         if($xword === '') continue;

         // '$' also matches before the trailing newline of an index line
         $ptn = '/'.preg_quote($xword,'/').'$/';
         for($wid=0; $wid<$cnt; $wid++){
             $iword = $word_idx[$wid];
             if($wild == 3){
                 $hit = (strpos($iword,$xword) !== false);   // *word*
             }elseif($wild == 1){
                 $hit = (bool) preg_match($ptn,$iword);      // *word
             }else{
                 $hit = (strpos($iword,$xword) === 0);       // word*
             }
             if($hit){
                 $wids[] = $wid;
                 $result[$word][] = $wid;
             }
         }
     }
     $wids = array_unique($wids);
     sort($wids);

     // Open index
     $idx = fopen($conf['cachedir'].'/index.idx','r');
     if(!$idx){
        msg("Failed to open index file",-1);
        return false;
     }

     // Walk the index til the lines are found. Word ids are line numbers and
     // $wids is sorted, so one forward pass is enough. array_shift() yields
     // null once all ids are consumed; the test must be strict because word
     // id 0 is valid.
     $docs = array();                          // hold docs found
     $lno  = 0;
     $srch = array_shift($wids);               // which word do we look for?
     while($srch !== null && ($line = fgets($idx)) !== false){
         if($lno == $srch){
             // add docs to list
             $docs[$srch] = idx_parseIndexLine($page_idx,$line);
             $srch = array_shift($wids);       // next word to look up
         }
         $lno++;
     }
     fclose($idx);

     // merge found pages into final result array
     $final = array();
     foreach($result as $word => $found){
         $final[$word] = array();
         foreach($found as $wid){
             // the index may have fewer lines than the word list
             if(!isset($docs[$wid])) continue;
             foreach($docs[$wid] as $hitkey => $hitcnt){
                 $sofar = isset($final[$word][$hitkey]) ? $final[$word][$hitkey] : 0;
                 $final[$word][$hitkey] = (int) $hitcnt + $sofar;
             }
         }
     }
     return $final;
 }
 335  
 /**
  * Returns a list of documents and counts from an index line
  *
  * It omits docs with a count of 0, malformed entries, ids that are
  * not listed in $page_idx and pages that no longer exist.
  *
  * @param  array  $page_idx The list of known pages
  * @param  string $line     A line from the main index
  * @return array  page id => count
  * @author Andreas Gohr <andi@splitbrain.org>
  */
 function idx_parseIndexLine(&$page_idx,$line){
     $result = array();

     $line = trim($line);
     if($line == '') return $result;

     foreach(explode(':',$line) as $part){
         if($part == '') continue;

         // entries look like "doc*count"; skip anything without a count
         $pair = explode('*',$part);
         if(count($pair) < 2) continue;
         list($doc,$cnt) = $pair;
         if(!$cnt) continue;

         // translate the numeric document id into the page id
         if(!isset($page_idx[$doc])) continue;
         $doc = trim($page_idx[$doc]);
         // compare against '' explicitly: a page named "0" is a valid id
         if($doc === '') continue;

         // make sure the document still exists
         if(!@file_exists(wikiFN($doc,'',false))) continue;

         $result[$doc] = $cnt;
     }
     return $result;
 }
 366  
 /**
  * Tokenizes a string into an array of search words
  *
  * Uses the same algorithm as idx_getPageWords(): non-numeric words
  * shorter than three bytes and stopwords are dropped, the remaining
  * words are lowercased.
  *
  * @param string   $string     the query as given by the user
  * @param arrayref $stopwords  array of stopwords, one per entry with a
  *                             trailing newline; anything that is not a
  *                             non-empty array disables stopword filtering
  * @param boolean  $wc         are wildcards allowed?
  * @return array   list of search words
  *
  * @todo make combined function to use alone or in getPageWords
  */
 function idx_tokenizer($string,&$stopwords,$wc=false){
     $words = array();

     // when wildcards are not allowed the '*' is stripped like other specials
     $strip_wc = $wc ? '' : '\*';
     // both branches below apply the same stopword guard
     $use_sw = is_array($stopwords) && count($stopwords);

     if(preg_match('/[^0-9A-Za-z]/u', $string)){
         // handle asian chars as single words (may fail on older PHP version)
         $asia = @preg_replace('/('.IDX_ASIAN.')/u','\1 ',$string);
         if(!is_null($asia)) $string = $asia; //recover from regexp failure

         $arr = explode(' ', utf8_stripspecials($string,' ','\._\-:'.$strip_wc));
         foreach ($arr as $w) {
             if (!is_numeric($w) && strlen($w) < 3) continue;
             $w = utf8_strtolower($w);
             // strict search: no loose numeric-string matching
             if($use_sw && array_search("$w\n",$stopwords,true) !== false) continue;
             $words[] = $w;
         }
     }else{
         // plain ASCII letters and digits only: the string is a single word
         $w = $string;
         if (!is_numeric($w) && strlen($w) < 3) return $words;
         $w = strtolower($w);
         if($use_sw && array_search("$w\n",$stopwords,true) !== false) return $words;
         $words[] = $w;
     }

     return $words;
 }
 404  
 405  //Setup VIM: ex: et ts=4 enc=utf-8 :
Code source de DokuWiki 2006-11-06

/inc/ -> indexer.php (source)