CMS made simple 1.0.5 : /modules/Search/PorterStemmer.class.php source

[Sommaire] [Imprimer]
   1  <?php
   2      /**
   3      * Copyright (c) 2005 Richard Heyes (http://www.phpguru.org/)
   4      *
   5      * All rights reserved.
   6      *
   7      * This script is free software.
   8      */
   9  
  10      /**
  11      * PHP5 Implementation of the Porter Stemmer algorithm. Certain elements
  12      * were borrowed from the (broken) implementation by Jon Abernathy.
  13      *
  14      * Usage:
  15      *
  16      *  $stemmer = new PorterStemmer(); $stem = $stemmer->Stem($word);
  17      *
  18      * How easy is that?
  19      */
  20  
  21      class PorterStemmer
  22      {
  23          /**
  24          * Regex for matching a consonant
  25          * @var string
  26          */
  27          var $regex_consonant;
  28  
  29  
  30          /**
  31          * Regex for matching a vowel
  32          * @var string
  33          */
  34          var $regex_vowel;
  35  
  36  	function PorterStemmer()
  37      {
  38          $this->regex_consonant = '(?:[bcdfghjklmnpqrstvwxz]|(?<=[aeiou])y|^y)';
  39          $this->regex_vowel = '(?:[aeiou]|(?<![aeiou])y)';
  40      }
  41  
  42          /**
  43          * Stems a word. Simple huh?
  44          *
  45          * @param  string $word Word to stem
  46          * @return string       Stemmed word
  47          */
  48          function Stem($word)
  49          {
  50              if (strlen($word) <= 2) {
  51                  return $word;
  52              }
  53  
  54              $word = $this->step1ab($word);
  55              $word = $this->step1c($word);
  56              $word = $this->step2($word);
  57              $word = $this->step3($word);
  58              $word = $this->step4($word);
  59              $word = $this->step5($word);
  60  
  61              return $word;
  62          }
  63  
  64  
  65          /**
  66          * Step 1
  67          */
  68          function step1ab($word)
  69          {
  70              // Part a
  71              if (substr($word, -1) == 's') {
  72  
  73                     $this->replace($word, 'sses', 'ss')
  74                  || $this->replace($word, 'ies', 'i')
  75                  || $this->replace($word, 'ss', 'ss')
  76                  || $this->replace($word, 's', '');
  77              }
  78  
  79              // Part b
  80              if (substr($word, -2, 1) != 'e' OR !$this->replace($word, 'eed', 'ee', 0)) { // First rule
  81                  $v = $this->regex_vowel;
  82  
  83                  // ing and ed
  84                  if (   preg_match("#$v+#", substr($word, 0, -3)) && $this->replace($word, 'ing', '')
  85                      OR preg_match("#$v+#", substr($word, 0, -2)) && $this->replace($word, 'ed', '')) { // Note use of && and OR, for precedence reasons
  86  
  87                      // If one of above two test successful
  88                      if (    !$this->replace($word, 'at', 'ate')
  89                          AND !$this->replace($word, 'bl', 'ble')
  90                          AND !$this->replace($word, 'iz', 'ize')) {
  91  
  92                          // Double consonant ending
  93                          if (    $this->doubleConsonant($word)
  94                              AND substr($word, -2) != 'll'
  95                              AND substr($word, -2) != 'ss'
  96                              AND substr($word, -2) != 'zz') {
  97  
  98                              $word = substr($word, 0, -1);
  99  
 100                          } else if ($this->m($word) == 1 AND $this->cvc($word)) {
 101                              $word .= 'e';
 102                          }
 103                      }
 104                  }
 105              }
 106  
 107              return $word;
 108          }
 109  
 110  
 111          /**
 112          * Step 1c
 113          *
 114          * @param string $word Word to stem
 115          */
 116          function step1c($word)
 117          {
 118              $v = $this->regex_vowel;
 119  
 120              if (substr($word, -1) == 'y' && preg_match("#$v+#", substr($word, 0, -1))) {
 121                  $this->replace($word, 'y', 'i');
 122              }
 123  
 124              return $word;
 125          }
 126  
 127  
 128          /**
 129          * Step 2
 130          *
 131          * @param string $word Word to stem
 132          */
 133          function step2($word)
 134          {
 135              switch (substr($word, -2, 1)) {
 136                  case 'a':
 137                         $this->replace($word, 'ational', 'ate', 0)
 138                      OR $this->replace($word, 'tional', 'tion', 0);
 139                      break;
 140  
 141                  case 'c':
 142                         $this->replace($word, 'enci', 'ence', 0)
 143                      OR $this->replace($word, 'anci', 'ance', 0);
 144                      break;
 145  
 146                  case 'e':
 147                      $this->replace($word, 'izer', 'ize', 0);
 148                      break;
 149  
 150                  case 'g':
 151                      $this->replace($word, 'logi', 'log', 0);
 152                      break;
 153  
 154                  case 'l':
 155                         $this->replace($word, 'entli', 'ent', 0)
 156                      OR $this->replace($word, 'ousli', 'ous', 0)
 157                      OR $this->replace($word, 'alli', 'al', 0)
 158                      OR $this->replace($word, 'bli', 'ble', 0)
 159                      OR $this->replace($word, 'eli', 'e', 0);
 160                      break;
 161  
 162                  case 'o':
 163                         $this->replace($word, 'ization', 'ize', 0)
 164                      OR $this->replace($word, 'ation', 'ate', 0)
 165                      OR $this->replace($word, 'ator', 'ate', 0);
 166                      break;
 167  
 168                  case 's':
 169                         $this->replace($word, 'iveness', 'ive', 0)
 170                      OR $this->replace($word, 'fulness', 'ful', 0)
 171                      OR $this->replace($word, 'ousness', 'ous', 0)
 172                      OR $this->replace($word, 'alism', 'al', 0);
 173                      break;
 174  
 175                  case 't':
 176                         $this->replace($word, 'biliti', 'ble', 0)
 177                      OR $this->replace($word, 'aliti', 'al', 0)
 178                      OR $this->replace($word, 'iviti', 'ive', 0);
 179                      break;
 180              }
 181  
 182              return $word;
 183          }
 184  
 185  
 186          /**
 187          * Step 3
 188          *
 189          * @param string $word String to stem
 190          */
 191          function step3($word)
 192          {
 193              switch (substr($word, -2, 1)) {
 194                  case 'a':
 195                      $this->replace($word, 'ical', 'ic', 0);
 196                      break;
 197  
 198                  case 's':
 199                      $this->replace($word, 'ness', '', 0);
 200                      break;
 201  
 202                  case 't':
 203                         $this->replace($word, 'icate', 'ic', 0)
 204                      OR $this->replace($word, 'iciti', 'ic', 0);
 205                      break;
 206  
 207                  case 'u':
 208                      $this->replace($word, 'ful', '', 0);
 209                      break;
 210  
 211                  case 'v':
 212                      $this->replace($word, 'ative', '', 0);
 213                      break;
 214  
 215                  case 'z':
 216                      $this->replace($word, 'alize', 'al', 0);
 217                      break;
 218              }
 219  
 220              return $word;
 221          }
 222  
 223  
 224          /**
 225          * Step 4
 226          *
 227          * @param string $word Word to stem
 228          */
 229          function step4($word)
 230          {
 231              switch (substr($word, -2, 1)) {
 232                  case 'a':
 233                      $this->replace($word, 'al', '', 1);
 234                      break;
 235  
 236                  case 'c':
 237                         $this->replace($word, 'ance', '', 1)
 238                      OR $this->replace($word, 'ence', '', 1);
 239                      break;
 240  
 241                  case 'e':
 242                      $this->replace($word, 'er', '', 1);
 243                      break;
 244  
 245                  case 'i':
 246                      $this->replace($word, 'ic', '', 1);
 247                      break;
 248  
 249                  case 'l':
 250                         $this->replace($word, 'able', '', 1)
 251                      OR $this->replace($word, 'ible', '', 1);
 252                      break;
 253  
 254                  case 'n':
 255                         $this->replace($word, 'ant', '', 1)
 256                      OR $this->replace($word, 'ement', '', 1)
 257                      OR $this->replace($word, 'ment', '', 1)
 258                      OR $this->replace($word, 'ent', '', 1);
 259                      break;
 260  
 261                  case 'o':
 262                      if (substr($word, -4) == 'tion' OR substr($word, -4) == 'sion') {
 263                         $this->replace($word, 'ion', '', 1);
 264                      } else {
 265                          $this->replace($word, 'ou', '', 1);
 266                      }
 267                      break;
 268  
 269                  case 's':
 270                      $this->replace($word, 'ism', '', 1);
 271                      break;
 272  
 273                  case 't':
 274                         $this->replace($word, 'ate', '', 1)
 275                      OR $this->replace($word, 'iti', '', 1);
 276                      break;
 277  
 278                  case 'u':
 279                      $this->replace($word, 'ous', '', 1);
 280                      break;
 281  
 282                  case 'v':
 283                      $this->replace($word, 'ive', '', 1);
 284                      break;
 285  
 286                  case 'z':
 287                      $this->replace($word, 'ize', '', 1);
 288                      break;
 289              }
 290  
 291              return $word;
 292          }
 293  
 294  
 295          /**
 296          * Step 5
 297          *
 298          * @param string $word Word to stem
 299          */
 300          function step5($word)
 301          {
 302              // Part a
 303              if (substr($word, -1) == 'e') {
 304                  if ($this->m(substr($word, 0, -1)) > 1) {
 305                      $this->replace($word, 'e', '');
 306  
 307                  } else if ($this->m(substr($word, 0, -1)) == 1) {
 308  
 309                      if (!$this->cvc(substr($word, 0, -1))) {
 310                          $this->replace($word, 'e', '');
 311                      }
 312                  }
 313              }
 314  
 315              // Part b
 316              if ($this->m($word) > 1 AND $this->doubleConsonant($word) AND substr($word, -1) == 'l') {
 317                  $word = substr($word, 0, -1);
 318              }
 319  
 320              return $word;
 321          }
 322  
 323  
 324          /**
 325          * Replaces the first string with the second, at the end of the string. If third
 326          * arg is given, then the preceding string must match that m count at least.
 327          *
 328          * @param  string $str   String to check
 329          * @param  string $check Ending to check for
 330          * @param  string $repl  Replacement string
 331          * @param  int    $m     Optional minimum number of m() to meet
 332          * @return bool          Whether the $check string was at the end
 333          *                       of the $str string. True does not necessarily mean
 334          *                       that it was replaced.
 335          */
 336          function replace(&$str, $check, $repl, $m = null)
 337          {
 338              $len = 0 - strlen($check);
 339  
 340              if (substr($str, $len) == $check) {
 341                  $substr = substr($str, 0, $len);
 342                  if (is_null($m) OR $this->m($substr) > $m) {
 343                      $str = $substr . $repl;
 344                  }
 345  
 346                  return true;
 347              }
 348  
 349              return false;
 350          }
 351  
 352  
 353          /**
 354          * What, you mean it's not obvious from the name?
 355          *
 356          * m() measures the number of consonant sequences in $str. if c is
 357          * a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
 358          * presence,
 359          *
 360          * <c><v>       gives 0
 361          * <c>vc<v>     gives 1
 362          * <c>vcvc<v>   gives 2
 363          * <c>vcvcvc<v> gives 3
 364          *
 365          * @param  string $str The string to return the m count for
 366          * @return int         The m count
 367          */
 368          function m($str)
 369          {
 370              $c = $this->regex_consonant;
 371              $v = $this->regex_vowel;
 372  
 373              $str = preg_replace("#^$c+#", '', $str);
 374              $str = preg_replace("#$v+$#", '', $str);
 375  
 376              preg_match_all("#($v+$c+)#", $str, $matches);
 377  
 378              return count($matches[1]);
 379          }
 380  
 381  
 382          /**
 383          * Returns true/false as to whether the given string contains two
 384          * of the same consonant next to each other at the end of the string.
 385          *
 386          * @param  string $str String to check
 387          * @return bool        Result
 388          */
 389          function doubleConsonant($str)
 390          {
 391              $c = $this->regex_consonant;
 392  
 393              return preg_match("#$c{2}$#", $str, $matches) AND $matches[0]{0} == $matches[0]{1};
 394          }
 395  
 396  
 397          /**
 398          * Checks for ending CVC sequence where second C is not W, X or Y
 399          *
 400          * @param  string $str String to check
 401          * @return bool        Result
 402          */
 403          function cvc($str)
 404          {
 405              $c = $this->regex_consonant;
 406              $v = $this->regex_vowel;
 407  
 408              return     preg_match("#($c$v$c)$#", $str, $matches)
 409                     AND strlen($matches[1]) == 3
 410                     AND $matches[1]{2} != 'w'
 411                     AND $matches[1]{2} != 'x'
 412                     AND $matches[1]{2} != 'y';
 413          }
 414      }
 415  ?>
Code source de CMS made simple 1.0.5

/modules/Search/ -> PorterStemmer.class.php (source)