LifeType 1.2.4 : /class/bayesian/bayesiantokenizer.class.php source

[Sommaire] [Imprimer]
   1  <?php
   2  
   3      lt_include( PLOG_CLASS_PATH."class/dao/bayesiantoken.class.php" );
   4      lt_include( PLOG_CLASS_PATH."class/bayesian/tokenizer.class.php" );
   5  
   // Delimiter pattern used by BayesianTokenizer::_tokenize(): any run of characters
   // that is NOT in this set (ASCII letters, digits, the listed accented vowels and
   // the punctuation ' $ ! , . ^ -) separates two tokens. The trailing "-" and the
   // non-leading "^" inside the brackets are literal characters, not operators.
   6      define( "SPLIT_REG_EXP", "[^a-zA-Z0-9àáèéíïòóúüÀÁÈÉÍÏÒÓÚÜ'$!,.^-]+");
   7  
   8      /**
   9       * \ingroup Bayesian
  10       *
  11       * This class takes care of splitting a valid html source in the different words that
  12       * make it up, taking tags into account. The main public method is BayesianTokenizer::tokenize()
  13       */     
  class BayesianTokenizer extends Tokenizer
  {

      /**
       * HTML tags found by the last call to _stripHtmlTags(), already reduced to the
       * tags accepted by _getValidHtmlTags(). Reset at the start of every tokenize().
       */
      var $_htmlTags = array();

      /**
       * constructor, it only calls the parent constructor.
       * @see Tokenizer
       */
      function BayesianTokenizer()
      {
          $this->Tokenizer();
      }

      /**
       * given an input text, possibly containing HTML tags, it will split it into
       * all the different words that make it up. Words found in the plain text come
       * first, followed by the words extracted from the attributes of the retained
       * HTML tags.
       *
       * @param text The text to split
       * @param unique Whether the return array should contain unique items or if the same
       * word is allowed more than once.
       * @return An array (consecutively indexed from 0) where each item is a word
       * from the text
       */
      function tokenize($text, $unique = false)
      {
          $this->_htmlTags = array();

          // _stripHtmlTags() fills $this->_htmlTags as a side effect, so it has to run
          // before the tags are tokenized.
          $plainText = $this->_stripHtmlTags($text);
          $tokensFromHtml = $this->_tokenizeHtmlTags($this->_htmlTags);
          $tokensText = $this->_tokenize($plainText);
          $tokens = array_merge($tokensText, $tokensFromHtml);

          if ($unique)
          {
              // array_unique() preserves the original keys and therefore leaves gaps;
              // reindex so that index-based consumers such as addContextMark() see a
              // proper 0..n-1 list.
              $tokens = array_values(array_unique($tokens));
          }

          return $tokens;
      }

      /**
       * Splits a piece of plain text (no tags) into tokens.
       *
       * The text is first cut on SPLIT_REG_EXP. Each non-empty piece accepted by
       * BayesianToken::isValid() is then handled in one of three ways:
       *  - a price range such as "$10-20" yields the part before the first dash and
       *    "$" plus the part after it, each only if valid;
       *  - a piece that does not contain a number with a , . or ^ between digits is
       *    split again on , . and ^ and every valid part is kept;
       *  - anything else (e.g. "1,000" or "3.14") is kept as a single token.
       *
       * @private
       * @param text The plain text to split
       * @return An array of tokens
       */
      function _tokenize($text)
      {
          $tokens = array();

          // SPLIT_REG_EXP is a bracket expression that reads the same in POSIX and PCRE
          // syntax; it only needs delimiters (and escaping of any delimiter it contains).
          $splitPattern = "/" . str_replace("/", "\\/", SPLIT_REG_EXP) . "/";
          $pieces = preg_split($splitPattern, $text);

          if (!is_array($pieces))
          {
              // preg_split() returns false when the pattern cannot be applied
              return $tokens;
          }

          foreach ($pieces as $piece)
          {
              if (strlen($piece) == 0 || !BayesianToken::isValid($piece))
              {
                  continue;
              }

              if (preg_match('/\$[0-9]+-[0-9]+/', $piece))
              {
                  // price range: the match guarantees at least one dash, so both
                  // $parts[0] and $parts[1] exist
                  $parts = explode("-", $piece);

                  if (BayesianToken::isValid($parts[0]))
                  {
                      $tokens[] = $parts[0];
                  }

                  if (BayesianToken::isValid("$" . $parts[1]))
                  {
                      $tokens[] = "$" . $parts[1];
                  }
              }
              else if (!preg_match('/[0-9]+[,.^][0-9]+/', $piece))
              {
                  // no formatted number inside: punctuation is just a separator
                  foreach (preg_split('/[,.^]/', $piece) as $part)
                  {
                      if (BayesianToken::isValid($part))
                      {
                          $tokens[] = $part;
                      }
                  }
              }
              else
              {
                  // contains something like "1,000" or "3.14": keep it in one piece
                  $tokens[] = $piece;
              }
          }

          return $tokens;
      }

      /**
       * Keeps only the tags whose attributes are worth tokenizing: opening
       * <a ...>, <img ...> and <font ...> tags (case-insensitive, and only when the
       * tag name is followed by a space, i.e. when the tag has attributes).
       *
       * @private
       * @param tags An array of raw tag strings such as '<a href="...">'
       * @return An array with the accepted tags
       */
      function _getValidHtmlTags($tags)
      {
          $validTags = array();

          foreach ($tags as $tag)
          {
              if (preg_match('/^<(a|img|font) /i', $tag))
              {
                  $validTags[] = $tag;
              }
          }

          return $validTags;
      }

      /**
       * Removes everything that looks like a tag ("<" ... ">") from the text. The tags
       * that were found are appended to $this->_htmlTags, which is then filtered
       * through _getValidHtmlTags().
       *
       * @private
       * @param text The text, possibly containing HTML tags
       * @return The text without tags
       */
      function _stripHtmlTags($text)
      {
          preg_match_all("/(<[^>]+>)/", $text, $regs);

          foreach ($regs[1] as $tag)
          {
              $this->_htmlTags[] = $tag;
          }

          $this->_htmlTags = $this->_getValidHtmlTags($this->_htmlTags);

          return preg_replace("/<[^>]+>/", "", $text);
      }

      /**
       * Tokenizes every tag of the given list and returns all tokens in one array.
       *
       * @private
       * @param tags An array of raw tag strings
       * @return An array of tokens
       * @see _tokenizeHtmlTag
       */
      function _tokenizeHtmlTags($tags)
      {
          $tokens = array();

          foreach ($tags as $tag)
          {
              $tokens = array_merge($tokens, $this->_tokenizeHtmlTag($tag));
          }

          return $tokens;
      }

      /**
       * Extracts tokens from the attribute values of a single tag. Every
       * name=value pair is located, the value is unquoted and tokenized like plain
       * text, and tokens coming from an attribute whose name part contains "href" or
       * "src" are prefixed with TOKEN_URL_MARK.
       *
       * @private
       * @param tag A raw tag string such as '<a href="http://example.com">'
       * @return An array of tokens
       */
      function _tokenizeHtmlTag($tag)
      {
          $tokens = array();

          // $regs[1] holds what precedes each "=" (tag name and/or attribute name),
          // $regs[2] the value up to the next whitespace or ">"
          preg_match_all("/([^=]+)=\s*([^\s>]+)/", $tag, $regs);
          $count = count($regs[1]);

          for ($i = 0; $i < $count; $i++)
          {
              $prefix = "";

              if (preg_match('/(href|src)/i', $regs[1][$i]))
              {
                  $prefix = TOKEN_URL_MARK;
              }

              $value = $this->_unquoteToken($regs[2][$i]);

              foreach ($this->_tokenize($value) as $valueToken)
              {
                  if (BayesianToken::isValid($valueToken))
                  {
                      $tokens[] = $prefix . $valueToken;
                  }
              }
          }

          return $tokens;
      }

      /**
       * Removes one leading and one trailing quote (single or double) from the
       * token when both are present and there is at least one character between them.
       *
       * @private
       * @param token The possibly quoted value
       * @return The value without the surrounding quotes, or the token unchanged
       */
      function _unquoteToken($token)
      {
          // "s" lets the dot match newlines and "D" anchors "$" at the very end only,
          // which is how the POSIX expression used previously behaved.
          if (preg_match('/^[\'"](.+)[\'"]$/sD', $token, $regs))
          {
              return $regs[1];
          }

          return $token;
      }

      /**
       * Prepends $mark to every token that does not already start with
       * TOKEN_URL_MARK (compared literally and case-insensitively).
       *
       * NOTE(review): TOKEN_URL_MARK is defined outside this file; it is treated here
       * as a plain prefix, never as a regular expression.
       *
       * @private
       * @param tokens The array of tokens to mark; keys are preserved
       * @param mark The string to prepend
       * @return The array of tokens with the mark applied
       */
      function addContextMark($tokens, $mark)
      {
          $urlMarkLength = strlen(TOKEN_URL_MARK);

          // iterate over the real keys so that arrays with gaps (e.g. the result of
          // array_unique()) are handled completely
          foreach (array_keys($tokens) as $key)
          {
              if (strncasecmp($tokens[$key], TOKEN_URL_MARK, $urlMarkLength) != 0)
              {
                  $tokens[$key] = $mark . $tokens[$key];
              }
          }

          return $tokens;
      }
  }
 220  ?>
Code source de LifeType 1.2.4

/class/bayesian/ -> bayesiantokenizer.class.php (source)

Généré le : Mon Nov 26 21:04:15 2007	par Balluche grâce à PHPXref 0.7