DokuWiki 2006-11-06 : /inc/parser/lexer.php source

[Sommaire] [Imprimer]
   1  <?php
   2  /**
   3  * Author Marcus Baker: http://www.lastcraft.com
   4  * Version adapted from Simple Test: http://sourceforge.net/projects/simpletest/
   5  * For an intro to the Lexer see:
   6  * http://www.phppatterns.com/index.php/article/articleview/106/1/2/
   7  * @author Marcus Baker
   8  * @package Doku
   9  * @subpackage Lexer
  10  * @version $Id: lexer.php,v 1.1 2005/03/23 23:14:09 harryf Exp $
  11  */
  12  
  13  /**
  14  * Init path constant
  15  */
  16  if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../../').'/');
  17  
  18  /**#@+
  19   * lexer mode constant
  20   */
  21  define("DOKU_LEXER_ENTER", 1);
  22  define("DOKU_LEXER_MATCHED", 2);
  23  define("DOKU_LEXER_UNMATCHED", 3);
  24  define("DOKU_LEXER_EXIT", 4);
  25  define("DOKU_LEXER_SPECIAL", 5);
  26  /**#@-*/
  27  
  28  /**
  29   *    Compounded regular expression. Any of
  30   *    the contained patterns could match and
  31   *    when one does its label is returned.
  32   *    @package Doku
  33   *    @subpackage Lexer
  34   */
  35  class Doku_LexerParallelRegex {
  36      var $_patterns;
  37      var $_labels;
  38      var $_regex;
  39      var $_case;
  40  
  41      /**
  42       *    Constructor. Starts with no patterns.
  43       *    @param boolean $case    True for case sensitive, false
  44       *                            for insensitive.
  45       *    @access public
  46       */
  47      function Doku_LexerParallelRegex($case) {
  48          $this->_case = $case;
  49          $this->_patterns = array();
  50          $this->_labels = array();
  51          $this->_regex = null;
  52      }
  53  
  54      /**
  55       *    Adds a pattern with an optional label.
  56       *    @param mixed $pattern       Perl style regex. Must be UTF-8
  57       *                                encoded. If its a string, the (, )
  58       *                                lose their meaning unless they
  59       *                                form part of a lookahead or
  60       *                                lookbehind assertation.
  61       *    @param string $label        Label of regex to be returned
  62       *                                on a match. Label must be ASCII
  63       *    @access public
  64       */
  65      function addPattern($pattern, $label = true) {
  66          $count = count($this->_patterns);
  67          $this->_patterns[$count] = $pattern;
  68          $this->_labels[$count] = $label;
  69          $this->_regex = null;
  70      }
  71  
  72      /**
  73       *    Attempts to match all patterns at once against
  74       *    a string.
  75       *    @param string $subject      String to match against.
  76       *    @param string $match        First matched portion of
  77       *                                subject.
  78       *    @return boolean             True on success.
  79       *    @access public
  80       */
  81      function match($subject, &$match) {
  82          if (count($this->_patterns) == 0) {
  83              return false;
  84          }
  85          if (! preg_match($this->_getCompoundedRegex(), $subject, $matches)) {
  86              $match = "";
  87              return false;
  88          }
  89  
  90          $match = $matches[0];
  91          $size = count($matches);
  92          for ($i = 1; $i < $size; $i++) {
  93              if ($matches[$i] && isset($this->_labels[$i - 1])) {
  94                  return $this->_labels[$i - 1];
  95              }
  96          }
  97          return true;
  98      }
  99  
 100      /**
 101       *    Attempts to split the string against all patterns at once
 102       *
 103       *    @param string $subject      String to match against.
 104       *    @param array $split         The split result: array containing, pre-match, match & post-match strings
 105       *    @return boolean             True on success.
 106       *    @access public
 107       *
 108       *    @author Christopher Smith <chris@jalakai.co.uk>
 109       */
 110      function split($subject, &$split) {
 111          if (count($this->_patterns) == 0) {
 112              return false;
 113          }
 114  
 115          if (! preg_match($this->_getCompoundedRegex(), $subject, $matches)) {
 116              $split = array($subject, "", "");
 117              return false;
 118          }
 119  
 120          $idx = count($matches)-2;
 121  
 122          list($pre, $post) = preg_split($this->_patterns[$idx].$this->_getPerlMatchingFlags(), $subject, 2);
 123  
 124          $split = array($pre, $matches[0], $post);
 125          return isset($this->_labels[$idx]) ? $this->_labels[$idx] : true;
 126      }
 127  
 128      /**
 129       *    Compounds the patterns into a single
 130       *    regular expression separated with the
 131       *    "or" operator. Caches the regex.
 132       *    Will automatically escape (, ) and / tokens.
 133       *    @param array $patterns    List of patterns in order.
 134       *    @access private
 135       */
 136      function _getCompoundedRegex() {
 137          if ($this->_regex == null) {
 138              $cnt = count($this->_patterns);
 139              for ($i = 0; $i < $cnt; $i++) {
 140  
 141                  // Replace lookaheads / lookbehinds with marker
 142                  $m = "\1\1";
 143                  $pattern = preg_replace(
 144                          array (
 145                              '/\(\?(i|m|s|x|U)\)/U',
 146                              '/\(\?(\-[i|m|s|x|U])\)/U',
 147                              '/\(\?\=(.*)\)/sU',
 148                              '/\(\?\!(.*)\)/sU',
 149                              '/\(\?\<\=(.*)\)/sU',
 150                              '/\(\?\<\!(.*)\)/sU',
 151                              '/\(\?\:(.*)\)/sU',
 152                          ),
 153                          array (
 154                              $m.'SO:\\1'.$m,
 155                              $m.'SOR:\\1'.$m,
 156                              $m.'LA:IS:\\1'.$m,
 157                              $m.'LA:NOT:\\1'.$m,
 158                              $m.'LB:IS:\\1'.$m,
 159                              $m.'LB:NOT:\\1'.$m,
 160                              $m.'GRP:\\1'.$m,
 161                          ),
 162                          $this->_patterns[$i]
 163                      );
 164                  // Quote the rest
 165                  $pattern = str_replace(
 166                      array('/', '(', ')'),
 167                      array('\/', '\(', '\)'),
 168                      $pattern
 169                      );
 170  
 171                  // Restore lookaheads / lookbehinds
 172                  $pattern = preg_replace(
 173                          array (
 174                              '/'.$m.'SO:(.{1})'.$m.'/',
 175                              '/'.$m.'SOR:(.{2})'.$m.'/',
 176                              '/'.$m.'LA:IS:(.*)'.$m.'/sU',
 177                              '/'.$m.'LA:NOT:(.*)'.$m.'/sU',
 178                              '/'.$m.'LB:IS:(.*)'.$m.'/sU',
 179                              '/'.$m.'LB:NOT:(.*)'.$m.'/sU',
 180                              '/'.$m.'GRP:(.*)'.$m.'/sU',
 181                          ),
 182                          array (
 183                              '(?\\1)',
 184                              '(?\\1)',
 185                              '(?=\\1)',
 186                              '(?!\\1)',
 187                              '(?<=\\1)',
 188                              '(?<!\\1)',
 189                              '(?:\\1)',
 190                          ),
 191                          $pattern
 192                  );
 193  
 194                  $this->_patterns[$i] = '('.$pattern.')';
 195              }
 196              $this->_regex = "/" . implode("|", $this->_patterns) . "/" . $this->_getPerlMatchingFlags();
 197          }
 198          return $this->_regex;
 199      }
 200  
 201      /**
 202       *    Accessor for perl regex mode flags to use.
 203       *    @return string       Perl regex flags.
 204       *    @access private
 205       */
 206      function _getPerlMatchingFlags() {
 207          return ($this->_case ? "msS" : "msSi");
 208      }
 209  }
 210  
 211  /**
 212   *    States for a stack machine.
 213   *    @package Lexer
 214   *    @subpackage Lexer
 215   */
 216  class Doku_LexerStateStack {
 217      var $_stack;
 218  
 219      /**
 220       *    Constructor. Starts in named state.
 221       *    @param string $start        Starting state name.
 222       *    @access public
 223       */
 224      function Doku_LexerStateStack($start) {
 225          $this->_stack = array($start);
 226      }
 227  
 228      /**
 229       *    Accessor for current state.
 230       *    @return string       State.
 231       *    @access public
 232       */
 233      function getCurrent() {
 234          return $this->_stack[count($this->_stack) - 1];
 235      }
 236  
 237      /**
 238       *    Adds a state to the stack and sets it
 239       *    to be the current state.
 240       *    @param string $state        New state.
 241       *    @access public
 242       */
 243      function enter($state) {
 244          array_push($this->_stack, $state);
 245      }
 246  
 247      /**
 248       *    Leaves the current state and reverts
 249       *    to the previous one.
 250       *    @return boolean    False if we drop off
 251       *                       the bottom of the list.
 252       *    @access public
 253       */
 254      function leave() {
 255          if (count($this->_stack) == 1) {
 256              return false;
 257          }
 258          array_pop($this->_stack);
 259          return true;
 260      }
 261  }
 262  
 263  /**
 264   *    Accepts text and breaks it into tokens.
 265   *    Some optimisation to make sure the
 266   *    content is only scanned by the PHP regex
 267   *    parser once. Lexer modes must not start
 268   *    with leading underscores.
 269   *    @package Doku
 270   *    @subpackage Lexer
 271   */
 272  class Doku_Lexer {
 273      var $_regexes;
 274      var $_parser;
 275      var $_mode;
 276      var $_mode_handlers;
 277      var $_case;
 278  
 279      /**
 280       *    Sets up the lexer in case insensitive matching
 281       *    by default.
 282       *    @param Doku_Parser $parser  Handling strategy by
 283       *                                    reference.
 284       *    @param string $start            Starting handler.
 285       *    @param boolean $case            True for case sensitive.
 286       *    @access public
 287       */
 288      function Doku_Lexer(&$parser, $start = "accept", $case = false) {
 289          $this->_case = $case;
 290          $this->_regexes = array();
 291          $this->_parser = &$parser;
 292          $this->_mode = &new Doku_LexerStateStack($start);
 293          $this->_mode_handlers = array();
 294      }
 295  
 296      /**
 297       *    Adds a token search pattern for a particular
 298       *    parsing mode. The pattern does not change the
 299       *    current mode.
 300       *    @param string $pattern      Perl style regex, but ( and )
 301       *                                lose the usual meaning.
 302       *    @param string $mode         Should only apply this
 303       *                                pattern when dealing with
 304       *                                this type of input.
 305       *    @access public
 306       */
 307      function addPattern($pattern, $mode = "accept") {
 308          if (! isset($this->_regexes[$mode])) {
 309              $this->_regexes[$mode] = new Doku_LexerParallelRegex($this->_case);
 310          }
 311          $this->_regexes[$mode]->addPattern($pattern);
 312      }
 313  
 314      /**
 315       *    Adds a pattern that will enter a new parsing
 316       *    mode. Useful for entering parenthesis, strings,
 317       *    tags, etc.
 318       *    @param string $pattern      Perl style regex, but ( and )
 319       *                                lose the usual meaning.
 320       *    @param string $mode         Should only apply this
 321       *                                pattern when dealing with
 322       *                                this type of input.
 323       *    @param string $new_mode     Change parsing to this new
 324       *                                nested mode.
 325       *    @access public
 326       */
 327      function addEntryPattern($pattern, $mode, $new_mode) {
 328          if (! isset($this->_regexes[$mode])) {
 329              $this->_regexes[$mode] = new Doku_LexerParallelRegex($this->_case);
 330          }
 331          $this->_regexes[$mode]->addPattern($pattern, $new_mode);
 332      }
 333  
 334      /**
 335       *    Adds a pattern that will exit the current mode
 336       *    and re-enter the previous one.
 337       *    @param string $pattern      Perl style regex, but ( and )
 338       *                                lose the usual meaning.
 339       *    @param string $mode         Mode to leave.
 340       *    @access public
 341       */
 342      function addExitPattern($pattern, $mode) {
 343          if (! isset($this->_regexes[$mode])) {
 344              $this->_regexes[$mode] = new Doku_LexerParallelRegex($this->_case);
 345          }
 346          $this->_regexes[$mode]->addPattern($pattern, "__exit");
 347      }
 348  
 349      /**
 350       *    Adds a pattern that has a special mode. Acts as an entry
 351       *    and exit pattern in one go, effectively calling a special
 352       *    parser handler for this token only.
 353       *    @param string $pattern      Perl style regex, but ( and )
 354       *                                lose the usual meaning.
 355       *    @param string $mode         Should only apply this
 356       *                                pattern when dealing with
 357       *                                this type of input.
 358       *    @param string $special      Use this mode for this one token.
 359       *    @access public
 360       */
 361      function addSpecialPattern($pattern, $mode, $special) {
 362          if (! isset($this->_regexes[$mode])) {
 363              $this->_regexes[$mode] = new Doku_LexerParallelRegex($this->_case);
 364          }
 365          $this->_regexes[$mode]->addPattern($pattern, "_$special");
 366      }
 367  
 368      /**
 369       *    Adds a mapping from a mode to another handler.
 370       *    @param string $mode        Mode to be remapped.
 371       *    @param string $handler     New target handler.
 372       *    @access public
 373       */
 374      function mapHandler($mode, $handler) {
 375          $this->_mode_handlers[$mode] = $handler;
 376      }
 377  
 378      /**
 379       *    Splits the page text into tokens. Will fail
 380       *    if the handlers report an error or if no
 381       *    content is consumed. If successful then each
 382       *    unparsed and parsed token invokes a call to the
 383       *    held listener.
 384       *    @param string $raw        Raw HTML text.
 385       *    @return boolean           True on success, else false.
 386       *    @access public
 387       */
 388      function parse($raw) {
 389          if (! isset($this->_parser)) {
 390              return false;
 391          }
 392          $initialLength = strlen($raw);
 393          $length = $initialLength;
 394          $pos = 0;
 395          while (is_array($parsed = $this->_reduce($raw))) {
 396              list($unmatched, $matched, $mode) = $parsed;
 397              $currentLength = strlen($raw);
 398              $matchPos = $initialLength - $currentLength - strlen($matched);
 399              if (! $this->_dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
 400                  return false;
 401              }
 402              if ($currentLength == $length) {
 403                  return false;
 404              }
 405              $length = $currentLength;
 406              $pos = $initialLength - $currentLength;
 407          }
 408          if (!$parsed) {
 409              return false;
 410          }
 411          return $this->_invokeParser($raw, DOKU_LEXER_UNMATCHED, $pos);
 412      }
 413  
 414      /**
 415       *    Sends the matched token and any leading unmatched
 416       *    text to the parser changing the lexer to a new
 417       *    mode if one is listed.
 418       *    @param string $unmatched    Unmatched leading portion.
 419       *    @param string $matched      Actual token match.
 420       *    @param string $mode         Mode after match. A boolean
 421       *                                false mode causes no change.
 422       *    @param int $pos         Current byte index location in raw doc
 423       *                                thats being parsed
 424       *    @return boolean             False if there was any error
 425       *                                from the parser.
 426       *    @access private
 427       */
 428      function _dispatchTokens($unmatched, $matched, $mode = false, $initialPos, $matchPos) {
 429          if (! $this->_invokeParser($unmatched, DOKU_LEXER_UNMATCHED, $initialPos) ){
 430              return false;
 431          }
 432          if ($this->_isModeEnd($mode)) {
 433              if (! $this->_invokeParser($matched, DOKU_LEXER_EXIT, $matchPos)) {
 434                  return false;
 435              }
 436              return $this->_mode->leave();
 437          }
 438          if ($this->_isSpecialMode($mode)) {
 439              $this->_mode->enter($this->_decodeSpecial($mode));
 440              if (! $this->_invokeParser($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
 441                  return false;
 442              }
 443              return $this->_mode->leave();
 444          }
 445          if (is_string($mode)) {
 446              $this->_mode->enter($mode);
 447              return $this->_invokeParser($matched, DOKU_LEXER_ENTER, $matchPos);
 448          }
 449          return $this->_invokeParser($matched, DOKU_LEXER_MATCHED, $matchPos);
 450      }
 451  
 452      /**
 453       *    Tests to see if the new mode is actually to leave
 454       *    the current mode and pop an item from the matching
 455       *    mode stack.
 456       *    @param string $mode    Mode to test.
 457       *    @return boolean        True if this is the exit mode.
 458       *    @access private
 459       */
 460      function _isModeEnd($mode) {
 461          return ($mode === "__exit");
 462      }
 463  
 464      /**
 465       *    Test to see if the mode is one where this mode
 466       *    is entered for this token only and automatically
 467       *    leaves immediately afterwoods.
 468       *    @param string $mode    Mode to test.
 469       *    @return boolean        True if this is the exit mode.
 470       *    @access private
 471       */
 472      function _isSpecialMode($mode) {
 473          return (strncmp($mode, "_", 1) == 0);
 474      }
 475  
 476      /**
 477       *    Strips the magic underscore marking single token
 478       *    modes.
 479       *    @param string $mode    Mode to decode.
 480       *    @return string         Underlying mode name.
 481       *    @access private
 482       */
 483      function _decodeSpecial($mode) {
 484          return substr($mode, 1);
 485      }
 486  
 487      /**
 488       *    Calls the parser method named after the current
 489       *    mode. Empty content will be ignored. The lexer
 490       *    has a parser handler for each mode in the lexer.
 491       *    @param string $content        Text parsed.
 492       *    @param boolean $is_match      Token is recognised rather
 493       *                                  than unparsed data.
 494       *    @param int $pos         Current byte index location in raw doc
 495       *                                thats being parsed
 496       *    @access private
 497       */
 498      function _invokeParser($content, $is_match, $pos) {
 499          if (($content === "") || ($content === false)) {
 500              return true;
 501          }
 502          $handler = $this->_mode->getCurrent();
 503          if (isset($this->_mode_handlers[$handler])) {
 504              $handler = $this->_mode_handlers[$handler];
 505          }
 506  
 507          // modes starting with plugin_ are all handled by the same
 508          // handler but with an additional parameter
 509          if(substr($handler,0,7)=='plugin_'){
 510            list($handler,$plugin) = split('_',$handler,2);
 511                return $this->_parser->$handler($content, $is_match, $pos, $plugin);
 512          }
 513  
 514              return $this->_parser->$handler($content, $is_match, $pos);
 515          }
 516  
 517      /**
 518       *    Tries to match a chunk of text and if successful
 519       *    removes the recognised chunk and any leading
 520       *    unparsed data. Empty strings will not be matched.
 521       *    @param string $raw         The subject to parse. This is the
 522       *                               content that will be eaten.
 523       *    @return array              Three item list of unparsed
 524       *                               content followed by the
 525       *                               recognised token and finally the
 526       *                               action the parser is to take.
 527       *                               True if no match, false if there
 528       *                               is a parsing error.
 529       *    @access private
 530       */
 531      function _reduce(&$raw) {
 532          if (! isset($this->_regexes[$this->_mode->getCurrent()])) {
 533              return false;
 534          }
 535          if ($raw === "") {
 536              return true;
 537          }
 538          if ($action = $this->_regexes[$this->_mode->getCurrent()]->split($raw, $split)) {
 539              list($unparsed, $match, $raw) = $split;
 540              return array($unparsed, $match, $action);
 541          }
 542          return true;
 543      }
 544  }
 545  
 546  /**
 547  * Escapes regex characters other than (, ) and /
 548  * @TODO
 549  */
 550  function Doku_Lexer_Escape($str) {
 551      //$str = addslashes($str);
 552      $chars = array(
 553          '/\\\\/',
 554          '/\./',
 555          '/\+/',
 556          '/\*/',
 557          '/\?/',
 558          '/\[/',
 559          '/\^/',
 560          '/\]/',
 561          '/\$/',
 562          '/\{/',
 563          '/\}/',
 564          '/\=/',
 565          '/\!/',
 566          '/\</',
 567          '/\>/',
 568          '/\|/',
 569          '/\:/'
 570          );
 571  
 572      $escaped = array(
 573          '\\\\\\\\',
 574          '\.',
 575          '\+',
 576          '\*',
 577          '\?',
 578          '\[',
 579          '\^',
 580          '\]',
 581          '\$',
 582          '\{',
 583          '\}',
 584          '\=',
 585          '\!',
 586          '\<',
 587          '\>',
 588          '\|',
 589          '\:'
 590          );
 591      return preg_replace($chars, $escaped, $str);
 592  }
 593  
 594  //Setup VIM: ex: et ts=4 enc=utf-8 :
Code source de DokuWiki 2006-11-06

/inc/parser/ -> lexer.php (source)