[ Index ] |
|
Code source de PRADO 3.0.6 |
<?php
/**
 * Base include file for SimpleTest: a regex driven, mode-stack lexer and
 * an HTML SAX-style parser built on top of it.
 * @package SimpleTest
 * @subpackage MockObjects
 * @version $Id: parser.php 1526 2006-11-28 23:34:00Z wei $
 */

/**#@+
 * Lexer mode stack constants
 */
if (! defined('LEXER_ENTER')) {
    define('LEXER_ENTER', 1);
}
if (! defined('LEXER_MATCHED')) {
    define('LEXER_MATCHED', 2);
}
if (! defined('LEXER_UNMATCHED')) {
    define('LEXER_UNMATCHED', 3);
}
if (! defined('LEXER_EXIT')) {
    define('LEXER_EXIT', 4);
}
if (! defined('LEXER_SPECIAL')) {
    define('LEXER_SPECIAL', 5);
}
/**#@-*/

/**
 * Compounded regular expression. Any of the contained patterns
 * could match and when one does, its label is returned.
 * @package SimpleTest
 * @subpackage WebTester
 */
class ParallelRegex {
    protected $_patterns;
    protected $_labels;
    protected $_regex;
    protected $_case;

    /**
     * Constructor. Starts with no patterns.
     * @param boolean $case    True for case sensitive, false
     *                         for insensitive.
     * @access public
     */
    public function __construct($case) {
        $this->_case = $case;
        $this->_patterns = array();
        $this->_labels = array();
        $this->_regex = null;
    }

    /**
     * Legacy PHP4-style initialiser kept so that existing code
     * calling $this->ParallelRegex(...) keeps working.
     * @param boolean $case    True for case sensitive.
     * @deprecated Use the real constructor.
     * @access public
     */
    public function ParallelRegex($case) {
        $this->__construct($case);
    }

    /**
     * Adds a pattern with an optional label.
     * @param string $pattern      Perl style regex, but ( and )
     *                             lose the usual meaning.
     * @param string $label        Label of regex to be returned
     *                             on a match.
     * @access public
     */
    public function addPattern($pattern, $label = true) {
        $count = count($this->_patterns);
        $this->_patterns[$count] = $pattern;
        $this->_labels[$count] = $label;
        // Invalidate the cached compound regex.
        $this->_regex = null;
    }

    /**
     * Attempts to match all patterns at once against a string.
     * @param string $subject      String to match against.
     * @param string $match        Output: first matched portion of
     *                             subject, or '' when nothing matched.
     *                             Passed by reference so the caller
     *                             receives the text.
     * @return boolean/string      False if no match, otherwise the
     *                             label of the pattern that matched
     *                             (true for unlabelled patterns).
     * @access public
     */
    public function match($subject, &$match) {
        $match = '';
        if (count($this->_patterns) == 0) {
            return false;
        }
        if (! preg_match($this->_getCompoundedRegex(), $subject, $matches)) {
            return false;
        }
        $match = $matches[0];
        // Group N corresponds to pattern N - 1. Compare against ''
        // rather than testing truthiness so that a token such as "0"
        // still yields its label.
        for ($i = 1, $count = count($matches); $i < $count; $i++) {
            if ($matches[$i] !== '') {
                return $this->_labels[$i - 1];
            }
        }
        return true;
    }

    /**
     * Compounds the patterns into a single regular expression
     * separated with the "or" operator. Caches the regex.
     * Will automatically escape (, ) and / tokens.
     * @return string      Complete regex including delimiters and flags.
     * @access private
     */
    public function _getCompoundedRegex() {
        if ($this->_regex === null) {
            // Build into a local list: the stored patterns must stay raw,
            // otherwise a recompile after a later addPattern() would
            // escape and wrap them a second time.
            $groups = array();
            foreach ($this->_patterns as $pattern) {
                $groups[] = '(' . str_replace(
                        array('/', '(', ')'),
                        array('\/', '\(', '\)'),
                        $pattern) . ')';
            }
            $this->_regex = "/" . implode("|", $groups) . "/" . $this->_getPerlMatchingFlags();
        }
        return $this->_regex;
    }

    /**
     * Accessor for perl regex mode flags to use.
     * @return string       Perl regex flags.
     * @access private
     */
    public function _getPerlMatchingFlags() {
        return ($this->_case ? "msS" : "msSi");
    }
}

/**
 * States for a stack machine.
 * @package SimpleTest
 * @subpackage WebTester
 */
class SimpleStateStack {
    protected $_stack;

    /**
     * Constructor. Starts in named state.
     * @param string $start        Starting state name.
     * @access public
     */
    public function __construct($start) {
        $this->_stack = array($start);
    }

    /**
     * Legacy PHP4-style initialiser kept for existing callers.
     * @param string $start        Starting state name.
     * @deprecated Use the real constructor.
     * @access public
     */
    public function SimpleStateStack($start) {
        $this->__construct($start);
    }

    /**
     * Accessor for current state.
     * @return string       State.
     * @access public
     */
    public function getCurrent() {
        return $this->_stack[count($this->_stack) - 1];
    }

    /**
     * Adds a state to the stack and sets it to be the current state.
     * @param string $state        New state.
     * @access public
     */
    public function enter($state) {
        array_push($this->_stack, $state);
    }

    /**
     * Leaves the current state and reverts to the previous one.
     * @return boolean    False if we drop off the bottom of the list.
     * @access public
     */
    public function leave() {
        if (count($this->_stack) == 1) {
            return false;
        }
        array_pop($this->_stack);
        return true;
    }
}

/**
 * Accepts text and breaks it into tokens. Each mode has a single
 * compound regex, so every chunk is scanned once per token.
 * Lexer modes must not start with leading underscores, as that
 * prefix is used internally to mark exit and special modes.
 * @package SimpleTest
 * @subpackage WebTester
 */
class SimpleLexer {
    protected $_regexes;
    protected $_parser;
    protected $_mode;
    protected $_mode_handlers;
    protected $_case;

    /**
     * Sets up the lexer in case insensitive matching by default.
     * @param SimpleSaxParser $parser  Handling strategy; its methods
     *                                 are called by handler name.
     * @param string $start            Starting handler.
     * @param boolean $case            True for case sensitive.
     * @access public
     */
    public function __construct($parser, $start = "accept", $case = false) {
        $this->_case = $case;
        $this->_regexes = array();
        $this->_parser = $parser;
        $this->_mode = new SimpleStateStack($start);
        $this->_mode_handlers = array($start => $start);
    }

    /**
     * Legacy PHP4-style initialiser kept so that subclasses calling
     * $this->SimpleLexer(...) keep working.
     * @param SimpleSaxParser $parser  Handling strategy.
     * @param string $start            Starting handler.
     * @param boolean $case            True for case sensitive.
     * @deprecated Use the real constructor.
     * @access public
     */
    public function SimpleLexer($parser, $start = "accept", $case = false) {
        $this->__construct($parser, $start, $case);
    }

    /**
     * Fetches the compound regex of a mode, creating it on first use.
     * @param string $mode         Mode name.
     * @return ParallelRegex       Regex collection for that mode.
     * @access private
     */
    protected function _getRegexForMode($mode) {
        if (! isset($this->_regexes[$mode])) {
            $this->_regexes[$mode] = new ParallelRegex($this->_case);
        }
        return $this->_regexes[$mode];
    }

    /**
     * Adds a token search pattern for a particular parsing mode.
     * The pattern does not change the current mode.
     * @param string $pattern      Perl style regex, but ( and )
     *                             lose the usual meaning.
     * @param string $mode         Should only apply this pattern when
     *                             dealing with this type of input.
     * @access public
     */
    public function addPattern($pattern, $mode = "accept") {
        $this->_getRegexForMode($mode)->addPattern($pattern);
        if (! isset($this->_mode_handlers[$mode])) {
            $this->_mode_handlers[$mode] = $mode;
        }
    }

    /**
     * Adds a pattern that will enter a new parsing mode.
     * Useful for entering parenthesis, strings, tags, etc.
     * @param string $pattern      Perl style regex, but ( and )
     *                             lose the usual meaning.
     * @param string $mode         Should only apply this pattern when
     *                             dealing with this type of input.
     * @param string $new_mode     Change parsing to this new
     *                             nested mode.
     * @access public
     */
    public function addEntryPattern($pattern, $mode, $new_mode) {
        $this->_getRegexForMode($mode)->addPattern($pattern, $new_mode);
        if (! isset($this->_mode_handlers[$new_mode])) {
            $this->_mode_handlers[$new_mode] = $new_mode;
        }
    }

    /**
     * Adds a pattern that will exit the current mode and
     * re-enter the previous one.
     * @param string $pattern      Perl style regex, but ( and )
     *                             lose the usual meaning.
     * @param string $mode         Mode to leave.
     * @access public
     */
    public function addExitPattern($pattern, $mode) {
        $this->_getRegexForMode($mode)->addPattern($pattern, "__exit");
        if (! isset($this->_mode_handlers[$mode])) {
            $this->_mode_handlers[$mode] = $mode;
        }
    }

    /**
     * Adds a pattern that has a special mode. Acts as an entry
     * and exit pattern in one go, effectively calling a special
     * parser handler for this token only.
     * @param string $pattern      Perl style regex, but ( and )
     *                             lose the usual meaning.
     * @param string $mode         Should only apply this pattern when
     *                             dealing with this type of input.
     * @param string $special      Use this mode for this one token.
     * @access public
     */
    public function addSpecialPattern($pattern, $mode, $special) {
        $this->_getRegexForMode($mode)->addPattern($pattern, "_$special");
        if (! isset($this->_mode_handlers[$special])) {
            $this->_mode_handlers[$special] = $special;
        }
    }

    /**
     * Adds a mapping from a mode to another handler.
     * @param string $mode        Mode to be remapped.
     * @param string $handler     New target handler.
     * @access public
     */
    public function mapHandler($mode, $handler) {
        $this->_mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the page text into tokens. Will fail if the handlers
     * report an error or if no content is consumed. If successful
     * then each unparsed and parsed token invokes a call to the
     * held listener.
     * @param string $raw        Raw HTML text.
     * @return boolean           True on success, else false.
     * @access public
     */
    public function parse($raw) {
        if (! isset($this->_parser)) {
            return false;
        }
        $length = strlen($raw);
        while (is_array($parsed = $this->_reduce($raw))) {
            list($raw, $unmatched, $matched, $mode) = $parsed;
            if (! $this->_dispatchTokens($unmatched, $matched, $mode)) {
                return false;
            }
            if ($raw === '') {
                return true;
            }
            // A match that consumed nothing would loop forever.
            if (strlen($raw) == $length) {
                return false;
            }
            $length = strlen($raw);
        }
        if (! $parsed) {
            return false;
        }
        // No more tokens: whatever is left is unmatched trailing text.
        return $this->_invokeParser($raw, LEXER_UNMATCHED);
    }

    /**
     * Sends the matched token and any leading unmatched text to
     * the parser changing the lexer to a new mode if one is listed.
     * @param string $unmatched    Unmatched leading portion.
     * @param string $matched      Actual token match.
     * @param string $mode         Mode after match. A boolean
     *                             mode causes no change.
     * @return boolean             False if there was any error
     *                             from the parser.
     * @access private
     */
    public function _dispatchTokens($unmatched, $matched, $mode = false) {
        if (! $this->_invokeParser($unmatched, LEXER_UNMATCHED)) {
            return false;
        }
        if (is_bool($mode)) {
            return $this->_invokeParser($matched, LEXER_MATCHED);
        }
        if ($this->_isModeEnd($mode)) {
            if (! $this->_invokeParser($matched, LEXER_EXIT)) {
                return false;
            }
            return $this->_mode->leave();
        }
        if ($this->_isSpecialMode($mode)) {
            // Enter the special mode for this one token only.
            $this->_mode->enter($this->_decodeSpecial($mode));
            if (! $this->_invokeParser($matched, LEXER_SPECIAL)) {
                return false;
            }
            return $this->_mode->leave();
        }
        $this->_mode->enter($mode);
        return $this->_invokeParser($matched, LEXER_ENTER);
    }

    /**
     * Tests to see if the new mode is actually to leave the current
     * mode and pop an item from the matching mode stack.
     * @param string $mode    Mode to test.
     * @return boolean        True if this is the exit mode.
     * @access private
     */
    public function _isModeEnd($mode) {
        return ($mode === "__exit");
    }

    /**
     * Test to see if the mode is one where this mode is entered
     * for this token only and automatically leaves immediately
     * afterwards.
     * @param string $mode    Mode to test.
     * @return boolean        True if this is a single token mode.
     * @access private
     */
    public function _isSpecialMode($mode) {
        return (strncmp($mode, "_", 1) == 0);
    }

    /**
     * Strips the magic underscore marking single token modes.
     * @param string $mode    Mode to decode.
     * @return string         Underlying mode name.
     * @access private
     */
    public function _decodeSpecial($mode) {
        return substr($mode, 1);
    }

    /**
     * Calls the parser method mapped to the current mode.
     * Empty content will be ignored.
     * @param string $content        Text parsed.
     * @param integer $is_match      One of the LEXER_* event constants.
     * @return boolean               Result of the handler, or true
     *                               when the content was empty.
     * @access private
     */
    public function _invokeParser($content, $is_match) {
        if (($content === '') || ($content === false)) {
            return true;
        }
        $handler = $this->_mode_handlers[$this->_mode->getCurrent()];
        return $this->_parser->$handler($content, $is_match);
    }

    /**
     * Tries to match a chunk of text and if successful removes the
     * recognised chunk and any leading unparsed data.
     * @param string $raw         The subject to parse.
     * @return array/boolean      Four item list of the remaining
     *                            text, the unparsed leading content,
     *                            the recognised token and finally the
     *                            action the parser is to take.
     *                            True if no match.
     * @access private
     */
    public function _reduce($raw) {
        $current = $this->_mode->getCurrent();
        // A mode without any patterns can never match.
        if (! isset($this->_regexes[$current])) {
            return true;
        }
        $match = '';
        if ($action = $this->_regexes[$current]->match($raw, $match)) {
            $unparsed_character_count = strpos($raw, $match);
            $unparsed = substr($raw, 0, $unparsed_character_count);
            $raw = substr($raw, $unparsed_character_count + strlen($match));
            return array($raw, $unparsed, $match, $action);
        }
        return true;
    }
}

/**
 * Breaks HTML into SAX events.
 * @package SimpleTest
 * @subpackage WebTester
 */
class SimpleHtmlLexer extends SimpleLexer {

    /**
     * Sets up the lexer with case insensitive matching
     * and adds the HTML handlers.
     * @param SimpleSaxParser $parser  Handling strategy.
     * @access public
     */
    public function __construct($parser) {
        parent::__construct($parser, 'text');
        $this->mapHandler('text', 'acceptTextToken');
        $this->_addSkipping();
        foreach ($this->_getParsedTags() as $tag) {
            $this->_addTag($tag);
        }
        $this->_addInTagTokens();
    }

    /**
     * Legacy PHP4-style initialiser kept for existing callers.
     * @param SimpleSaxParser $parser  Handling strategy.
     * @deprecated Use the real constructor.
     * @access public
     */
    public function SimpleHtmlLexer($parser) {
        $this->__construct($parser);
    }

    /**
     * List of parsed tags. Others are ignored.
     * @return array        List of searched for tags.
     * @access private
     */
    public function _getParsedTags() {
        return array('a', 'title', 'form', 'input', 'button', 'textarea', 'select',
                'option', 'frameset', 'frame', 'label');
    }

    /**
     * The lexer has to skip certain sections such
     * as client code, styles and comments.
     * @access private
     */
    public function _addSkipping() {
        $this->mapHandler('css', 'ignore');
        $this->addEntryPattern('<style', 'text', 'css');
        $this->addExitPattern('</style>', 'css');
        $this->mapHandler('js', 'ignore');
        $this->addEntryPattern('<script', 'text', 'js');
        $this->addExitPattern('</script>', 'js');
        $this->mapHandler('comment', 'ignore');
        $this->addEntryPattern('<!--', 'text', 'comment');
        $this->addExitPattern('-->', 'comment');
    }

    /**
     * Pattern matches to start and end a tag.
     * @param string $tag          Name of tag to scan for.
     * @access private
     */
    public function _addTag($tag) {
        $this->addSpecialPattern("</$tag>", 'text', 'acceptEndToken');
        $this->addEntryPattern("<$tag", 'text', 'tag');
    }

    /**
     * Pattern matches to parse the inside of a tag
     * including the attributes and their quoting.
     * @access private
     */
    public function _addInTagTokens() {
        $this->mapHandler('tag', 'acceptStartToken');
        $this->addSpecialPattern('\s+', 'tag', 'ignore');
        $this->_addAttributeTokens();
        $this->addExitPattern('/>', 'tag');
        $this->addExitPattern('>', 'tag');
    }

    /**
     * Matches attributes that are either single quoted,
     * double quoted or unquoted.
     * @access private
     */
    public function _addAttributeTokens() {
        $this->mapHandler('dq_attribute', 'acceptAttributeToken');
        $this->addEntryPattern('=\s*"', 'tag', 'dq_attribute');
        $this->addPattern("\\\\\"", 'dq_attribute');
        $this->addExitPattern('"', 'dq_attribute');
        $this->mapHandler('sq_attribute', 'acceptAttributeToken');
        $this->addEntryPattern("=\s*'", 'tag', 'sq_attribute');
        $this->addPattern("\\\\'", 'sq_attribute');
        $this->addExitPattern("'", 'sq_attribute');
        $this->mapHandler('uq_attribute', 'acceptAttributeToken');
        $this->addSpecialPattern('=\s*[^>\s]*', 'tag', 'uq_attribute');
    }
}

/**
 * Converts HTML tokens into selected SAX events.
 * @package SimpleTest
 * @subpackage WebTester
 */
class SimpleHtmlSaxParser {
    protected $_lexer;
    protected $_listener;
    protected $_tag;
    protected $_attributes;
    protected $_current_attribute;

    /**
     * Sets the listener.
     * @param SimpleSaxListener $listener    SAX event handler.
     * @access public
     */
    public function __construct($listener) {
        $this->_listener = $listener;
        $this->_lexer = $this->createLexer($this);
        $this->_tag = '';
        $this->_attributes = array();
        $this->_current_attribute = '';
    }

    /**
     * Legacy PHP4-style initialiser kept for existing callers.
     * @param SimpleSaxListener $listener    SAX event handler.
     * @deprecated Use the real constructor.
     * @access public
     */
    public function SimpleHtmlSaxParser($listener) {
        $this->__construct($listener);
    }

    /**
     * Runs the content through the lexer which
     * should call back to the acceptors.
     * @param string $raw      Page text to parse.
     * @return boolean         False if parse error.
     * @access public
     */
    public function parse($raw) {
        return $this->_lexer->parse($raw);
    }

    /**
     * Sets up the matching lexer. Starts in 'text' mode.
     * @param SimpleSaxParser $parser    Event generator, usually $self.
     * @return SimpleLexer               Lexer suitable for this parser.
     * @access public
     * @static
     */
    public static function &createLexer($parser) {
        $lexer = new SimpleHtmlLexer($parser);
        return $lexer;
    }

    /**
     * Accepts a token from the tag mode. If the starting element
     * completes then the element is dispatched and the current
     * attributes set back to empty. The element or attribute
     * name is converted to lower case.
     * @param string $token     Incoming characters.
     * @param integer $event    Lexer event type.
     * @return boolean          False if parse error.
     * @access public
     */
    public function acceptStartToken($token, $event) {
        if ($event == LEXER_ENTER) {
            // Token is "<name"; drop the leading bracket.
            $this->_tag = strtolower(substr($token, 1));
            return true;
        }
        if ($event == LEXER_EXIT) {
            $success = $this->_listener->startElement(
                    $this->_tag,
                    $this->_attributes);
            $this->_tag = '';
            $this->_attributes = array();
            return $success;
        }
        if ($token != '=') {
            $this->_current_attribute = strtolower(SimpleHtmlSaxParser::decodeHtml($token));
            $this->_attributes[$this->_current_attribute] = '';
        }
        return true;
    }

    /**
     * Accepts a token from the end tag mode.
     * The element name is converted to lower case.
     * @param string $token     Incoming characters.
     * @param integer $event    Lexer event type.
     * @return boolean          False if parse error.
     * @access public
     */
    public function acceptEndToken($token, $event) {
        if (! preg_match('/<\/(.*)>/', $token, $matches)) {
            return false;
        }
        return $this->_listener->endElement(strtolower($matches[1]));
    }

    /**
     * Part of the tag data: appends attribute value text to the
     * attribute currently being collected.
     * @param string $token     Incoming characters.
     * @param integer $event    Lexer event type.
     * @return boolean          Always true.
     * @access public
     */
    public function acceptAttributeToken($token, $event) {
        if ($event == LEXER_UNMATCHED) {
            $value = SimpleHtmlSaxParser::decodeHtml($token);
        } elseif ($event == LEXER_SPECIAL) {
            // Unquoted attribute: the token still carries its "=".
            $value = preg_replace('/^=\s*/' , '', SimpleHtmlSaxParser::decodeHtml($token));
        } else {
            return true;
        }
        // A value may arrive without a preceding attribute name
        // (e.g. <a ="x">); make sure the slot exists before appending.
        if (! isset($this->_attributes[$this->_current_attribute])) {
            $this->_attributes[$this->_current_attribute] = '';
        }
        $this->_attributes[$this->_current_attribute] .= $value;
        return true;
    }

    /**
     * A character entity. Nothing is done with it here.
     * @param string $token    Incoming characters.
     * @param integer $event   Lexer event type.
     * @return boolean         Always true, so that the lexer does
     *                         not treat the token as a parse error.
     * @access public
     */
    public function acceptEntityToken($token, $event) {
        return true;
    }

    /**
     * Character data between tags regarded as important.
     * @param string $token     Incoming characters.
     * @param integer $event    Lexer event type.
     * @return boolean          False if parse error.
     * @access public
     */
    public function acceptTextToken($token, $event) {
        return $this->_listener->addContent($token);
    }

    /**
     * Incoming data to be ignored.
     * @param string $token     Incoming characters.
     * @param integer $event    Lexer event type.
     * @return boolean          Always true.
     * @access public
     */
    public function ignore($token, $event) {
        return true;
    }

    /**
     * Decodes any HTML entities listed in PHP's HTML_ENTITIES
     * translation table.
     * @param string $html      Incoming HTML.
     * @return string           Outgoing plain text.
     * @access public
     * @static
     */
    public static function decodeHtml($html) {
        static $translations;
        if (! isset($translations)) {
            // Flip character => entity into entity => character, once.
            $translations = array_flip(get_html_translation_table(HTML_ENTITIES));
        }
        return strtr($html, $translations);
    }

    /**
     * Turns HTML into text browser visible text. Images are
     * converted to their alt text and tags are suppressed.
     * Entities are converted to their visible representation.
     * @param string $html        HTML to convert.
     * @return string             Plain text.
     * @access public
     * @static
     */
    public static function normalise($html) {
        $text = preg_replace('|<!--.*?-->|', '', $html);
        $text = preg_replace('|<img.*?alt\s*=\s*"(.*?)".*?>|', ' \1 ', $text);
        $text = preg_replace('|<img.*?alt\s*=\s*\'(.*?)\'.*?>|', ' \1 ', $text);
        $text = preg_replace('|<img.*?alt\s*=\s*([a-zA-Z_]+).*?>|', ' \1 ', $text);
        $text = preg_replace('|<.*?>|', '', $text);
        $text = SimpleHtmlSaxParser::decodeHtml($text);
        $text = preg_replace('|\s+|', ' ', $text);
        return trim($text);
    }
}

/**
 * SAX event handler. The callbacks here are empty placeholders
 * that return nothing; subclasses are expected to override them.
 * @package SimpleTest
 * @subpackage WebTester
 * @abstract
 */
class SimpleSaxListener {

    /**
     * Does nothing.
     * @access public
     */
    public function __construct() {
    }

    /**
     * Legacy PHP4-style initialiser kept so that subclasses calling
     * $this->SimpleSaxListener() keep working.
     * @deprecated Use the real constructor.
     * @access public
     */
    public function SimpleSaxListener() {
    }

    /**
     * Start of element event.
     * @param string $name        Element name.
     * @param hash $attributes    Name value pairs.
     * @return boolean            False on parse error.
     * @access public
     */
    public function startElement($name, $attributes) {
    }

    /**
     * End of element event.
     * @param string $name        Element name.
     * @return boolean            False on parse error.
     * @access public
     */
    public function endElement($name) {
    }

    /**
     * Unparsed, but relevant data.
     * @param string $text        May include unparsed tags.
     * @return boolean            False on parse error.
     * @access public
     */
    public function addContent($text) {
    }
}
?>
titre
Description
Corps
titre
Description
Corps
titre
Description
Corps
titre
Corps
Généré le : Sun Feb 25 21:07:04 2007 | par Balluche grâce à PHPXref 0.7 |