[ Index ] |
|
Code source de PRADO 3.0.6 |
<?php
/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */

/**
 * SafeHTML Parser
 *
 * PHP version 5 or later
 *
 * @category   HTML
 * @package    System.Security
 * @author     Roman Ivanov <thingol@mail.ru>
 * @copyright  2004-2005 Roman Ivanov
 * @license    http://www.debian.org/misc/bsd.license  BSD License (3 Clause)
 * @version    1.3.7
 * @link       http://pixel-apes.com/safehtml/
 */


/**
 * This package requires HTMLSax3 package
 */
Prado::using('System.3rdParty.SafeHtml.HTMLSax3');


/**
 * SafeHTML Parser
 *
 * This parser strips down all potentially dangerous content within HTML:
 * <ul>
 * <li>opening tag without its closing tag</li>
 * <li>closing tag without its opening tag</li>
 * <li>any of these tags: "base", "basefont", "head", "html", "body", "applet",
 * "object", "iframe", "frame", "frameset", "script", "layer", "ilayer", "embed",
 * "bgsound", "link", "meta", "style", "title", "blink", "xml" etc.</li>
 * <li>any of these attributes: on*, data*, dynsrc, id, name</li>
 * <li>javascript:/vbscript:/about: etc. protocols</li>
 * <li>expression/behavior etc. in styles</li>
 * <li>any other active content</li>
 * </ul>
 * It also tries to convert code to XHTML valid, but htmltidy is far better
 * solution for this task.
 *
 * <b>Example:</b>
 * <pre>
 * $parser = new TSafeHtmlParser();
 * $result = $parser->parse($doc);
 * </pre>
 *
 * @category   HTML
 * @package    System.Security
 * @author     Roman Ivanov <thingol@mail.ru>
 * @copyright  1997-2005 Roman Ivanov
 * @license    http://www.debian.org/misc/bsd.license  BSD License (3 Clause)
 * @version    Release: @package_version@
 * @link       http://pear.php.net/package/SafeHTML
 */
class TSafeHtmlParser
{
    /**
     * Storage for resulting HTML output
     *
     * @var string
     */
    private $_xhtml = '';

    /**
     * Number of currently open (emitted, not yet closed) tags, keyed by tag name
     *
     * @var array
     */
    private $_counter = array();

    /**
     * Stack of unclosed tags
     *
     * @var array
     */
    private $_stack = array();

    /**
     * Number of currently open "delete with content" tags, keyed by tag name
     *
     * @var array
     */
    private $_dcCounter = array();

    /**
     * Stack of unclosed tags that must be deleted with all content
     *
     * @var array
     */
    private $_dcStack = array();

    /**
     * Stores level of list (ol/ul) nesting
     *
     * @var int
     */
    private $_listScope = 0;

    /**
     * Stack of unclosed <li> tags; each entry is the list nesting level
     * at which that <li> was opened
     *
     * @var array
     */
    private $_liStack = array();

    /**
     * Array of prepared regular expressions for protocols (schemas) matching
     *
     * @var array
     */
    private $_protoRegexps = array();

    /**
     * Array of prepared regular expressions for CSS matching
     *
     * @var array
     */
    private $_cssRegexps = array();

    /**
     * List of single tags ("<tag />")
     *
     * @var array
     */
    public $singleTags = array('area', 'br', 'img', 'input', 'hr', 'wbr', );

    /**
     * List of dangerous tags (such tags will be deleted)
     *
     * @var array
     */
    public $deleteTags = array(
        'applet', 'base', 'basefont', 'bgsound', 'blink', 'body',
        'embed', 'frame', 'frameset', 'head', 'html', 'ilayer',
        'iframe', 'layer', 'link', 'meta', 'object', 'style',
        'title', 'script',
        );

    /**
     * List of dangerous tags (such tags will be deleted, and all content
     * inside this tags will be also removed)
     *
     * @var array
     */
    public $deleteTagsContent = array('script', 'style', 'title', 'xml', );

    /**
     * Type of protocols filtering ('white' or 'black')
     *
     * @var string
     */
    public $protocolFiltering = 'white';

    /**
     * List of "dangerous" protocols (used for blacklist-filtering).
     *
     * The matching regular expressions are compiled from this list in the
     * constructor, so changes made after construction are not picked up.
     *
     * @var array
     */
    public $blackProtocols = array(
        'about', 'chrome', 'data', 'disk', 'hcp',
        'help', 'javascript', 'livescript', 'lynxcgi', 'lynxexec',
        'ms-help', 'ms-its', 'mhtml', 'mocha', 'opera',
        'res', 'resource', 'shell', 'vbscript', 'view-source',
        'vnd.ms.radio', 'wysiwyg',
        );

    /**
     * List of "safe" protocols (used for whitelist-filtering)
     *
     * @var array
     */
    public $whiteProtocols = array(
        'ed2k', 'file', 'ftp', 'gopher', 'http', 'https',
        'irc', 'mailto', 'news', 'nntp', 'telnet', 'webcal',
        'xmpp', 'callto',
        );

    /**
     * List of attributes that can contain protocols
     *
     * @var array
     */
    public $protocolAttributes = array(
        'action', 'background', 'codebase', 'dynsrc', 'href', 'lowsrc', 'src',
        );

    /**
     * List of dangerous CSS keywords
     *
     * Whole style="" attribute will be removed, if parser will find one of
     * these keywords. The matching regular expressions are compiled from
     * this list in the constructor.
     *
     * @var array
     */
    public $cssKeywords = array(
        'absolute', 'behavior', 'behaviour', 'content', 'expression',
        'fixed', 'include-source', 'moz-binding',
        );

    /**
     * List of tags that can have no "closing tag"
     *
     * @var array
     * @deprecated XHTML does not allow such tags
     */
    public $noClose = array();

    /**
     * List of block-level tags that terminates paragraph
     *
     * Paragraph will be closed when this tags opened
     *
     * @var array
     */
    public $closeParagraph = array(
        'address', 'blockquote', 'center', 'dd', 'dir', 'div',
        'dl', 'dt', 'h1', 'h2', 'h3', 'h4',
        'h5', 'h6', 'hr', 'isindex', 'listing', 'marquee',
        'menu', 'multicol', 'ol', 'p', 'plaintext', 'pre',
        'table', 'ul', 'xmp',
        );

    /**
     * List of table tags, all table tags outside a table will be removed
     *
     * @var array
     */
    public $tableTags = array(
        'caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
        'thead', 'tr',
        );

    /**
     * List of list tags
     *
     * @var array
     */
    public $listTags = array('dir', 'menu', 'ol', 'ul', 'dl', );

    /**
     * List of dangerous attributes
     *
     * @var array
     */
    public $attributes = array('dynsrc', 'id', 'name', );

    /**
     * List of allowed "namespaced" attributes
     *
     * @var array
     */
    public $attributesNS = array('xml:lang', );

    /**
     * Compiles the protocol and CSS keyword lists into regular expressions.
     *
     * Each blacklisted protocol becomes a pattern that tolerates whitespace
     * and control characters (0x01-0x1F) before and between its letters,
     * followed by a colon, so that e.g. "java\tscript:" is still recognised.
     */
    public function __construct()
    {
        // Filler allowed between the individual characters of a protocol name.
        $gap = '[\s\x01-\x1F]*';

        foreach ($this->blackProtocols as $proto) {
            $pattern = '/' . $gap;
            $length = strlen($proto);
            for ($i = 0; $i < $length; $i++) {
                // preg_quote keeps characters such as "." literal.
                $pattern .= preg_quote($proto[$i], '/') . $gap;
            }
            $this->_protoRegexps[] = $pattern . ':/i';
        }

        foreach ($this->cssKeywords as $css) {
            $this->_cssRegexps[] = '/' . preg_quote($css, '/') . '/i';
        }
    }

    /**
     * Handles the writing of attributes - called from $this->_openHandler()
     *
     * Attributes are skipped when their name starts with "on" or "data",
     * is listed in $attributes, or is neither purely alphanumeric nor listed
     * in $attributesNS. A style attribute is dropped when it matches a CSS
     * keyword or blacklisted protocol; an attribute listed in
     * $protocolAttributes is dropped when its protocol is not acceptable.
     *
     * @param array $attrs array of attributes $name => $value
     * @return boolean always true
     */
    private function _writeAttrs($attrs)
    {
        if (!is_array($attrs)) {
            return true;
        }

        foreach ($attrs as $name => $value) {
            $name = strtolower((string) $name);

            if (strpos($name, 'on') === 0 || strpos($name, 'data') === 0) {
                continue;
            }
            if (in_array($name, $this->attributes, true)) {
                continue;
            }
            if (!preg_match('/^[a-z0-9]+$/i', $name)
                && !in_array($name, $this->attributesNS, true)) {
                continue;
            }

            // Valueless attributes are written in the name="name" form.
            if ($value === true || $value === null) {
                $value = $name;
            }
            $value = (string) $value;

            if ($name === 'style') {
                $value = $this->_cleanStyle($value);
                if ($value === null) {
                    continue;
                }
            }

            if (in_array($name, $this->protocolAttributes, true)
                && !$this->_hasAcceptableProtocol($value)) {
                continue;
            }

            $this->_xhtml .= ' ' . $name . '="' . str_replace('"', '&quot;', $value) . '"';
        }
        return true;
    }

    /**
     * Normalises the value of a style attribute and checks it for dangerous content.
     *
     * @param string $value raw style attribute value
     * @return string|null normalised value, or null when the attribute must be dropped
     */
    private function _cleanStyle($value)
    {
        // removes insignificant backslashes
        $value = str_replace('\\', '', $value);

        // removes CSS comments; repeated because removing one comment
        // can bring the delimiters of another one together
        do {
            $stripped = preg_replace('!/\*.*?\*/!s', '', $value);
            if ($stripped === null) {
                // PCRE failure: the value cannot be vetted, so reject it.
                return null;
            }
            $changed = ($stripped !== $value);
            $value = $stripped;
        } while ($changed);

        // replace all & to &amp; (without double-encoding existing &amp;)
        $value = str_replace('&amp;', '&', $value);
        $value = str_replace('&', '&amp;', $value);

        foreach ($this->_cssRegexps as $regexp) {
            if (preg_match($regexp, $value)) {
                return null;
            }
        }
        foreach ($this->_protoRegexps as $regexp) {
            if (preg_match($regexp, $value)) {
                return null;
            }
        }
        return $value;
    }

    /**
     * Checks the protocol of a URL-carrying attribute value.
     *
     * Numeric character references (&#NN; and &#xHH;) are decoded on a copy
     * of the value before checking. A value without a colon is accepted.
     * In 'black' mode the value is rejected when it matches a blacklisted
     * protocol; otherwise the text before the first colon must be listed
     * in $whiteProtocols (exact match).
     *
     * NOTE(review): only numeric references are decoded here; named
     * references are not - confirm whether that is sufficient.
     *
     * @param string $value attribute value as it will be written
     * @return boolean true when the attribute may be kept
     */
    private function _hasAcceptableProtocol($value)
    {
        $decoded = preg_replace_callback('/&#(\d+);?/', array($this, '_decodeDecimalEntity'), $value);
        if ($decoded !== null) {
            $decoded = preg_replace_callback('/&#x([0-9a-f]+);?/i', array($this, '_decodeHexEntity'), $decoded);
        }
        if ($decoded === null) {
            // PCRE failure: the value cannot be vetted, so reject it.
            return false;
        }

        if (strpos($decoded, ':') === false) {
            return true;
        }

        if ($this->protocolFiltering == 'black') {
            foreach ($this->_protoRegexps as $regexp) {
                if (preg_match($regexp, $decoded)) {
                    return false;
                }
            }
            return true;
        }

        $parts = explode(':', $decoded);
        return in_array($parts[0], $this->whiteProtocols, true);
    }

    /**
     * preg_replace_callback handler: turns a decimal character reference into a byte.
     *
     * @param array $matches regex matches; $matches[1] holds the decimal digits
     * @return string single byte (code taken modulo 256)
     */
    private function _decodeDecimalEntity($matches)
    {
        return chr(((int) $matches[1]) & 0xFF);
    }

    /**
     * preg_replace_callback handler: turns a hexadecimal character reference into a byte.
     *
     * @param array $matches regex matches; $matches[1] holds the hex digits
     * @return string single byte (code taken modulo 256)
     */
    private function _decodeHexEntity($matches)
    {
        // The last two hex digits are the value modulo 256.
        return chr(hexdec(substr($matches[1], -2)));
    }

    /**
     * Opening tag handler - registered by name with the TSax3 parser in parse()
     *
     * @param object $parser HTML Parser
     * @param string $name   tag name
     * @param array  $attrs  tag attributes
     * @return boolean always true
     */
    public function _openHandler(&$parser, $name, $attrs)
    {
        $name = strtolower($name);

        if (in_array($name, $this->deleteTagsContent, true)) {
            $this->_dcStack[] = $name;
            $this->_dcCounter[$name] = isset($this->_dcCounter[$name]) ? $this->_dcCounter[$name] + 1 : 1;
        }
        // Inside a "delete with content" tag nothing is emitted.
        if (count($this->_dcStack) != 0) {
            return true;
        }

        if (in_array($name, $this->deleteTags, true)) {
            return true;
        }

        if (!preg_match('/^[a-z0-9]+$/i', $name)) {
            // Names containing "@" or "://" are kept, but as escaped text.
            if (preg_match('!(?:\@|://)!i', $name)) {
                $this->_xhtml .= '&lt;' . $name . '&gt;';
            }
            return true;
        }

        if (in_array($name, $this->singleTags, true)) {
            $this->_xhtml .= '<' . $name;
            $this->_writeAttrs($attrs);
            $this->_xhtml .= ' />';
            return true;
        }

        // TABLES: cannot open table elements when we are not inside table
        if (empty($this->_counter['table']) && in_array($name, $this->tableTags, true)) {
            return true;
        }

        // PARAGRAPHS: close paragraph when closeParagraph tags opening
        if (in_array($name, $this->closeParagraph, true) && in_array('p', $this->_stack, true)) {
            $this->_closeHandler($parser, 'p');
        }

        // LISTS: we should close <li> if <li> of the same level opening
        if ($name === 'li' && count($this->_liStack)
            && $this->_listScope == $this->_liStack[count($this->_liStack) - 1]) {
            $this->_closeHandler($parser, 'li');
        }

        // LISTS: we want to know on what nesting level of lists we are
        if (in_array($name, $this->listTags, true)) {
            $this->_listScope++;
        }
        if ($name === 'li') {
            $this->_liStack[] = $this->_listScope;
        }

        $this->_xhtml .= '<' . $name;
        $this->_writeAttrs($attrs);
        $this->_xhtml .= '>';
        $this->_stack[] = $name;
        $this->_counter[$name] = isset($this->_counter[$name]) ? $this->_counter[$name] + 1 : 1;
        return true;
    }

    /**
     * Closing tag handler - registered by name with the TSax3 parser in parse()
     *
     * A closing tag with no matching open tag is ignored. Otherwise every
     * tag opened after the matching one is closed first.
     *
     * @param object $parser HTML parser
     * @param string $name   tag name
     * @return boolean always true
     */
    public function _closeHandler(&$parser, $name)
    {
        $name = strtolower($name);

        if (isset($this->_dcCounter[$name]) && ($this->_dcCounter[$name] > 0)
            && in_array($name, $this->deleteTagsContent, true)) {
            // Unwind the "delete with content" stack down to the matching tag.
            while (null !== ($tag = array_pop($this->_dcStack)) && $tag !== $name) {
                $this->_dcCounter[$tag]--;
            }
            $this->_dcCounter[$name]--;
        }

        if (count($this->_dcStack) != 0) {
            return true;
        }

        if (isset($this->_counter[$name]) && ($this->_counter[$name] > 0)) {
            // Close everything that was opened after the matching tag.
            while (null !== ($tag = array_pop($this->_stack)) && $tag !== $name) {
                $this->_closeTag($tag);
            }
            $this->_closeTag($name);
        }
        return true;
    }

    /**
     * Writes a closing tag (unless listed in $noClose) and updates the
     * open-tag counter and list nesting state.
     *
     * @param string $tag tag name
     * @return boolean always true
     */
    public function _closeTag($tag)
    {
        if (!in_array($tag, $this->noClose, true)) {
            $this->_xhtml .= '</' . $tag . '>';
        }

        if (isset($this->_counter[$tag])) {
            $this->_counter[$tag]--;
        }

        if (in_array($tag, $this->listTags, true)) {
            $this->_listScope--;
        }

        if ($tag === 'li') {
            array_pop($this->_liStack);
        }
        return true;
    }

    /**
     * Character data handler - registered by name with the TSax3 parser in parse()
     *
     * Data is appended to the output unless a "delete with content" tag is open.
     *
     * @param object $parser HTML parser
     * @param string $data   textual data
     * @return boolean always true
     */
    public function _dataHandler(&$parser, $data)
    {
        if (count($this->_dcStack) == 0) {
            $this->_xhtml .= $data;
        }
        return true;
    }

    /**
     * Escape handler - registered by name with the TSax3 parser in parse()
     *
     * Escaped data is discarded: nothing is written to the output.
     *
     * @param object $parser HTML parser
     * @param string $data   comments or other type of data
     * @return boolean always true
     */
    public function _escapeHandler(&$parser, $data)
    {
        return true;
    }

    /**
     * Closes all still-open tags and returns the XHTML document
     *
     * @return string Processed (X)HTML document
     */
    public function getXHTML()
    {
        while (null !== ($tag = array_pop($this->_stack))) {
            $this->_closeTag($tag);
        }

        return $this->_xhtml;
    }

    /**
     * Clears current document data
     *
     * Resets the output buffer and all per-document parsing state, so that
     * leftovers of one document (for example an unclosed <script>) cannot
     * affect the next call to parse() on the same instance.
     *
     * @return boolean always true
     */
    public function clear()
    {
        $this->_xhtml = '';
        $this->_counter = array();
        $this->_stack = array();
        $this->_dcCounter = array();
        $this->_dcStack = array();
        $this->_listScope = 0;
        $this->_liStack = array();
        return true;
    }

    /**
     * Main parsing function
     *
     * @param string $doc HTML document for processing
     * @return string Processed (X)HTML document
     */
    public function parse($doc)
    {
        $this->clear();

        // Save all '<' symbols that cannot start a tag by escaping them
        $doc = preg_replace('/<(?=[^a-zA-Z\/\!\?\%])/', '&lt;', (string) $doc);

        // Web documents shouldn't contains \x00 symbol
        $doc = str_replace("\x00", '', $doc);

        // Opera6 bug workaround
        $doc = str_replace("\xC0\xBC", '&lt;', $doc);

        // UTF-7 encoding ASCII decode
        $doc = $this->repackUTF7($doc);

        // Instantiate the parser
        $parser = new TSax3();

        // Set up the parser
        $parser->set_object($this);

        $parser->set_element_handler('_openHandler', '_closeHandler');
        $parser->set_data_handler('_dataHandler');
        $parser->set_escape_handler('_escapeHandler');

        $parser->parse($doc);

        return $this->getXHTML();
    }


    /**
     * UTF-7 decoding function
     *
     * Passes every "+<base64>-" sequence through repackUTF7Callback().
     *
     * @param string $str HTML document for recode ASCII part of UTF-7 back to ASCII
     * @return string Decoded document
     */
    private function repackUTF7($str)
    {
        return preg_replace_callback('!\+([0-9a-zA-Z/]+)\-!', array($this, 'repackUTF7Callback'), $str);
    }

    /**
     * Additional UTF-7 decoding function
     *
     * Base64-decodes the sequence, re-encodes (via repackUTF7Back) the run
     * of byte pairs whose first byte is not \x00 that follows the leading
     * run of \x00-prefixed pairs, and then drops each remaining \x00 prefix.
     *
     * @param array $matches regex matches; $matches[1] is the base64 payload
     * @return string Recoded string
     */
    private function repackUTF7Callback($matches)
    {
        $decoded = base64_decode($matches[1]);
        $decoded = preg_replace_callback('/^((?:\x00.)*)((?:[^\x00].)+)/', array($this, 'repackUTF7Back'), $decoded);
        return preg_replace('/\x00(.)/', '$1', $decoded);
    }

    /**
     * Additional UTF-7 encoding function
     *
     * Keeps the first captured group as is and wraps the second one back
     * into a "+<base64>-" sequence (base64 padding removed).
     *
     * @param array $matches regex matches from repackUTF7Callback()
     * @return string Recoded string
     */
    private function repackUTF7Back($matches)
    {
        return $matches[1] . '+' . rtrim(base64_encode($matches[2]), '=') . '-';
    }
}

/*
 * Local variables:
 * tab-width: 4
 * c-basic-offset: 4
 * c-hanging-comment-ender-p: nil
 * End:
 */

?>
titre
Description
Corps
titre
Description
Corps
titre
Description
Corps
titre
Corps
Généré le : Sun Feb 25 21:07:04 2007 | par Balluche grâce à PHPXref 0.7 |