[ Index ] |
|
Code source de SPIP 1.8.3 |
<?php
/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */

/**
 * SafeHTML Parser
 *
 * Originally written for PHP 4 and 5. This file avoids the constructs that
 * were removed in PHP 7 and 8 (the /e regex modifier, "=& new", curly-brace
 * string offsets and class-named constructors).
 *
 * @category   HTML
 * @package    SafeHTML
 * @author     Roman Ivanov <thingol@mail.ru>
 * @copyright  2004-2005 Roman Ivanov
 * @license    http://www.debian.org/misc/bsd.license  BSD License (3 Clause)
 * @version    CVS: $Id:$
 * @link       http://pixel-apes.com/safehtml/
 */


/**
 * This package requires HTMLSax3 package
 */
require_once (XML_HTMLSAX3 . 'HTMLSax3.php');


/**
 *
 * SafeHTML Parser
 *
 * This parser strips down all potentially dangerous content within HTML:
 * <ul>
 * <li>opening tag without its closing tag</li>
 * <li>closing tag without its opening tag</li>
 * <li>any of these tags: "base", "basefont", "head", "html", "body", "applet",
 * "object", "iframe", "frame", "frameset", "script", "layer", "ilayer", "embed",
 * "bgsound", "link", "meta", "style", "title", "blink", "xml" etc.</li>
 * <li>any of these attributes: on*, data*, dynsrc</li>
 * <li>javascript:/vbscript:/about: etc. protocols</li>
 * <li>expression/behavior etc. in styles</li>
 * <li>any other active content</li>
 * </ul>
 * It also tries to convert code to XHTML valid, but htmltidy is far better
 * solution for this task.
 *
 * <b>Example:</b>
 * <pre>
 * $parser = new SafeHTML();
 * $result = $parser->parse($doc);
 * </pre>
 *
 * The parser object accumulates output and parsing state; call clear()
 * before reusing one instance for another document.
 *
 * @category   HTML
 * @package    SafeHTML
 * @author     Roman Ivanov <thingol@mail.ru>
 * @copyright  1997-2005 Roman Ivanov
 * @license    http://www.debian.org/misc/bsd.license  BSD License (3 Clause)
 * @version    Release: @package_version@
 * @link       http://pear.php.net/package/SafeHTML
 */
class SafeHTML
{
    /**
     * Storage for resulting HTML output
     *
     * @var string
     * @access private
     */
    var $_xhtml = '';

    /**
     * Array of counters for each tag
     *
     * @var array
     * @access private
     */
    var $_counter = array();

    /**
     * Stack of unclosed tags
     *
     * @var array
     * @access private
     */
    var $_stack = array();

    /**
     * Array of counters for tags that must be deleted with all content
     *
     * @var array
     * @access private
     */
    var $_dcCounter = array();

    /**
     * Stack of unclosed tags that must be deleted with all content
     *
     * @var array
     * @access private
     */
    var $_dcStack = array();

    /**
     * Stores level of list (ol/ul) nesting
     *
     * @var int
     * @access private
     */
    var $_listScope = 0;

    /**
     * Stack of unclosed list tags
     *
     * @var array
     * @access private
     */
    var $_liStack = array();

    /**
     * Array of prepared regular expressions for protocols (schemas) matching
     *
     * @var array
     * @access private
     */
    var $_protoRegexps = array();

    /**
     * Array of prepared regular expressions for CSS matching
     *
     * @var array
     * @access private
     */
    var $_cssRegexps = array();

    /**
     * List of single tags ("<tag />")
     *
     * @var array
     * @access public
     */
    var $singleTags = array('area', 'br', 'img', 'input', 'hr', 'wbr', );

    /**
     * List of dangerous tags (such tags will be deleted)
     *
     * @var array
     * @access public
     */
    var $deleteTags = array(
        'applet', 'base',   'basefont', 'bgsound', 'blink',  'body',
        'embed',  'frame',  'frameset', 'head',    'html',   'ilayer',
        'iframe', 'layer',  'link',     'meta',    'object', 'style',
        'title',  'script',
        );

    /**
     * List of dangerous tags (such tags will be deleted, and all content
     * inside this tags will be also removed)
     *
     * @var array
     * @access public
     */
    var $deleteTagsContent = array('script', 'style', 'title', 'xml', );

    /**
     * Type of protocols filtering ('white' or 'black')
     *
     * @var string
     * @access public
     */
    var $protocolFiltering = 'white';

    /**
     * List of "dangerous" protocols (used for blacklist-filtering)
     *
     * The matching regular expressions are built from this list when the
     * object is constructed.
     *
     * @var array
     * @access public
     */
    var $blackProtocols = array(
        'about',   'chrome',     'data',       'disk',     'hcp',
        'help',    'javascript', 'livescript', 'lynxcgi',  'lynxexec',
        'ms-help', 'ms-its',     'mhtml',      'mocha',    'opera',
        'res',     'resource',   'shell',      'vbscript', 'view-source',
        'vnd.ms.radio',          'wysiwyg',
        );

    /**
     * List of "safe" protocols (used for whitelist-filtering)
     *
     * @var array
     * @access public
     */
    var $whiteProtocols = array(
        'ed2k',   'file', 'ftp',  'gopher', 'http',  'https',
        'irc',    'mailto', 'news', 'nntp', 'telnet', 'webcal',
        'xmpp',
        );

    /**
     * List of attributes that can contain protocols
     *
     * @var array
     * @access public
     */
    var $protocolAttributes = array(
        'action', 'background', 'codebase', 'dynsrc', 'href', 'lowsrc', 'src',
        );

    /**
     * List of dangerous CSS keywords
     *
     * Whole style="" attribute will be removed, if parser will find one of
     * these keywords. Keywords are matched literally and case-insensitively;
     * the matching regular expressions are built when the object is
     * constructed.
     *
     * @var array
     * @access public
     */
    var $cssKeywords = array(
        'absolute', 'behavior',       'behaviour',   'content', 'expression',
        'fixed',    'include-source', 'moz-binding',
        );

    /**
     * List of tags that can have no "closing tag"
     *
     * @var array
     * @access public
     * @deprecated XHTML does not allow such tags
     */
    var $noClose = array();

    /**
     * List of block-level tags that terminates paragraph
     *
     * Paragraph will be closed when this tags opened
     *
     * @var array
     * @access public
     */
    var $closeParagraph = array(
        'address', 'blockquote', 'center', 'dd',      'dir',       'div',
        'dl',      'dt',         'h1',     'h2',      'h3',        'h4',
        'h5',      'h6',         'hr',     'isindex', 'listing',   'marquee',
        'menu',    'multicol',   'ol',     'p',       'plaintext', 'pre',
        'table',   'ul',         'xmp',
        );

    /**
     * List of table tags, all table tags outside a table will be removed
     *
     * @var array
     * @access public
     */
    var $tableTags = array(
        'caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
        'thead',   'tr',
        );

    /**
     * List of list tags
     *
     * @var array
     * @access public
     */
    var $listTags = array('dir', 'menu', 'ol', 'ul', );

    /**
     * List of dangerous attributes
     *
     * @var array
     * @access public
     */
    var $attributes = array('dynsrc', 'id', 'name', );

    /**
     * Constructs class
     *
     * Compiles $blackProtocols and $cssKeywords into the regular expressions
     * stored in $_protoRegexps and $_cssRegexps.
     *
     * @access public
     */
    function __construct()
    {
        // Start from empty lists so that calling the constructor again
        // (e.g. through the SafeHTML() alias) does not duplicate entries.
        $this->_protoRegexps = array();
        $this->_cssRegexps   = array();

        // Whitespace and control characters are tolerated before, between
        // and after the letters of a protocol name ("java\nscript:").
        $gap = "[\s\x01-\x1F]*";

        foreach ($this->blackProtocols as $proto) {
            $preg   = '/' . $gap;
            $length = strlen($proto);
            for ($i = 0; $i < $length; $i++) {
                // Quote each character: names such as "vnd.ms.radio" contain
                // regex metacharacters that must be matched literally.
                $preg .= preg_quote($proto[$i], '/') . $gap;
            }
            $preg .= ':/i';
            $this->_protoRegexps[] = $preg;
        }

        foreach ($this->cssKeywords as $css) {
            $this->_cssRegexps[] = '/' . preg_quote($css, '/') . '/i';
        }
    }

    /**
     * Class-named constructor kept for callers that invoke it explicitly
     *
     * @return boolean
     * @access public
     * @deprecated use "new SafeHTML()" / __construct()
     */
    function SafeHTML()
    {
        $this->__construct();
        return true;
    }

    /**
     * Handles the writing of attributes - called from $this->_openHandler()
     *
     * Appends every attribute that survives filtering to $this->_xhtml as
     * ' name="value"'. Dropped are: attributes whose name starts with "on"
     * or "data", names listed in $this->attributes, names that are not purely
     * alphanumeric, style attributes rejected by _sanitizeStyle() and
     * protocol attributes rejected by _isAllowedUrl().
     *
     * @param array $attrs array of attributes $name => $value
     * @return boolean
     * @access private
     */
    function _writeAttrs ($attrs)
    {
        if (!is_array($attrs)) {
            return true;
        }

        foreach ($attrs as $name => $value) {

            $name = strtolower($name);

            if (strpos($name, 'on') === 0) {
                continue;
            }
            if (strpos($name, 'data') === 0) {
                continue;
            }
            if (in_array($name, $this->attributes, true)) {
                continue;
            }
            // \z instead of $: "$" would also accept a trailing newline
            if (!preg_match('/^[a-z0-9]+\z/i', $name)) {
                continue;
            }

            // attribute given without a value: use its own name (XHTML form)
            if (($value === true) || (is_null($value))) {
                $value = $name;
            }

            if ($name == 'style') {
                $value = $this->_sanitizeStyle($value);
                if ($value === false) {
                    continue;
                }
            }

            if (in_array($name, $this->protocolAttributes, true)
                && !$this->_isAllowedUrl($value))
            {
                continue;
            }

            $value = str_replace('"', '&quot;', $value);
            $this->_xhtml .= ' ' . $name . '="' . $value . '"';
        }
        return true;
    }

    /**
     * Cleans the value of a style attribute
     *
     * Removes backslashes and CSS comments, re-encodes every "&" so that
     * character references cannot hide a keyword, then rejects the value if
     * it matches any regular expression in $_cssRegexps or $_protoRegexps.
     *
     * @param string $value raw style attribute value
     * @return mixed cleaned value, or false when the attribute must be dropped
     * @access private
     */
    function _sanitizeStyle($value)
    {
        // removes insignificant backslashes
        $value = str_replace("\\", '', $value);

        // removes CSS comments; repeated until nothing changes because
        // removing one comment can join the pieces of another one
        do {
            $previous = $value;
            $value    = preg_replace('!/\*.*?\*/!s', '', $previous);
            if ($value === null) {
                // PCRE failure: the value cannot be inspected, so drop it
                return false;
            }
        } while ($value !== $previous);

        // replace all & to &amp; (existing &amp; first, to avoid doubling)
        $value = str_replace('&amp;', '&', $value);
        $value = str_replace('&', '&amp;', $value);

        foreach ($this->_cssRegexps as $css) {
            if (preg_match($css, $value)) {
                return false;
            }
        }
        foreach ($this->_protoRegexps as $proto) {
            if (preg_match($proto, $value)) {
                return false;
            }
        }
        return $value;
    }

    /**
     * Tells whether the value of a protocol attribute may be kept
     *
     * The check runs on a copy of the value in which numeric character
     * references and the named references for colon, tab and newline are
     * decoded; the value itself is never modified. A value without any colon
     * is accepted. Otherwise, with $protocolFiltering set to 'black' the
     * value is rejected when it matches one of $_protoRegexps; with any other
     * setting the text before the first colon must be listed in
     * $whiteProtocols exactly.
     *
     * @param string $value attribute value
     * @return boolean true when the attribute may be written
     * @access private
     */
    function _isAllowedUrl($value)
    {
        $decoded = preg_replace_callback(
            '/&#(\d+);?/',
            array($this, '_decodeDecimalEntity'),
            $value
        );
        if ($decoded === null) {
            return false;
        }
        $decoded = preg_replace_callback(
            '/&#x([0-9a-f]+);?/i',
            array($this, '_decodeHexEntity'),
            $decoded
        );
        if ($decoded === null) {
            return false;
        }

        // Named references a browser would turn into ":" or whitespace.
        $decoded = str_replace(
            array('&colon;', '&Tab;', '&NewLine;'),
            array(':',       "\t",    "\n"),
            $decoded
        );

        if (strpos($decoded, ':') === false) {
            return true;
        }

        if ($this->protocolFiltering == 'black') {
            foreach ($this->_protoRegexps as $proto) {
                if (preg_match($proto, $decoded)) {
                    return false;
                }
            }
            return true;
        }

        $parts = explode(':', $decoded);
        return in_array($parts[0], $this->whiteProtocols, true);
    }

    /**
     * preg_replace_callback() helper: decodes a decimal character reference
     *
     * @param array $match regex match, $match[1] holds the decimal digits
     * @return string single byte, code taken modulo 256 as chr() does
     * @access private
     */
    function _decodeDecimalEntity($match)
    {
        return chr(((int) $match[1]) & 0xFF);
    }

    /**
     * preg_replace_callback() helper: decodes a hexadecimal character reference
     *
     * @param array $match regex match, $match[1] holds the hexadecimal digits
     * @return string single byte
     * @access private
     */
    function _decodeHexEntity($match)
    {
        // Only the last two digits influence the code modulo 256; using them
        // alone keeps hexdec() within integer range for any input length.
        return chr(hexdec(substr($match[1], -2)));
    }

    /**
     * Opening tag handler - called from HTMLSax
     *
     * @param object $parser HTML Parser
     * @param string $name   tag name
     * @param array  $attrs  tag attributes
     * @return boolean
     * @access private
     */
    function _openHandler(&$parser, $name, $attrs)
    {
        $name = strtolower($name);

        if (in_array($name, $this->deleteTagsContent, true)) {
            array_push($this->_dcStack, $name);
            $this->_dcCounter[$name] = isset($this->_dcCounter[$name]) ? $this->_dcCounter[$name]+1 : 1;
        }
        // inside a tag whose whole content is deleted: emit nothing
        if (count($this->_dcStack) != 0) {
            return true;
        }

        if (in_array($name, $this->deleteTags, true)) {
            return true;
        }

        if (!preg_match('/^[a-z0-9]+\z/i', $name)) {
            // Not a real tag name. Things that look like an e-mail address or
            // an URL in angle brackets are kept as escaped text.
            if (preg_match('!(?:@|://)!', $name)) {
                $text = str_replace(array('<', '>'), array('&lt;', '&gt;'), $name);
                $this->_xhtml .= '&lt;' . $text . '&gt;';
            }
            return true;
        }

        if (in_array($name, $this->singleTags, true)) {
            $this->_xhtml .= '<' . $name;
            $this->_writeAttrs($attrs);
            $this->_xhtml .= ' />';
            return true;
        }

        // TABLES: cannot open table elements when we are not inside table
        // (a missing counter means no table has been opened at all)
        if (in_array($name, $this->tableTags, true)
            && (!isset($this->_counter['table']) || $this->_counter['table'] <= 0))
        {
            return true;
        }

        // PARAGRAPHS: close paragraph when closeParagraph tags opening
        if ((in_array($name, $this->closeParagraph, true)) && (in_array('p', $this->_stack, true))) {
            $this->_closeHandler($parser, 'p');
        }

        // LISTS: we should close <li> if <li> of the same level opening
        if ($name == 'li' && count($this->_liStack) &&
            $this->_listScope == $this->_liStack[count($this->_liStack)-1])
        {
            $this->_closeHandler($parser, 'li');
        }

        // LISTS: we want to know on what nesting level of lists we are
        if (in_array($name, $this->listTags, true)) {
            $this->_listScope++;
        }
        if ($name == 'li') {
            array_push($this->_liStack, $this->_listScope);
        }

        $this->_xhtml .= '<' . $name;
        $this->_writeAttrs($attrs);
        $this->_xhtml .= '>';
        array_push($this->_stack, $name);
        $this->_counter[$name] = isset($this->_counter[$name]) ? $this->_counter[$name]+1 : 1;
        return true;
    }

    /**
     * Closing tag handler - called from HTMLSax
     *
     * A closing tag is honoured only when a matching tag is currently open;
     * every tag opened after it is closed first.
     *
     * @param object $parser HTML parser
     * @param string $name   tag name
     * @return boolean
     * @access private
     */
    function _closeHandler(&$parser, $name)
    {

        $name = strtolower($name);

        if (isset($this->_dcCounter[$name]) && ($this->_dcCounter[$name] > 0) &&
            (in_array($name, $this->deleteTagsContent, true)))
        {
            // unwind the "delete content" stack down to the matching tag
            while (null !== ($tag = array_pop($this->_dcStack)) && $tag !== $name) {
                $this->_dcCounter[$tag]--;
            }

            $this->_dcCounter[$name]--;
        }

        if (count($this->_dcStack) != 0) {
            return true;
        }

        if ((isset($this->_counter[$name])) && ($this->_counter[$name] > 0)) {
            // close everything that was opened after the matching tag
            while (null !== ($tag = array_pop($this->_stack)) && $tag !== $name) {
                $this->_closeTag($tag);
            }

            $this->_closeTag($name);
        }
        return true;
    }

    /**
     * Closes tag
     *
     * Writes the closing tag (unless listed in $noClose) and updates the tag
     * counter, the list nesting level and the stack of open list items.
     *
     * @param string $tag tag name
     * @return boolean
     * @access private
     */
    function _closeTag($tag)
    {
        if (!in_array($tag, $this->noClose, true)) {
            $this->_xhtml .= '</' . $tag . '>';
        }

        $this->_counter[$tag]--;

        if (in_array($tag, $this->listTags, true)) {
            $this->_listScope--;
        }

        if ($tag == 'li') {
            array_pop($this->_liStack);
        }
        return true;
    }

    /**
     * Character data handler - called from HTMLSax
     *
     * @param object $parser HTML parser
     * @param string $data   textual data
     * @return boolean
     * @access private
     */
    function _dataHandler(&$parser, $data)
    {
        // text inside a "delete with content" tag is discarded
        if (count($this->_dcStack) == 0) {
            $this->_xhtml .= $data;
        }
        return true;
    }

    /**
     * Escape handler - called from HTMLSax
     *
     * Escaped data is discarded: nothing is written to the output.
     *
     * @param object $parser HTML parser
     * @param string $data   comments or other type of data
     * @return boolean
     * @access private
     */
    function _escapeHandler(&$parser, $data)
    {
        return true;
    }

    /**
     * Returns the XHTML document
     *
     * Closes every tag that is still open before returning the output.
     *
     * @return string Processed (X)HTML document
     * @access public
     */
    function getXHTML ()
    {
        // compare against null: a bare truthiness test would stop at "0"
        while (null !== ($tag = array_pop($this->_stack))) {
            $this->_closeTag($tag);
        }

        return $this->_xhtml;
    }

    /**
     * Clears current document data
     *
     * Resets the output buffer together with all parsing state, so that
     * leftovers of one document (for instance an unclosed script tag) cannot
     * affect the next one.
     *
     * @return boolean
     * @access public
     */
    function clear()
    {
        $this->_xhtml     = '';
        $this->_counter   = array();
        $this->_stack     = array();
        $this->_dcCounter = array();
        $this->_dcStack   = array();
        $this->_listScope = 0;
        $this->_liStack   = array();
        return true;
    }

    /**
     * Main parsing function
     *
     * @param string $doc HTML document for processing
     * @return string Processed (X)HTML document
     * @access public
     */
    function parse($doc)
    {

        // Save all '<' symbols that cannot start a tag
        $doc = preg_replace("/<(?=[^a-zA-Z\/\!\?\%])/", '&lt;', $doc);

        // Web documents shouldn't contain the \x00 symbol
        $doc = str_replace("\x00", '', $doc);

        // Opera6 bug workaround
        $doc = str_replace("\xC0\xBC", '&lt;', $doc);

        // UTF-7 encoding XSS workaround
        $doc = str_replace("+ADw-", '&lt;', $doc);

        // Instantiate the parser
        $parser = new XML_HTMLSax3();

        // Set up the parser
        $parser->set_object($this);

        $parser->set_element_handler('_openHandler', '_closeHandler');
        $parser->set_data_handler('_dataHandler');
        $parser->set_escape_handler('_escapeHandler');

        $parser->parse($doc);

        return $this->getXHTML();

    }

}

/*
 * Local variables:
 * tab-width: 4
 * c-basic-offset: 4
 * c-hanging-comment-ender-p: nil
 * End:
 */

?>
titre
Description
Corps
titre
Description
Corps
titre
Description
Corps
titre
Corps
Généré le : Thu Feb 22 22:27:47 2007 | par Balluche grâce à PHPXref 0.7 |