[ Index ]
 

Code source de SPIP 1.8.3

Accédez au Source d'autres logiciels libres | Soutenez Angelica Josefina !

title

Body

[fermer]

/ecrire/safehtml/classes/ -> safehtml.php (source)

   1  <?php
   2  /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
   3  
   4  /**
   5   * SafeHTML Parser
   6   *
   7   * PHP versions 4 and 5
   8   *
   9   * @category   HTML
  10   * @package    SafeHTML
  11   * @author     Roman Ivanov <thingol@mail.ru>
  12   * @copyright  2004-2005 Roman Ivanov
  13   * @license    http://www.debian.org/misc/bsd.license  BSD License (3 Clause)
  14   * @version    CVS: $Id:$
  15   * @link       http://pixel-apes.com/safehtml/
  16   */
  17  
  18  
  19  /**
  20   * This package requires HTMLSax3 package
  21   */
  22  require_once (XML_HTMLSAX3 . 'HTMLSax3.php');
  23  
  24   
  25  /**
  26   *
  27   * SafeHTML Parser
  28   *
  29   * This parser strips down all potentially dangerous content within HTML:
  30   * <ul>
  31   * <li>opening tag without its closing tag</li>
  32   * <li>closing tag without its opening tag</li>
  33   * <li>any of these tags: "base", "basefont", "head", "html", "body", "applet", 
  34   * "object", "iframe", "frame", "frameset", "script", "layer", "ilayer", "embed", 
  35   * "bgsound", "link", "meta", "style", "title", "blink", "xml" etc.</li>
  36   * <li>any of these attributes: on*, data*, dynsrc</li>
  37   * <li>javascript:/vbscript:/about: etc. protocols</li>
  38   * <li>expression/behavior etc. in styles</li>
  39   * <li>any other active content</li>
  40   * </ul>
  41   * It also tries to convert code to XHTML valid, but htmltidy is far better 
  42   * solution for this task.
  43   *
  44   * <b>Example:</b>
  45   * <pre>
  46   * $parser =& new SafeHTML();
  47   * $result = $parser->parse($doc);
  48   * </pre>
  49   *
  50   * @category   HTML
  51   * @package    SafeHTML
  52   * @author     Roman Ivanov <thingol@mail.ru>
  53   * @copyright  1997-2005 Roman Ivanov
  54   * @license    http://www.debian.org/misc/bsd.license  BSD License (3 Clause)
  55   * @version    Release: @package_version@
  56   * @link       http://pear.php.net/package/SafeHTML
  57   */
  58  class SafeHTML 
  59  {
  60      /**
  61       * Storage for resulting HTML output
  62       *
  63       * @var string
  64       * @access private
  65       */
  66      var $_xhtml = '';
  67      
  68      /**
  69       * Array of counters for each tag
  70       *
  71       * @var array
  72       * @access private
  73       */
  74      var $_counter = array();
  75      
  76      /**
  77       * Stack of unclosed tags
  78       *
  79       * @var array
  80       * @access private
  81       */
  82      var $_stack = array();
  83      
  84      /**
  85       * Array of counters for tags that must be deleted with all content
  86       *
  87       * @var array
  88       * @access private
  89       */
  90      var $_dcCounter = array();
  91      
  92      /**
  93       * Stack of unclosed tags that must be deleted with all content
  94       *
  95       * @var array
  96       * @access private
  97       */
  98      var $_dcStack = array();
  99      
 100      /**
 101       * Stores level of list (ol/ul) nesting
 102       *
 103       * @var int
 104       * @access private
 105       */
 106      var $_listScope = 0; 
 107      
 108      /**
 109       * Stack of unclosed list tags 
 110       *
 111       * @var array
 112       * @access private
 113       */
 114      var $_liStack = array();
 115  
 116      /**
 117       * Array of prepared regular expressions for protocols (schemas) matching
 118       *
 119       * @var array
 120       * @access private
 121       */
 122      var $_protoRegexps = array();
 123      
 124      /**
 125       * Array of prepared regular expressions for CSS matching
 126       *
 127       * @var array
 128       * @access private
 129       */
 130      var $_cssRegexps = array();
 131  
 132      /**
 133       * List of single tags ("<tag />")
 134       *
 135       * @var array
 136       * @access public
 137       */
 138      var $singleTags = array('area', 'br', 'img', 'input', 'hr', 'wbr', );
 139  
 140      /**
 141       * List of dangerous tags (such tags will be deleted)
 142       *
 143       * @var array
 144       * @access public
 145       */
 146      var $deleteTags = array(
 147          'applet', 'base',   'basefont', 'bgsound', 'blink',  'body', 
 148          'embed',  'frame',  'frameset', 'head',    'html',   'ilayer', 
 149          'iframe', 'layer',  'link',     'meta',    'object', 'style', 
 150          'title',  'script', 
 151          );
 152  
 153      /**
 154       * List of dangerous tags (such tags will be deleted, and all content 
 155       * inside this tags will be also removed)
 156       *
 157       * @var array
 158       * @access public
 159       */
 160      var $deleteTagsContent = array('script', 'style', 'title', 'xml', );
 161  
 162      /**
 163       * Type of protocols filtering ('white' or 'black')
 164       *
 165       * @var string
 166       * @access public
 167       */
 168      var $protocolFiltering = 'white';
 169  
 170      /**
 171       * List of "dangerous" protocols (used for blacklist-filtering)
 172       *
 173       * @var array
 174       * @access public
 175       */
 176      var $blackProtocols = array(
 177          'about',   'chrome',     'data',       'disk',     'hcp',     
 178          'help',    'javascript', 'livescript', 'lynxcgi',  'lynxexec', 
 179          'ms-help', 'ms-its',     'mhtml',      'mocha',    'opera',   
 180          'res',     'resource',   'shell',      'vbscript', 'view-source', 
 181          'vnd.ms.radio',          'wysiwyg', 
 182          );
 183  
 184      /**
 185       * List of "safe" protocols (used for whitelist-filtering)
 186       *
 187       * @var array
 188       * @access public
 189       */
 190      var $whiteProtocols = array(
 191          'ed2k',   'file', 'ftp',  'gopher', 'http',  'https', 
 192          'irc',    'mailto', 'news', 'nntp', 'telnet', 'webcal', 
 193          'xmpp', 
 194          );
 195  
 196      /**
 197       * List of attributes that can contain protocols
 198       *
 199       * @var array
 200       * @access public
 201       */
 202      var $protocolAttributes = array(
 203          'action', 'background', 'codebase', 'dynsrc', 'href', 'lowsrc', 'src', 
 204          );
 205  
 206      /**
 207       * List of dangerous CSS keywords
 208       *
 209       * Whole style="" attribute will be removed, if parser will find one of 
 210       * these keywords
 211       *
 212       * @var array
 213       * @access public
 214       */
 215      var $cssKeywords = array(
 216          'absolute', 'behavior',       'behaviour',   'content', 'expression', 
 217          'fixed',    'include-source', 'moz-binding',
 218          );
 219  
 220      /**
 221       * List of tags that can have no "closing tag"
 222       *
 223       * @var array
 224       * @access public
 225       * @deprecated XHTML does not allow such tags
 226       */
 227      var $noClose = array();
 228  
 229      /**
 230       * List of block-level tags that terminates paragraph
 231       *
 232       * Paragraph will be closed when this tags opened
 233       *
 234       * @var array
 235       * @access public
 236       */
 237      var $closeParagraph = array(
 238          'address', 'blockquote', 'center', 'dd',      'dir',       'div', 
 239          'dl',      'dt',         'h1',     'h2',      'h3',        'h4', 
 240          'h5',      'h6',         'hr',     'isindex', 'listing',   'marquee', 
 241          'menu',    'multicol',   'ol',     'p',       'plaintext', 'pre', 
 242          'table',   'ul',         'xmp', 
 243          );
 244  
 245      /**
 246       * List of table tags, all table tags outside a table will be removed
 247       *
 248       * @var array
 249       * @access public
 250       */
 251      var $tableTags = array(
 252          'caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th', 
 253          'thead',   'tr', 
 254          );
 255  
 256      /**
 257       * List of list tags
 258       *
 259       * @var array
 260       * @access public
 261       */
 262      var $listTags = array('dir', 'menu', 'ol', 'ul', );
 263  
 264      /**
 265       * List of dangerous attributes
 266       *
 267       * @var array
 268       * @access public
 269       */
 270      var $attributes = array('dynsrc', 'id', 'name', );
 271  
 272      /**
 273       * Constructs class
 274       *
 275       * @access public
 276       */
 277      function SafeHTML() 
 278      {
 279          //making regular expressions based on Proto & CSS arrays
 280          foreach ($this->blackProtocols as $proto) {
 281              $preg = "/[\s\x01-\x1F]*";
 282              for ($i=0; $i<strlen($proto); $i++) {
 283                  $preg .= $proto{$i} . "[\s\x01-\x1F]*";
 284              }
 285              $preg .= ":/i";
 286              $this->_protoRegexps[] = $preg;
 287          }
 288  
 289          foreach ($this->cssKeywords as $css) {
 290              $this->_cssRegexps[] = '/' . $css . '/i';
 291          }
 292          return true;
 293      }
 294  
 295      /**
 296       * Handles the writing of attributes - called from $this->_openHandler()
 297       *
 298       * @param array $attrs array of attributes $name => $value
 299       * @return boolean
 300       * @access private
 301       */
 302      function _writeAttrs ($attrs) 
 303      {
 304          if (is_array($attrs)) {
 305              foreach ($attrs as $name => $value) {
 306  
 307                  $name = strtolower($name);
 308  
 309                  if (strpos($name, 'on') === 0) {
 310                      continue;
 311                  }
 312                  if (strpos($name, 'data') === 0) {
 313                      continue;
 314                  }
 315                  if (in_array($name, $this->attributes)) {
 316                      continue;
 317                  }
 318                  if (!preg_match("/^[a-z0-9]+$/i", $name)) {
 319                      continue;
 320                  }
 321  
 322                  if (($value === TRUE) || (is_null($value))) {
 323                      $value = $name;
 324                  }
 325  
 326                  if ($name == 'style') {
 327                     
 328                     // removes insignificant backslahes
 329                     $value = str_replace("\\", '', $value);
 330  
 331                     // removes CSS comments
 332                     while (1)
 333                     {
 334                       $_value = preg_replace("!/\*.*?\*/!s", '', $value);
 335                       if ($_value == $value) break;
 336                       $value = $_value;
 337                     }
 338                     
 339                     // replace all & to &amp;
 340                     $value = str_replace('&amp;', '&', $value);
 341                     $value = str_replace('&', '&amp;', $value);
 342  
 343                     foreach ($this->_cssRegexps as $css) {
 344                         if (preg_match($css, $value)) { 
 345                             continue 2;
 346                         }
 347                     }
 348                     foreach ($this->_protoRegexps as $proto) {
 349                         if (preg_match($proto, $value)) {
 350                             continue 2;
 351                         }
 352                     }
 353                  }
 354  
 355                  $tempval = preg_replace('/&#(\d+);?/me', "chr('\\1')", $value); //"'
 356                  $tempval = preg_replace('/&#x([0-9a-f]+);?/mei', "chr(hexdec('\\1'))", $tempval);
 357  
 358                  if ((in_array($name, $this->protocolAttributes)) && 
 359                      (strpos($tempval, ':') !== false)) 
 360                  {
 361                      if ($this->protocolFiltering == 'black') {
 362                          foreach ($this->_protoRegexps as $proto) {
 363                              if (preg_match($proto, $tempval)) continue 2;
 364                          }
 365                      } else {
 366                          $_tempval = explode(':', $tempval);
 367                          $proto = $_tempval[0];
 368                          if (!in_array($proto, $this->whiteProtocols)) {
 369                              continue;
 370                          }
 371                      }
 372                  }
 373  
 374                  $value = str_replace("\"", "&quot;", $value);
 375                  $this->_xhtml .= ' ' . $name . '="' . $value . '"';
 376              }
 377          }
 378          return true;
 379      }
 380  
 381      /**
 382       * Opening tag handler - called from HTMLSax
 383       *
 384       * @param object $parser HTML Parser
 385       * @param string $name   tag name
 386       * @param array  $attrs  tag attributes
 387       * @return boolean
 388       * @access private
 389       */
 390      function _openHandler(&$parser, $name, $attrs) 
 391      {
 392          $name = strtolower($name);
 393  
 394          if (in_array($name, $this->deleteTagsContent)) {
 395              array_push($this->_dcStack, $name);
 396              $this->_dcCounter[$name] = isset($this->_dcCounter[$name]) ? $this->_dcCounter[$name]+1 : 1;
 397          }
 398          if (count($this->_dcStack) != 0) {
 399              return true;
 400          }
 401  
 402          if (in_array($name, $this->deleteTags)) {
 403              return true;
 404          }
 405          
 406          if (!preg_match("/^[a-z0-9]+$/i", $name)) {
 407              if (preg_match("!(?:\@|://)!i", $name)) {
 408                  $this->_xhtml .= '&lt;' . $name . '&gt;';
 409              }
 410              return true;
 411          }
 412  
 413          if (in_array($name, $this->singleTags)) {
 414              $this->_xhtml .= '<' . $name;
 415              $this->_writeAttrs($attrs);
 416              $this->_xhtml .= ' />';
 417              return true;
 418          }
 419  
 420          // TABLES: cannot open table elements when we are not inside table
 421          if ((isset($this->_counter['table'])) && ($this->_counter['table'] <= 0) 
 422              && (in_array($name, $this->tableTags))) 
 423          {
 424              return true;
 425          }
 426  
 427          // PARAGRAPHS: close paragraph when closeParagraph tags opening
 428          if ((in_array($name, $this->closeParagraph)) && (in_array('p', $this->_stack))) {
 429              $this->_closeHandler($parser, 'p');
 430          }
 431  
 432          // LISTS: we should close <li> if <li> of the same level opening
 433          if ($name == 'li' && count($this->_liStack) && 
 434              $this->_listScope == $this->_liStack[count($this->_liStack)-1]) 
 435          {
 436              $this->_closeHandler($parser, 'li');
 437          }
 438  
 439          // LISTS: we want to know on what nesting level of lists we are
 440          if (in_array($name, $this->listTags)) {
 441              $this->_listScope++;
 442          }
 443          if ($name == 'li') {
 444              array_push($this->_liStack, $this->_listScope);
 445          }
 446              
 447          $this->_xhtml .= '<' . $name;
 448          $this->_writeAttrs($attrs);
 449          $this->_xhtml .= '>';
 450          array_push($this->_stack,$name);
 451          $this->_counter[$name] = isset($this->_counter[$name]) ? $this->_counter[$name]+1 : 1;
 452          return true;
 453      }
 454  
 455      /**
 456       * Closing tag handler - called from HTMLSax
 457       *
 458       * @param object $parsers HTML parser
 459       * @param string $name    tag name
 460       * @return boolean
 461       * @access private
 462       */
 463      function _closeHandler(&$parser, $name) 
 464      {
 465  
 466          $name = strtolower($name);
 467  
 468          if (isset($this->_dcCounter[$name]) && ($this->_dcCounter[$name] > 0) && 
 469              (in_array($name, $this->deleteTagsContent))) 
 470          {
 471             while ($name != ($tag = array_pop($this->_dcStack))) {
 472              $this->_dcCounter[$tag]--;
 473             }
 474  
 475             $this->_dcCounter[$name]--;
 476          }
 477  
 478          if (count($this->_dcStack) != 0) {
 479              return true;
 480          }
 481  
 482          if ((isset($this->_counter[$name])) && ($this->_counter[$name] > 0)) {
 483             while ($name != ($tag = array_pop($this->_stack))) {
 484                 $this->_closeTag($tag);
 485             }
 486  
 487             $this->_closeTag($name);
 488          }
 489          return true;
 490      }
 491  
 492      /**
 493       * Closes tag 
 494       *
 495       * @param string $tag tag name
 496       * @return boolean
 497       * @access private
 498       */
 499      function _closeTag($tag) 
 500      {
 501          if (!in_array($tag, $this->noClose)) {
 502              $this->_xhtml .= '</' . $tag . '>';
 503          }
 504  
 505          $this->_counter[$tag]--;
 506  
 507          if (in_array($tag, $this->listTags)) {
 508              $this->_listScope--;
 509          }
 510  
 511          if ($tag == 'li') {
 512              array_pop($this->_liStack);
 513          }
 514          return true;
 515      }
 516  
 517      /**
 518       * Character data handler - called from HTMLSax
 519       *
 520       * @param object $parser HTML parser
 521       * @param string $data   textual data
 522       * @return boolean
 523       * @access private
 524       */
 525      function _dataHandler(&$parser, $data) 
 526      {
 527          if (count($this->_dcStack) == 0) {
 528              $this->_xhtml .= $data;
 529          }
 530          return true;
 531      }
 532  
 533      /**
 534       * Escape handler - called from HTMLSax
 535       *
 536       * @param object $parser HTML parser
 537       * @param string $data   comments or other type of data
 538       * @return boolean
 539       * @access private
 540       */
 541      function _escapeHandler(&$parser, $data) 
 542      {
 543          return true;
 544      }
 545  
 546      /**
 547       * Returns the XHTML document
 548       *
 549       * @return string Processed (X)HTML document
 550       * @access public
 551       */
 552      function getXHTML () 
 553      {
 554          while ($tag = array_pop($this->_stack)) {
 555              $this->_closeTag($tag);
 556          }
 557          
 558          return $this->_xhtml;
 559      }
 560  
 561      /**
 562       * Clears current document data
 563       *
 564       * @return boolean
 565       * @access public
 566       */
 567      function clear() 
 568      {
 569          $this->_xhtml = '';
 570          return true;
 571      }
 572  
 573      /**
 574       * Main parsing fuction
 575       *
 576       * @param string $doc HTML document for processing
 577       * @return string Processed (X)HTML document
 578       * @access public
 579       */
 580      function parse($doc) 
 581      {
 582  
 583         // Save all '<' symbols
 584         $doc = preg_replace("/<(?=[^a-zA-Z\/\!\?\%])/", '&lt;', $doc);
 585  
 586         // Web documents shouldn't contains \x00 symbol
 587         $doc = str_replace("\x00", '', $doc);
 588  
 589         // Opera6 bug workaround
 590         $doc = str_replace("\xC0\xBC", '&lt;', $doc);
 591         
 592         // UTF-7 encoding XSS workaround
 593         $doc = str_replace("+ADw-", '&lt;', $doc);
 594  
 595         // Instantiate the parser
 596         $parser=& new XML_HTMLSax3();
 597  
 598         // Set up the parser
 599         $parser->set_object($this);
 600  
 601         $parser->set_element_handler('_openHandler','_closeHandler');
 602         $parser->set_data_handler('_dataHandler');
 603         $parser->set_escape_handler('_escapeHandler');
 604  
 605         $parser->parse($doc);
 606  
 607         return $this->getXHTML();
 608  
 609      }
 610  
 611  }
 612  
 613  /*
 614   * Local variables:
 615   * tab-width: 4
 616   * c-basic-offset: 4
 617   * c-hanging-comment-ender-p: nil
 618   * End:
 619   */
 620  
 621  ?>


Généré le : Thu Feb 22 22:27:47 2007 par Balluche grâce à PHPXref 0.7