eGroupWare 1.2.106-2 : /felamimail/inc/class.htmlfilter.inc.php source

[Sommaire] [Imprimer]
   1  <?php
   2  /**
   3   * htmlfilter.inc
   4   * ---------------
   5   * This set of functions allows you to filter html in order to remove
   6   * any malicious tags from it. Useful in cases when you need to filter
   7   * user input for any cross-site-scripting attempts.
   8   *
   9   * Copyright (c) 2002 by Duke University
  10   *
  11   * This program is free software; you can redistribute it and/or
  12   * modify it under the terms of the GNU General Public License
  13   * as published by the Free Software Foundation; either version 2
  14   * of the License, or (at your option) any later version.
  15   *
  16   * This program is distributed in the hope that it will be useful,
  17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19   * GNU General Public License for more details.
  20   * 
  21   * You should have received a copy of the GNU General Public License
  22   * along with this program; if not, write to the Free Software
  23   * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  
  24   * 02111-1307, USA.
  25   *
  26   * @Author  Konstantin Riabitsev <icon@linux.duke.edu>
  27   * @Version 1.0.5 (Oct-16-2002)
  28   */
  29  
  30  class htmlfilter
  31  {
  32  
  /**
   * Debug tracer used throughout the filter code.
   *
   * Nothing is printed unless a global variable called "debug" compares
   * loosely equal to true; set it before calling sanitize(). See
   * http://www.mricon.com/html/phpfilter.html for the original write-up.
   *
   * Note: even with debugging switched off every call still costs a
   * function call. To strip all debugging calls, run:
   *
   * fgrep -v 'spew("' htmlfilter.inc > htmlfilter.inc.new
   *
   * htmlfilter.inc.new will then contain no debugging calls.
   *
   * @param  $message  A string with the message to output.
   * @return           void.
   */
  function spew($message){
      global $debug;
      // Guard clause: stay silent unless debugging was requested.
      if ($debug != true){
          return;
      }
      // The message is followed by an HTML line break so traces stay readable
      // in a browser.
      echo $message . '<br>';
  }
  58  
  /**
   * Builds the final textual representation of a tag from its name, its
   * attributes and its type. Called by sanitize internally.
   *
   * Tag types:
   *   1 - opening tag          <name attr=value>
   *   2 - closing tag          </name>   (attributes are ignored)
   *   3 - XHTML self-closing   <name attr=value />
   *
   * Attribute values are emitted verbatim as "name=value", so they must
   * already carry their own quotes (getnxtag() stores them that way).
   *
   * @param  $tagname  the name of the tag.
   * @param  $attary   the array of attributes and their values; anything
   *                   that is not a non-empty array means "no attributes".
   * @param  $tagtype  The type of the tag (1, 2 or 3, see above).
   * @return           a string with the final tag representation.
   */
  function tagprint($tagname, $attary, $tagtype){
      $me = 'tagprint';
      if ($tagtype == 2){
          $fulltag = '</' . $tagname . '>';
      } else {
          $fulltag = '<' . $tagname;
          if (is_array($attary) && sizeof($attary)){
              $atts = array();
              // foreach always starts at the first element. The previous
              // each() loop depended on the array's internal pointer and
              // each() no longer exists as of PHP 8.
              foreach ($attary as $attname => $attvalue){
                  $atts[] = "$attname=$attvalue";
              }
              $fulltag .= ' ' . join(' ', $atts);
          }
          if ($tagtype == 3){
              $fulltag .= ' /';
          }
          $fulltag .= '>';
      }
      $this->spew("$me: $fulltag\n");
      return $fulltag;
  }
  90  
  /**
   * Tiny callback meant for array_walk: lowercases the value it is handed,
   * in place.
   *
   * @param  $val a value passed by-ref; overwritten with its lowercase form.
   * @return      void since it modifies a by-ref value.
   */
  function casenormalize(&$val){
      $lowered = strtolower($val);
      $val = $lowered;
  }
 101  
 /**
  * Skips any whitespace starting at the given position within a string
  * and returns the position of the next non-whitespace character.
  *
  * Whitespace is whatever PCRE's \s matches. The match is anchored at
  * $offset with \G, so the remainder of $body is not copied on each call.
  *
  * @param  $body   the string
  * @param  $offset the offset within the string where we should start
  *                 looking for the next non-whitespace character.
  * @return         the location within the $body where the next
  *                 non-whitespace char is located; $offset itself when
  *                 there is nothing to skip (including when $offset is at
  *                 or past the end of $body).
  */
 function skipspace($body, $offset){
     $me = 'skipspace';
     $count = 0;
     $matches = array();
     // Only ask PCRE when there is something left to look at; an offset at
     // or beyond the end of the body has no whitespace to skip.
     if ($offset < strlen($body)
         && preg_match('/\G\s+/', $body, $matches, 0, $offset)){
         $count = strlen($matches[0]);
     }
     $this->spew("$me: skipped $count chars\n");
     return $offset + $count;
 }
 122  
 /**
  * Finds the next occurrence of a character/string within a string. It is
  * essentially strpos() that reports "not found" as the end of the body
  * instead of false.
  *
  * @param  $body   The string to look for needle in.
  * @param  $offset Start looking from this position.
  * @param  $needle The character/string to look for.
  * @return         location of the next occurrence of the needle, or
  *                 strlen($body) if needle wasn't found.
  */
 function findnxstr($body, $offset, $needle){
     $me = 'findnxstr';
     $found = strpos($body, $needle, $offset);
     if ($found !== false){
         $where = $found;
     } else {
         // Not found: callers treat the end of the body as the position.
         $where = strlen($body);
         $this->spew("$me: end of body reached\n");
     }
     $this->spew("$me: '$needle' found at pos $where\n");
     return $where;
 }
 144  
 /**
  * Takes a PCRE-style regexp and tries to match it within the string,
  * starting at $offset.
  *
  * The expression is embedded as '%^(.*?)(' . $reg . ')%s', so $reg must
  * not contain an unescaped '%'. It is matched against the part of $body
  * that starts at $offset, and '.' also matches newlines.
  *
  * @param  $body   The string to look for needle in.
  * @param  $offset Start looking from here.
  * @param  $reg    A PCRE-style regex to match.
  * @return         Returns a false if no matches found, or an array
  *                 with the following members:
  *                 - integer with the location of the match within $body
  *                 - string with whatever content between offset and the match
  *                 - string with whatever it is we matched
  */
 function findnxreg($body, $offset, $reg){
     $me = 'findnxreg';
     $matches = array();
     $preg_rule = '%^(.*?)(' . $reg . ')%s';
     // substr() may return false on old PHP versions when $offset is at the
     // end of the body; normalise that to an empty subject.
     $found = preg_match($preg_rule, (string) substr($body, $offset), $matches);
     // Decide on preg_match()'s return value rather than on the truthiness
     // of $matches[0]: that element is undefined when nothing matched, and a
     // legitimate match of the text "0" is falsy. A completely empty match
     // is still reported as "not found", as before.
     if ($found !== 1 || $matches[0] === ''){
         $this->spew("$me: No matches found.\n");
         return false;
     }
     $pos = $offset + strlen($matches[1]);
     $this->spew("$me: '$reg' found at pos $pos matching '" . $matches[2] . "'\n");
     return array($pos, $matches[1], $matches[2]);
 }
 175  
 /**
  * Locates and parses the next tag in $body at or after $offset.
  *
  * Tag types:
  *   1 - opening tag, e.g. <a href="blah">
  *   2 - closing tag, e.g. </a>
  *   3 - XHTML-style content-less tag, e.g. <img src="blah"/>
  *
  * Attribute forms recognised (optional whitespace around "="):
  *   name='CDATA'  kept as is
  *   name="CDATA"  kept as is
  *   name=CDATA    any '"' becomes &quot; and the value is wrapped in
  *                 double quotes
  *   name          stored as name="yes"
  * Tag and attribute names are lowercased; attribute values are stored
  * together with their quotes.
  *
  * Comments, SGML declarations and malformed tags are reported as invalid
  * (first three members false) so they get stripped.
  *
  * @param  $body   String where to look for the next tag.
  * @param  $offset Start looking from here.
  * @return         false if no more tags exist in the body, or
  *                 an array with the following members:
  *                 - string with the name of the tag
  *                 - array with attributes and their values (false when
  *                   the tag has no attributes)
  *                 - integer with tag type (1, 2, or 3)
  *                 - integer where the tag starts (starting "<")
  *                 - integer where the tag ends (ending ">"), or
  *                   strlen($body) when the body ended first
  *                 first three members will be false, if the tag is invalid.
  */
 function getnxtag($body, $offset){
     $me = 'getnxtag';
     $bodylen = strlen($body);
     if ($offset > $bodylen){
         $this->spew("$me: Past the end of body\n");
         return false;
     }
     $lt = $this->findnxstr($body, $offset, '<');
     if ($lt == $bodylen){
         $this->spew("$me: No more tags found!\n");
         return false;
     }
     /**
      * We are here:
      * blah blah <tag attribute="value">
      * \---------^
      */
     $this->spew("$me: Found '<' at pos $lt\n");
     $pos = $this->skipspace($body, $lt + 1);
     if ($pos >= $bodylen){
         $this->spew("$me: End of body reached.\n");
         return array(false, false, false, $lt, $bodylen);
     }
     /**
      * Work out the tag type from the first character after "<".
      */
     $tagtype = false;
     switch (substr($body, $pos, 1)){
     case '/':
         $this->spew("$me: This is a closing tag (type 2)\n");
         $tagtype = 2;
         $pos++;
         break;
     case '!':
         /**
          * A comment or an SGML declaration. Both are reported as
          * invalid tags spanning up to their terminator.
          */
         if (substr($body, $pos+1, 2) == '--'){
             $this->spew("$me: A comment found. Stripping.\n");
             $gt = strpos($body, '-->', $pos);
             if ($gt === false){
                 $gt = $bodylen;
             } else {
                 // Point at the ">" of "-->".
                 $gt += 2;
             }
             return array(false, false, false, $lt, $gt);
         }
         $this->spew("$me: An SGML declaration found. Stripping.\n");
         $gt = $this->findnxstr($body, $pos, '>');
         return array(false, false, false, $lt, $gt);
     default:
         /**
          * Assume tagtype 1 for now. If it's type 3, we'll switch values
          * later.
          */
         $tagtype = 1;
         break;
     }

     /**
      * Look for next [\W-_], which will indicate the end of the tag name.
      */
     $regary = $this->findnxreg($body, $pos, '[^\w\-_]');
     if ($regary === false){
         $this->spew("$me: End of body reached while analyzing tag name\n");
         return array(false, false, false, $lt, $bodylen);
     }
     list($pos, $tagname, $match) = $regary;
     $tagname = strtolower($tagname);

     /**
      * $match can be either of these:
      * '>'  indicating the end of the tag entirely.
      * '\s' indicating the end of the tag name.
      * '/'  indicating that this is type-3 xhtml tag.
      *
      * Whatever else we find there indicates an invalid tag.
      */
     switch ($match){
     case '/':
         /**
          * This is an xhtml-style tag with a closing / at the
          * end, like so: <img src="blah"/>. Check if it's followed
          * by the closing bracket. If not, then this tag is invalid
          */
         if (substr($body, $pos, 2) == '/>'){
             $this->spew("$me: XHTML-style tag found.\n");
             $pos++;
             $this->spew("$me: Setting tagtype to 3\n");
             $tagtype = 3;
         } else {
             $this->spew("$me: Found invalid character '/'.\n");
             $gt = $this->findnxstr($body, $pos, '>');
             $this->spew("$me: Tag is invalid. Returning.\n");
             return array(false, false, false, $lt, $gt);
         }
         // Deliberate fall-through: $pos now sits on the closing ">".
     case '>':
         $this->spew("$me: End of tag found at $pos\n");
         $this->spew("$me: Tagname is '$tagname'\n");
         $this->spew("$me: This tag has no attributes\n");
         return array($tagname, false, $tagtype, $lt, $pos);
     default:
         /**
          * Check if it's whitespace
          */
         if (preg_match('/\s/', $match)){
             $this->spew("$me: Tagname is '$tagname'\n");
         } else {
             /**
              * This is an invalid tag! Look for the next closing ">".
              * The search starts at $pos, i.e. inside this tag. Searching
              * from $offset could find a ">" located before the tag's
              * "<" and report an end position smaller than the start.
              */
             $this->spew("$me: Invalid characters found in tag name: $match\n");
             $gt = $this->findnxstr($body, $pos, '>');
             return array(false, false, false, $lt, $gt);
         }
     }

     /**
      * At this point we're here:
      * <tagname  attribute='blah'>
      * \-------^
      *
      * At this point we loop in order to find all attributes.
      */
     $attname = '';
     $attary = array();

     while ($pos <= $bodylen){
         $pos = $this->skipspace($body, $pos);
         if ($pos == $bodylen){
             /**
              * Non-closed tag.
              */
             $this->spew("$me: End of body reached before end of tag. Discarding.\n");
             return array(false, false, false, $lt, $pos);
         }
         /**
          * See if we arrived at a ">" or "/>", which means that we reached
          * the end of the tag.
          */
         $matches = array();
         if (preg_match('%^(\s*)(>|/>)%s', substr($body, $pos), $matches)){
             /**
              * Yep. So we did.
              */
             $this->spew("$me: Arrived at the end of the tag.\n");
             $pos += strlen($matches[1]);
             if ($matches[2] == '/>'){
                 $tagtype = 3;
                 // Step over the "/" so $pos points at ">".
                 $pos++;
             }
             return array($tagname, $attary, $tagtype, $lt, $pos);
         }

         /**
          * Read the attribute name: everything up to the first character
          * that is not a word character, "-" or "_".
          */
         $regary = $this->findnxreg($body, $pos, '[^\w\-_]');
         if ($regary === false){
             /**
              * Looks like body ended before the end of tag.
              */
             $this->spew("$me: End of body found before end of tag.\n");
             $this->spew("$me: Invalid, returning\n");
             return array(false, false, false, $lt, $bodylen);
         }
         list($pos, $attname, $match) = $regary;
         $attname = strtolower($attname);
         $this->spew("$me: Attribute '$attname' found\n");
         /**
          * We arrived at the end of attribute name. Several things possible
          * here:
          * '>'  means the end of the tag and this is a value-less attribute
          * '/'  if followed by '>' means the same thing as above
          * '\s' means a lot of things -- look what it's followed by.
          *      anything else means the attribute is invalid.
          */
         switch ($match){
         case '/':
             /**
              * This is an xhtml-style tag with a closing / at the
              * end, like so: <img src="blah"/>. Check if it's followed
              * by the closing bracket. If not, then this tag is invalid
              */
             if (substr($body, $pos, 2) == '/>'){
                 $this->spew("$me: This is an xhtml-style tag.\n");
                 $pos++;
                 $this->spew("$me: Setting tagtype to 3\n");
                 $tagtype = 3;
             } else {
                 $this->spew("$me: Found invalid character '/'.\n");
                 $gt = $this->findnxstr($body, $pos, '>');
                 $this->spew("$me: Tag is invalid. Returning.\n");
                 return array(false, false, false, $lt, $gt);
             }
             // Deliberate fall-through: $pos now sits on the closing ">".
         case '>':
             $this->spew("$me: found type 4 attribute.\n");
             $this->spew("$me: Additionally, end of tag found at $pos\n");
             $this->spew("$me: Attname is '$attname'\n");
             $this->spew("$me: Setting attvalue to 'yes'\n");
             $attary[$attname] = '"yes"';
             return array($tagname, $attary, $tagtype, $lt, $pos);
         default:
             /**
              * Skip whitespace and see what we arrive at.
              */
             $pos = $this->skipspace($body, $pos);
             $char = substr($body, $pos, 1);
             /**
              * Two things are valid here:
              * '=' means this attribute has a value.
              * \w, '/' or '>' means this was a value-less attribute.
              * Anything else is illegal and invalidates the tag. End of
              * tag is handled by the checks at the beginning of the loop.
              */
             if ($char == '='){
                 $this->spew("$me: Attribute type 1, 2, or 3 found.\n");
                 $pos++;
                 $pos = $this->skipspace($body, $pos);
                 /**
                  * Here are 3 possibilities:
                  * "'"  single-quoted value
                  * '"'  double-quoted value
                  * everything else is an unquoted value
                  */
                 $quot = substr($body, $pos, 1);
                 if ($quot == '\''){
                     $this->spew("$me: In fact, this is attribute type 1\n");
                     $this->spew("$me: looking for closing quote\n");
                     $regary = $this->findnxreg($body, $pos+1, '\'');
                     if ($regary === false){
                         $this->spew("$me: end of body reached before end of val\n");
                         $this->spew("$me: Returning\n");
                         return array(false, false, false, $lt, $bodylen);
                     }
                     list($pos, $attval, $match) = $regary;
                     $this->spew("$me: Attvalue is '$attval'\n");
                     // Step over the closing quote.
                     $pos++;
                     $attary[$attname] = '\'' . $attval . '\'';
                 } else if ($quot == '"'){
                     $this->spew("$me: In fact, this is attribute type 2\n");
                     $this->spew("$me: looking for closing quote\n");
                     $regary = $this->findnxreg($body, $pos+1, '\"');
                     if ($regary === false){
                         $this->spew("$me: end of body reached before end of val\n");
                         $this->spew("$me: Returning\n");
                         return array(false, false, false, $lt, $bodylen);
                     }
                     list($pos, $attval, $match) = $regary;
                     $this->spew("$me: Attvalue is \"$attval\"\n");
                     // Step over the closing quote.
                     $pos++;
                     $attary[$attname] = '"' . $attval . '"';
                 } else {
                     $this->spew("$me: This looks like attribute type 3\n");
                     /**
                      * These are hateful. Look for \s, or >.
                      */
                     $this->spew("$me: Looking for end of attval\n");
                     $regary = $this->findnxreg($body, $pos, '[\s>]');
                     if ($regary === false){
                         $this->spew("$me: end of body reached before end of val\n");
                         $this->spew("$me: Returning\n");
                         return array(false, false, false, $lt, $bodylen);
                     }
                     list($pos, $attval, $match) = $regary;
                     /**
                      * If it's ">" it will be caught at the top.
                      */
                     $this->spew("$me: translating '\"' into &quot;\n");
                     $attval = str_replace('"', '&quot;', $attval);
                     $this->spew("$me: wrapping in quotes\n");
                     $attary[$attname] = '"' . $attval . '"';
                 }
             } else if (preg_match('|[\w/>]|', $char)) {
                 /**
                  * That was a value-less attribute.
                  */
                 $this->spew("$me: attribute type 4 found.\n");
                 $this->spew("$me: Setting value to 'yes'\n");
                 $attary[$attname] = '"yes"';
             } else {
                 /**
                  * An illegal character. Find next '>' and return.
                  */
                 $this->spew("$me: illegal character '$char' found.\n");
                 $this->spew("$me: returning\n");
                 $gt = $this->findnxstr($body, $pos, '>');
                 return array(false, false, false, $lt, $gt);
             }
         }
     }
     /**
      * The fact that we got here indicates that the tag end was never
      * found. Return invalid tag indication so it gets stripped.
      */
     $this->spew("$me: No tag end found\n");
     return array(false, false, false, $lt, $bodylen);
 }
 518  
 /**
  * Translates entity-encoded text in an attribute value into plain
  * characters so that checks can be run against the decoded value.
  *
  * Named entities come from PHP's HTML_ENTITIES translation table and are
  * matched case-insensitively with an optional trailing ";". Numbered
  * entities (decimal and hexadecimal) are translated for the codes the
  * loop below covers. Both kinds of quote are deliberately left encoded:
  * &quot; is removed from the table, ENT_COMPAT keeps the single quote out
  * of it, and codes 34 and 39 are skipped.
  *
  * @param  $attvalue A string to run entity check against.
  * @return           Translated value.
  */
 function deent($attvalue){
     $me = 'deent';
     /**
      * See if we have to run the checks first. All entities must start
      * with "&".
      */
     if (strpos($attvalue, '&') === false){
         return $attvalue;
     }
     /**
      * Check named entities first.
      */
     $this->spew("$me: translating named entities\n");
     // ENT_COMPAT is passed explicitly: the default flags became ENT_QUOTES
     // in PHP 8.1, which would add the single quote to the table.
     $trans = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT));
     /**
      * Leave &quot; in, as it can mess us up.
      */
     unset($trans['&quot;']);
     foreach ($trans as $ent => $val){
         // The trailing "*" applies to the entity's ";" and makes it
         // optional (and repeatable).
         $attvalue = preg_replace('/' . preg_quote($ent, '/') . '*/si', $val, $attvalue);
     }
     /**
      * Now translate numbered entities if needed.
      */
     if (strpos($attvalue, '#') !== false){
         $this->spew("$me: translating numbered entities\n");
         $omit = array(34, 39);
         // Highest code first, so "&#25" is not consumed out of "&#255;".
         // NOTE(review): the loop starts at 256 and chr(256) wraps around to
         // chr(0), so "&#256;" turns into a NUL byte -- confirm whether the
         // intended upper bound is 255.
         for ($asc = 256; $asc >= 0; $asc--){
             if (in_array($asc, $omit)){
                 continue;
             }
             $chr = chr($asc);
             $decrule = '/\&#0*' . $asc . ';*/si';
             $hexrule = '/\&#x0*' . dechex($asc) . ';*/si';
             $attvalue = preg_replace($decrule, $chr, $attvalue);
             $attvalue = preg_replace($hexrule, $chr, $attvalue);
         }
     }
     $this->spew("$me: translated into: $attvalue\n");
     return $attvalue;
 }
 568  
 /**
  * Runs the attribute checks for one tag: drops forbidden attributes,
  * rewrites forbidden attribute values and appends forced attributes.
  *
  * For every attribute:
  *  1. If a $rm_attnames rule matches the tag name and the attribute name,
  *     the attribute is removed and no further check is run on it.
  *  2. Otherwise the value is entity-decoded with deent() and every
  *     matching $bad_attvals rule is applied with preg_replace(); the
  *     attribute is overwritten only when a rule actually changed the
  *     decoded value.
  * Finally, attributes from every matching $add_attr_to_tag rule are merged
  * in with array_merge(), so they override existing ones of the same name.
  *
  * @param  $tagname         String with the name of the tag.
  * @param  $attary          Array with all tag attributes; a non-array
  *                          (getnxtag() reports false for a tag without
  *                          attributes) is treated as an empty array.
  * @param  $rm_attnames     See description for sanitize
  * @param  $bad_attvals     See description for sanitize
  * @param  $add_attr_to_tag See description for sanitize
  * @return                  Array with modified attributes.
  */
 function fixatts($tagname,
                  $attary,
                  $rm_attnames,
                  $bad_attvals,
                  $add_attr_to_tag
                  ){
     $me = 'fixatts';
     $this->spew("$me: Fixing attributes\n");
     if (!is_array($attary)){
         $attary = array();
     }
     // foreach works on a snapshot of $attary, so unsetting or overwriting
     // entries inside the loop is safe.
     foreach ($attary as $attname => $attvalue){
         /**
          * See if this attribute should be removed.
          */
         foreach ($rm_attnames as $matchtag => $matchattrs){
             if (preg_match($matchtag, $tagname)){
                 foreach ($matchattrs as $matchattr){
                     if (preg_match($matchattr, $attname)){
                         $this->spew("$me: Attribute '$attname' defined as bad.\n");
                         $this->spew("$me: Removing.\n");
                         unset($attary[$attname]);
                         // Move on to the next attribute. A plain "continue"
                         // would only leave the innermost loop, and the
                         // value checks below could then put the removed
                         // attribute back.
                         continue 3;
                     }
                 }
             }
         }
         /**
          * Remove any entities.
          */
         $attvalue = $this->deent($attvalue);

         /**
          * Now let's run checks on the attvalues.
          */
         foreach ($bad_attvals as $matchtag => $matchattrs){
             if (preg_match($matchtag, $tagname)){
                 foreach ($matchattrs as $matchattr => $valary){
                     if (preg_match($matchattr, $attname)){
                         /**
                          * There are two arrays in valary.
                          * First is matches.
                          * Second one is replacements
                          */
                         list($valmatch, $valrepl) = $valary;
                         $newvalue = preg_replace($valmatch, $valrepl, $attvalue);
                         if ($newvalue != $attvalue){
                             $this->spew("$me: attvalue is now $newvalue\n");
                             $attary[$attname] = $newvalue;
                         }
                     }
                 }
             }
         }
     }
     /**
      * See if we need to append any attributes to this tag.
      */
     foreach ($add_attr_to_tag as $matchtag => $addattary){
         if (preg_match($matchtag, $tagname)){
             $attary = array_merge($attary, $addattary);
             $this->spew("$me: Added attributes to this tag\n");
         }
     }
     return $attary;
 }
 645  
 646  /**
 647   * This is the main function and the one you should actually be calling.
 648   * There are several variables you should be aware of and which need
 649   * special description.
 650   *
 651   * $tag_list
 652   * ----------
 653   * This is a simple one-dimensional array of strings, except for the
 654   * very first one. The first member should be either false or true.
 655   * In case it's FALSE, the following list will be considered a list of
 656   * tags that should be explicitly REMOVED from the body, and all
 657   * others that did not match the list will be allowed.  If the first
 658   * member is TRUE, then the list is the list of tags that should be
 659   * explicitly ALLOWED -- any tag not matching this list will be
 660   * discarded.
 661   *
 662   * Examples:
 663   * $tag_list = Array(
 664   *                   false,   
 665   *                   "blink", 
 666   *                   "link",
 667   *             "object",
 668   *             "meta",
 669   *                   "marquee",
 670   *                   "html"
 671   *                    );
 672   *
 673   * This will allow all tags except for blink, link, object, meta, marquee, 
 674   * and html.
 675   *
 676   * $tag_list = Array(
 677   *                   true, 
 678   *                   "b", 
 679   *                   "a", 
 680   *                   "i", 
 681   *                   "img", 
 682   *                   "strong", 
 683   *                   "em", 
 684   *                   "p"
 685   *                  );
 686   *
 687   * This will remove all tags from the body except b, a, i, img, strong, em and
 688   * p.
 689   *
 690   * $rm_tags_with_content
 691   * ---------------------
 692   * This is a simple one-dimensional array of strings, which specifies the
 693   * tags to be removed with any and all content between the beginning and
 694   * the end of the tag.
 695   * Example:
 696   * $rm_tags_with_content = Array(
 697   *                               "script",
 698   *                               "style", 
 699   *                               "applet",
 700   *                               "embed"
 701   *                              );
 702   *
 703   * This will remove the following structure:
 704   * <script>
 705   *  window.alert("Isn't cross-site-scripting fun?!");
 706   * </script>
 707   * 
 708   * $self_closing_tags
 709   * ------------------
 710   * This is a simple one-dimensional array of strings, which specifies which
 711   * tags contain no content and should not be forcefully closed if this option
 712   * is turned on (see further).
 713   * Example:
 714   * $self_closing_tags =  Array(
 715   *                             "img",
 716   *                             "br", 
 717   *                             "hr",
 718   *                             "input"
 719   *                            );    
 720   *
 721   * $force_tag_closing
 722   * ------------------
 723   * Set it to true to forcefully close any tags opened within the document.
 724   * This is good if you want to take care of people who like to screw up
 725   * the pages by leaving unclosed tags like <a>, <b>, <i>, etc.
 726   *
 727   * $rm_attnames
 728   * -------------
 729   * Now we come to parameters that are more obscure. This parameter is
 730   * a nested array which is used to specify which attributes should be
 731   * removed. It goes like so:
 732   * 
 733   * $rm_attnames = Array(
 734   *   "PCRE regex to match tag name" =>
 735   *     Array(
 736   *           "PCRE regex to match attribute name"
 737   *           )
 738   *   );
 739   *
 740   * Example:
 741   * $rm_attnames = Array(
 742   *   "|.*|" =>
 743   *     Array(
 744   *           "|target|i",
 745   *           "|^on.*|i"  
 746   *          )
 747   *   );
 748   *
 749   * This will match all tags (.*), and specify that all attributes
 750   * named "target" and starting with "on" should be removed. This will take
 751   * care of the following problem:
 752   * <em onmouseover="window.alert('muahahahaha')">
 753   * The "onmouseover" will be removed.
 754   *
 755   * $bad_attvals
 756   * ------------
 757   * This is where it gets ugly. This is a nested array with many levels.
 758   * It goes like so:
 759   *
 760   * $bad_attvals = Array(
 761   *   "pcre regex to match tag name" =>
 762   *     Array(
 763   *           "pcre regex to match attribute name" =>
 764   *             Array(
 765   *                   "pcre regex to match attribute value"
 766   *                  ),
 767   *             Array(
 768   *                   "pcre regex replace a match from above with"
 769   *                  )
 770   *          )
 771   *   );
 772   *
 773   * An extensive example:
 774   *
 775   * $bad_attvals = Array(
 776   *   "|.*|" =>
 777   *      Array(
 778   *            "/^src|background|href|action/i" =>
 779   *                Array(
 780   *                      Array(
 781   *                            "/^([\'\"])\s*\S+script\s*:.*([\'\"])/si"
 782   *                            ),
 783   *                      Array(
 784   *                            "\\1http://veryfunny.com/\\2"
 785   *                            )
 786   *                      ),
 787   *            "/^style/i" =>
 788   *                Array(
 789   *                      Array(
 790   *                            "/expression/si",
 791   *                            "/url\(([\'\"])\s*https*:.*([\'\"])\)/si",
 792   *                            "/url\(([\'\"])\s*\S+script:.*([\'\"])\)/si"
 793   *                           ),
 794   *                      Array(
 795   *                            "idiocy",
 796   *                            "url(\\1http://veryfunny.com/\\2)",
 797   *                            "url(\\1http://veryfunny.com/\\2)"
 798   *                           )
 799   *                      )
 800   *            )
 801   *  );
 802   *
 803   * This will take care of nearly all known cross-site scripting exploits,
 804   * plus some (see my filter sample at 
 805   * http://www.mricon.com/html/phpfilter.html for a working version).
 806   *
 807   * $add_attr_to_tag
 808   * ----------------
 809   * This is a useful little feature which lets you add attributes to 
 810   * certain tags. It is a nested array as well, but not at all like
 811   * the previous one. It goes like so:
 812   * 
 813   * $add_attr_to_tag = Array(
 814   *   "PCRE regex to match tag name" =>
 815   *     Array(
 816   *           "attribute name"=>'"attribute value"'
 817   *          )
 818   *   );
 819   * 
 820   * Note: don't forget quotes around attribute value.
 821   * 
 822   * Example:
 823   * 
 824   * $add_attr_to_tag = Array(
 825   *   "/^a$/si" => 
 826   *     Array(
 827   *           'target'=>'"_new"'
 828   *          )
 829   *   );
 830   * 
 831   * This will change all <a> tags and add target="_new" to them so all links
 832   * open in a new window.
 833   *
 834   *
 835   *
 836   * @param $body                 the string with HTML you wish to filter
 837   * @param $tag_list             see description above
 838   * @param $rm_tags_with_content see description above
 839   * @param $self_closing_tags    see description above
 840   * @param $force_tag_closing    see description above
 841   * @param $rm_attnames          see description above
 842   * @param $bad_attvals          see description above
 843   * @param $add_attr_to_tag      see description above
 844   * @return                      sanitized html safe to show on your pages.
 845   */
 846  function sanitize($body, 
 847                                      $tag_list, 
 848                                      $rm_tags_with_content,
 849                                      $self_closing_tags,
 850                                      $force_tag_closing,
 851                                      $rm_attnames,
 852                                      $bad_attvals,
 853                                      $add_attr_to_tag
 854                                      ){
 855      $me = 'sanitize';
 856      /**
 857       * Normalize rm_tags and rm_tags_with_content.
 858       */
 859      @array_walk($rm_tags, 'casenormalize');
 860      @array_walk($rm_tags_with_content, 'casenormalize');
 861      @array_walk($self_closing_tags, 'casenormalize');
 862      /**
 863       * See if tag_list is of tags to remove or tags to allow.
 864       * false  means remove these tags
 865       * true   means allow these tags
 866       */
 867      $rm_tags = array_shift($tag_list);
 868      $curpos = 0;
 869      $open_tags = Array();
 870      #$trusted = "<!-- begin sanitized html -->\n";
 871      $trusted = "";
 872      $skip_content = false;
 873      /**
 874       * Take care of netscape's stupid javascript entities like
 875       * &{alert('boo')};
 876       */
 877      $body = preg_replace('/&(\{.*?\};)/si', '&amp;\\1', $body);
 878      $this->spew("$me: invoking the loop\n");
 879      while (($curtag = $this->getnxtag($body, $curpos)) != FALSE){
 880          list($tagname, $attary, $tagtype, $lt, $gt) = $curtag;
 881          $this->spew("$me: grabbing free-standing content\n");
 882          $free_content = substr($body, $curpos, $lt - $curpos);
 883          $this->spew("$me: " . strlen($free_content) . " chars grabbed\n");
 884          if ($skip_content == false){
 885              $this->spew("$me: appending free content to trusted.\n");
 886              $trusted .= $free_content;
 887          } else {
 888              $this->spew("$me: Skipping free content.\n");
 889          }
 890          if ($tagname != FALSE){
 891              $this->spew("$me: tagname is '$tagname'\n");
 892              if ($tagtype == 2){
 893                  $this->spew("$me: This is a closing tag\n");
 894                  if ($skip_content == $tagname){
 895                      /**
 896                       * Got to the end of tag we needed to remove.
 897                       */
 898                      $this->spew("$me: Finished removing tag with content\n");
 899                      $tagname = false;
 900                      $skip_content = false;
 901                  } else {
 902                      if ($skip_content == false){
 903                          if (isset($open_tags{$tagname}) && 
 904                                  $open_tags{$tagname} > 0){
 905                              $this->spew("$me: popping '$tagname' from open_tags\n");
 906                              $open_tags{$tagname}--;
 907                          } else {
 908                              $this->spew("$me: '$tagname' was never opened\n");
 909                              $this->spew("$me: removing\n");
 910                              $tagname = false;
 911                          }
 912                      } else {
 913                          $this->spew("$me: Skipping this tag\n");
 914                      }
 915                  }
 916              } else {
 917                  /**
 918                   * $rm_tags_with_content
 919                   */
 920                  if ($skip_content == false){
 921                      /**
 922                       * See if this is a self-closing type and change
 923                       * tagtype appropriately.
 924                       */
 925                      if ($tagtype == 1
 926                              && in_array($tagname, $self_closing_tags)){
 927                          $this->spew("$me: Self-closing tag. Changing tagtype.\n");
 928                          $tagtype = 3;
 929                      }
 930                      /**
 931                       * See if we should skip this tag and any content
 932                       * inside it.
 933                       */
 934                      if ($tagtype == 1 && in_array($tagname, $rm_tags_with_content)){
 935                          $this->spew("$me: removing this tag with content\n");
 936                          $skip_content = $tagname;
 937                      } else {
 938                          if (($rm_tags == false && in_array($tagname, $tag_list)) ||
 939                                  ($rm_tags == true && !in_array($tagname, $tag_list))){
 940                              $this->spew("$me: Removing this tag.\n");
 941                              $tagname = false;
 942                          } else {
 943                              if ($tagtype == 1){
 944                                  $this->spew("$me: adding '$tagname' to open_tags\n");
 945                                  if (isset($open_tags{$tagname})){
 946                                      $open_tags{$tagname}++;
 947                                  } else {
 948                                      $open_tags{$tagname} = 1;
 949                                  }
 950                              }
 951                              /**
 952                               * This is where we run other checks.
 953                               */
 954                              if (is_array($attary) && sizeof($attary) > 0){
 955                                  $attary = $this->fixatts($tagname,
 956                                                                      $attary,
 957                                                                      $rm_attnames,
 958                                                                      $bad_attvals,
 959                                                                      $add_attr_to_tag);
 960                              }
 961                          }
 962                      }
 963                  } else {
 964                      $this->spew("$me: Skipping this tag\n");
 965                  }
 966              }
 967              if ($tagname != false && $skip_content == false){
 968                  $this->spew("$me: Appending tag to trusted.\n");
 969                  $trusted .= $this->tagprint($tagname, $attary, $tagtype);
 970              }
 971          } else {
 972              $this->spew("$me: Removing invalid tag\n");
 973          }
 974          $curpos = $gt + 1;
 975      }
 976      $this->spew("$me: Appending any leftover content\n");
 977      $trusted .= substr($body, $curpos, strlen($body) - $curpos);
 978      if ($force_tag_closing == true){
 979          foreach ($open_tags as $tagname=>$opentimes){
 980              while ($opentimes > 0){
 981                  $this->spew("$me: '$tagname' left open. Closing by force.\n");
 982                  $trusted .= '</' . $tagname . '>';
 983                  $opentimes--;
 984              }
 985          }
 986          $trusted .= "\n";
 987      }
 988  #  $trusted .= "<!-- end sanitized html -->\n";
 989      $trusted .= "";
 990      return $trusted;
 991  }
 992  // class end
 993  }
 994  ?>
Code source de eGroupWare 1.2.106-2

/felamimail/inc/ -> class.htmlfilter.inc.php (source)