[ Index ] |
|
Code source de eGroupWare 1.2.106-2 |
<?php
/**
 * htmlfilter.inc
 * ---------------
 * This set of functions allows you to filter html in order to remove
 * any malicious tags from it. Useful in cases when you need to filter
 * user input for any cross-site-scripting attempts.
 *
 * Copyright (c) 2002 by Duke University
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
 * 02111-1307, USA.
 *
 * @Author  Konstantin Riabitsev <icon@linux.duke.edu>
 * @Version 1.0.5 (Oct-16-2002)
 */

class htmlfilter
{

    /**
     * See http://www.mricon.com/html/phpfilter.html
     *
     * This is a debugging function used throughout the code. To enable
     * debugging you have to specify a global variable called "debug" before
     * calling sanitize() and set it to true.
     *
     * Note: Although insignificantly, debugging does slow you down even
     * when $debug is set to false. If you wish to get rid of all
     * debugging calls, run the following command:
     *
     * fgrep -v 'spew("' htmlfilter.inc > htmlfilter.inc.new
     *
     * htmlfilter.inc.new will contain no debugging calls.
     *
     * @param  $message  A string with the message to output.
     * @return           void.
     */
    function spew($message){
        global $debug;
        if ($debug == true){
            echo "$message<br>";
        }
    }

    /**
     * This function returns the final tag out of the tag name, an array
     * of attributes, and the type of the tag. This function is called by
     * sanitize internally.
     *
     * @param  $tagname  the name of the tag.
     * @param  $attary   the array of attributes and their (already quoted)
     *                   values, or a non-array when there are none.
     * @param  $tagtype  The type of the tag: 1 opening, 2 closing,
     *                   3 XHTML-style content-less tag.
     * @return           a string with the final tag representation.
     */
    function tagprint($tagname, $attary, $tagtype){
        $me = 'tagprint';
        if ($tagtype == 2){
            $fulltag = '</' . $tagname . '>';
        } else {
            $fulltag = '<' . $tagname;
            if (is_array($attary) && sizeof($attary)){
                $atts = array();
                foreach ($attary as $attname => $attvalue){
                    $atts[] = "$attname=$attvalue";
                }
                $fulltag .= ' ' . join(' ', $atts);
            }
            if ($tagtype == 3){
                $fulltag .= ' /';
            }
            $fulltag .= '>';
        }
        $this->spew("$me: $fulltag\n");
        return $fulltag;
    }

    /**
     * A small helper function to use with array_walk. Modifies a by-ref
     * value and makes it lowercase.
     *
     * @param  $val a value passed by-ref.
     * @return      void since it modifies a by-ref value.
     */
    function casenormalize(&$val){
        $val = strtolower($val);
    }

    /**
     * This function skips any whitespace from the current position within
     * a string and to the next non-whitespace value.
     *
     * @param  $body   the string
     * @param  $offset the offset within the string where we should start
     *                 looking for the next non-whitespace character.
     * @return         the location within the $body where the next
     *                 non-whitespace char is located.
     */
    function skipspace($body, $offset){
        $me = 'skipspace';
        $matches = array();
        // The cast keeps preg_match() happy on PHP versions where substr()
        // returns false for an offset at/after the end of the string.
        if (preg_match('/^\s+/s', (string) substr($body, $offset), $matches)){
            $count = strlen($matches[0]);
            $this->spew("$me: skipped $count chars\n");
            $offset += $count;
        }
        return $offset;
    }

    /**
     * This function looks for the next character within a string.  It's
     * really just a glorified "strpos", except it catches the failures
     * nicely.
     *
     * @param  $body   The string to look for needle in.
     * @param  $offset Start looking from this position.
     * @param  $needle The character/string to look for.
     * @return         location of the next occurance of the needle, or
     *                 strlen($body) if needle wasn't found.
     */
    function findnxstr($body, $offset, $needle){
        $me = 'findnxstr';
        $bodylen = strlen($body);
        // strpos() rejects offsets past the end of the string, so treat
        // them as "not found" ourselves.
        $pos = ($offset <= $bodylen) ? strpos($body, $needle, $offset) : false;
        if ($pos === false){
            $this->spew("$me: end of body reached\n");
            return $bodylen;
        }
        $this->spew("$me: '$needle' found at pos $pos\n");
        return $pos;
    }

    /**
     * This function takes a PCRE-style regexp and tries to match it
     * within the string.
     *
     * @param  $body   The string to look for needle in.
     * @param  $offset Start looking from here.
     * @param  $reg    A PCRE-style regex to match (without delimiters; it
     *                 is embedded between '%' delimiters).
     * @return         Returns a false if no matches found, or an array
     *                 with the following members:
     *                 - integer with the location of the match within $body
     *                 - string with whatever content between offset and the match
     *                 - string with whatever it is we matched
     */
    function findnxreg($body, $offset, $reg){
        $me = 'findnxreg';
        $matches = array();
        $preg_rule = '%^(.*?)(' . $reg . ')%s';
        $found = preg_match($preg_rule, (string) substr($body, $offset), $matches);
        // A PCRE failure (preg_match() === false) is handled like "no match",
        // which makes the callers discard the tag they were parsing.
        if (!$found || empty($matches[0])){
            $this->spew("$me: No matches found.\n");
            return false;
        }
        $pos = $offset + strlen($matches[1]);
        $this->spew("$me: '$reg' found at pos $pos matching '" . $matches[2] . "'\n");
        return array($pos, $matches[1], $matches[2]);
    }

    /**
     * This function looks for the next tag.
     *
     * @param  $body   String where to look for the next tag.
     * @param  $offset Start looking from here.
     * @return         false if no more tags exist in the body, or
     *                 an array with the following members:
     *                 - string with the name of the tag
     *                 - array with attributes and their values
     *                 - integer with tag type (1, 2, or 3)
     *                 - integer where the tag starts (starting "<")
     *                 - integer where the tag ends (ending ">")
     *                 first three members will be false, if the tag is invalid.
     *                 The end position is never smaller than the start
     *                 position, so callers can always advance past the tag.
     */
    function getnxtag($body, $offset){
        $me = 'getnxtag';
        $bodylen = strlen($body);
        if ($offset > $bodylen){
            $this->spew("$me: Past the end of body\n");
            return false;
        }
        $lt = $this->findnxstr($body, $offset, '<');
        if ($lt == $bodylen){
            $this->spew("$me: No more tags found!\n");
            return false;
        }
        /**
         * We are here:
         * blah blah <tag attribute="value">
         * \---------^
         */
        $this->spew("$me: Found '<' at pos $lt\n");
        $pos = $this->skipspace($body, $lt + 1);
        if ($pos >= $bodylen){
            $this->spew("$me: End of body reached.\n");
            return array(false, false, false, $lt, $bodylen);
        }
        /**
         * There are 3 kinds of tags:
         * 1. Opening tag, e.g.:
         *    <a href="blah">
         * 2. Closing tag, e.g.:
         *    </a>
         * 3. XHTML-style content-less tag, e.g.:
         *    <img src="blah"/>
         */
        $tagtype = false;
        switch (substr($body, $pos, 1)){
            case '/':
                $this->spew("$me: This is a closing tag (type 2)\n");
                $tagtype = 2;
                $pos++;
                break;
            case '!':
                /**
                 * A comment or an SGML declaration.
                 */
                if (substr($body, $pos + 1, 2) == '--'){
                    $this->spew("$me: A comment found. Stripping.\n");
                    $gt = strpos($body, '-->', $pos);
                    if ($gt === false){
                        $gt = $bodylen;
                    } else {
                        // Point at the '>' that terminates '-->'.
                        $gt += 2;
                    }
                    return array(false, false, false, $lt, $gt);
                }
                $this->spew("$me: An SGML declaration found. Stripping.\n");
                $gt = $this->findnxstr($body, $pos, '>');
                return array(false, false, false, $lt, $gt);
            default:
                /**
                 * Assume tagtype 1 for now. If it's type 3, we'll switch values
                 * later.
                 */
                $tagtype = 1;
                break;
        }

        /**
         * Look for next [\W-_], which will indicate the end of the tag name.
         */
        $regary = $this->findnxreg($body, $pos, '[^\w\-_]');
        if ($regary === false){
            $this->spew("$me: End of body reached while analyzing tag name\n");
            return array(false, false, false, $lt, $bodylen);
        }
        list($pos, $tagname, $match) = $regary;
        $tagname = strtolower($tagname);

        /**
         * $match can be either of these:
         * '>'  indicating the end of the tag entirely.
         * '\s' indicating the end of the tag name.
         * '/'  indicating that this is type-3 xhtml tag.
         *
         * Whatever else we find there indicates an invalid tag.
         */
        switch ($match){
            case '/':
                /**
                 * This is an xhtml-style tag with a closing / at the
                 * end, like so: <img src="blah"/>. Check if it's followed
                 * by the closing bracket. If not, then this tag is invalid
                 */
                if (substr($body, $pos, 2) == '/>'){
                    $this->spew("$me: XHTML-style tag found.\n");
                    $pos++;
                    $this->spew("$me: Setting tagtype to 3\n");
                    $tagtype = 3;
                } else {
                    $this->spew("$me: Found invalid character '/'.\n");
                    $gt = $this->findnxstr($body, $pos, '>');
                    $this->spew("$me: Tag is invalid. Returning.\n");
                    return array(false, false, false, $lt, $gt);
                }
                // Deliberate fall-through: $pos now sits on the closing '>'.
            case '>':
                $this->spew("$me: End of tag found at $pos\n");
                $this->spew("$me: Tagname is '$tagname'\n");
                $this->spew("$me: This tag has no attributes\n");
                return array($tagname, false, $tagtype, $lt, $pos);
            default:
                /**
                 * Check if it's whitespace
                 */
                if (preg_match('/\s/', $match)){
                    $this->spew("$me: Tagname is '$tagname'\n");
                } else {
                    /**
                     * This is an invalid tag! Look for the next closing ">".
                     * The search has to start at the tag itself ($lt): starting
                     * at $offset could return a '>' located before the tag and
                     * make the caller loop over the same tag forever.
                     */
                    $this->spew("$me: Invalid characters found in tag name: $match\n");
                    $gt = $this->findnxstr($body, $lt, '>');
                    return array(false, false, false, $lt, $gt);
                }
        }

        /**
         * At this point we're here:
         * <tagname  attribute='blah'>
         * \-------^
         *
         * At this point we loop in order to find all attributes.
         */
        $attname = '';
        $attary = array();

        while ($pos <= $bodylen){
            $pos = $this->skipspace($body, $pos);
            if ($pos == $bodylen){
                /**
                 * Non-closed tag.
                 */
                $this->spew("$me: End of body reached before end of tag. Discarding.\n");
                return array(false, false, false, $lt, $pos);
            }
            /**
             * See if we arrived at a ">" or "/>", which means that we reached
             * the end of the tag.
             */
            $matches = array();
            if (preg_match('%^(\s*)(>|/>)%s', (string) substr($body, $pos), $matches)){
                /**
                 * Yep. So we did.
                 */
                $this->spew("$me: Arrived at the end of the tag.\n");
                $pos += strlen($matches[1]);
                if ($matches[2] == '/>'){
                    $tagtype = 3;
                    $pos++;
                }
                return array($tagname, $attary, $tagtype, $lt, $pos);
            }

            /**
             * There are several types of attributes, with optional
             * [:space:] between members.
             * Type 1:
             *   attrname[:space:]=[:space:]'CDATA'
             * Type 2:
             *   attrname[:space:]=[:space:]"CDATA"
             * Type 3:
             *   attr[:space:]=[:space:]CDATA
             * Type 4:
             *   attrname
             *
             * We leave types 1 and 2 the same, type 3 we check for
             * '"' and convert to "&quot;" if needed, then wrap in
             * double quotes. Type 4 we convert into:
             * attrname="yes".
             */
            $regary = $this->findnxreg($body, $pos, '[^\w\-_]');
            if ($regary === false){
                /**
                 * Looks like body ended before the end of tag.
                 */
                $this->spew("$me: End of body found before end of tag.\n");
                $this->spew("$me: Invalid, returning\n");
                return array(false, false, false, $lt, $bodylen);
            }
            list($pos, $attname, $match) = $regary;
            $attname = strtolower($attname);
            $this->spew("$me: Attribute '$attname' found\n");
            /**
             * We arrived at the end of attribute name. Several things possible
             * here:
             * '>'  means the end of the tag and this is attribute type 4
             * '/'  if followed by '>' means the same thing as above
             * '\s' means a lot of things -- look what it's followed by.
             *      anything else means the attribute is invalid.
             */
            switch ($match){
                case '/':
                    /**
                     * This is an xhtml-style tag with a closing / at the
                     * end, like so: <img src="blah"/>. Check if it's followed
                     * by the closing bracket. If not, then this tag is invalid
                     */
                    if (substr($body, $pos, 2) == '/>'){
                        $this->spew("$me: This is an xhtml-style tag.\n");
                        $pos++;
                        $this->spew("$me: Setting tagtype to 3\n");
                        $tagtype = 3;
                    } else {
                        $this->spew("$me: Found invalid character '/'.\n");
                        $gt = $this->findnxstr($body, $pos, '>');
                        $this->spew("$me: Tag is invalid. Returning.\n");
                        return array(false, false, false, $lt, $gt);
                    }
                    // Deliberate fall-through: $pos now sits on the closing '>'.
                case '>':
                    $this->spew("$me: found type 4 attribute.\n");
                    $this->spew("$me: Additionally, end of tag found at $pos\n");
                    $this->spew("$me: Attname is '$attname'\n");
                    $this->spew("$me: Setting attvalue to 'yes'\n");
                    $attary[$attname] = '"yes"';
                    return array($tagname, $attary, $tagtype, $lt, $pos);
                default:
                    /**
                     * Skip whitespace and see what we arrive at.
                     */
                    $pos = $this->skipspace($body, $pos);
                    $char = substr($body, $pos, 1);
                    /**
                     * Two things are valid here:
                     * '=' means this is attribute type 1 2 or 3.
                     * \w means this was attribute type 4.
                     * anything else we ignore and re-loop. End of tag and
                     * invalid stuff will be caught by our checks at the beginning
                     * of the loop.
                     */
                    if ($char == '='){
                        $this->spew("$me: Attribute type 1, 2, or 3 found.\n");
                        $pos++;
                        $pos = $this->skipspace($body, $pos);
                        /**
                         * Here are 3 possibilities:
                         * "'"  attribute type 1
                         * '"'  attribute type 2
                         * everything else is the content of tag type 3
                         */
                        $quot = substr($body, $pos, 1);
                        if ($quot == '\''){
                            $this->spew("$me: In fact, this is attribute type 1\n");
                            $this->spew("$me: looking for closing quote\n");
                            $regary = $this->findnxreg($body, $pos + 1, '\'');
                            if ($regary === false){
                                $this->spew("$me: end of body reached before end of val\n");
                                $this->spew("$me: Returning\n");
                                return array(false, false, false, $lt, $bodylen);
                            }
                            list($pos, $attval, $match) = $regary;
                            $this->spew("$me: Attvalue is '$attval'\n");
                            $pos++;
                            $attary[$attname] = '\'' . $attval . '\'';
                        } else if ($quot == '"'){
                            $this->spew("$me: In fact, this is attribute type 2\n");
                            $this->spew("$me: looking for closing quote\n");
                            $regary = $this->findnxreg($body, $pos + 1, '\"');
                            if ($regary === false){
                                $this->spew("$me: end of body reached before end of val\n");
                                $this->spew("$me: Returning\n");
                                return array(false, false, false, $lt, $bodylen);
                            }
                            list($pos, $attval, $match) = $regary;
                            $this->spew("$me: Attvalue is \"$attval\"\n");
                            $pos++;
                            $attary[$attname] = '"' . $attval . '"';
                        } else {
                            $this->spew("$me: This looks like attribute type 3\n");
                            /**
                             * These are hateful. Look for \s, or >.
                             */
                            $this->spew("$me: Looking for end of attval\n");
                            $regary = $this->findnxreg($body, $pos, '[\s>]');
                            if ($regary === false){
                                $this->spew("$me: end of body reached before end of val\n");
                                $this->spew("$me: Returning\n");
                                return array(false, false, false, $lt, $bodylen);
                            }
                            list($pos, $attval, $match) = $regary;
                            /**
                             * If it's ">" it will be caught at the top.
                             * Embedded double quotes must be entity-encoded
                             * because the value is wrapped in double quotes.
                             */
                            $this->spew("$me: translating '\"' into &quot;\n");
                            $attval = preg_replace('/\"/s', '&quot;', $attval);
                            $this->spew("$me: wrapping in quotes\n");
                            $attary[$attname] = '"' . $attval . '"';
                        }
                    } else if (preg_match('|[\w/>]|', $char)){
                        /**
                         * That was attribute type 4.
                         */
                        $this->spew("$me: attribute type 4 found.\n");
                        $this->spew("$me: Setting value to 'yes'\n");
                        $attary[$attname] = '"yes"';
                    } else {
                        /**
                         * An illegal character. Find next '>' and return.
                         */
                        $this->spew("$me: illegal character '$char' found.\n");
                        $this->spew("$me: returning\n");
                        $gt = $this->findnxstr($body, $pos, '>');
                        return array(false, false, false, $lt, $gt);
                    }
            }
        }
        /**
         * The fact that we got here indicates that the tag end was never
         * found. Return invalid tag indication so it gets stripped.
         */
        $this->spew("$me: No tag end found\n");
        return array(false, false, false, $lt, $bodylen);
    }

    /**
     * This function checks attribute values for entity-encoded values
     * and returns them translated into 8-bit strings so we can run
     * checks on them.
     *
     * Named entities from get_html_translation_table(HTML_ENTITIES) and
     * numbered entities 0 to 255 (decimal and hexadecimal, with or without
     * the trailing semicolon) are translated. Quote characters are left
     * encoded, because attribute values are carried around together with
     * their surrounding quotes.
     *
     * @param  $attvalue A string to run entity check against.
     * @return           Translated value.
     */
    function deent($attvalue){
        $me = 'deent';
        /**
         * See if we have to run the checks first. All entities must start
         * with "&".
         */
        if (strpos($attvalue, '&') === false){
            return $attvalue;
        }
        /**
         * Check named entities first.
         */
        $this->spew("$me: translating named entities\n");
        $trans = array_flip(get_html_translation_table(HTML_ENTITIES));
        /**
         * Leave &quot; in, as it can mess us up.
         */
        unset($trans['&quot;']);
        foreach ($trans as $ent => $val){
            $attvalue = preg_replace('/' . $ent . '*/si', $val, $attvalue);
        }
        /**
         * Now translate numbered entities from 0 to 255 if needed. Going
         * from the highest code down makes sure "&#255" is handled before
         * its prefixes "&#25" and "&#2".
         */
        if (strpos($attvalue, '#') !== false){
            $this->spew("$me: translating numbered entities\n");
            // 34 is '"' and 39 is "'": never decode the quote characters.
            $omit = array(34, 39);
            for ($asc = 255; $asc >= 0; $asc--){
                if (!in_array($asc, $omit)){
                    $chr = chr($asc);
                    $octrule = '/\&#0*' . $asc . ';*/si';
                    $hexrule = '/\&#x0*' . dechex($asc) . ';*/si';
                    $attvalue = preg_replace($octrule, $chr, $attvalue);
                    $attvalue = preg_replace($hexrule, $chr, $attvalue);
                }
            }
        }
        $this->spew("$me: translated into: $attvalue\n");
        return $attvalue;
    }

    /**
     * This function runs various checks against the attributes.
     *
     * For every attribute: if its name matches $rm_attnames it is removed
     * and not looked at any further; otherwise its entity-decoded value is
     * run through every matching $bad_attvals rule, each rule working on
     * the result of the previous one. Finally the attributes from
     * $add_attr_to_tag are merged in.
     *
     * @param  $tagname         String with the name of the tag.
     * @param  $attary          Array with all tag attributes.
     * @param  $rm_attnames     See description for sanitize
     * @param  $bad_attvals     See description for sanitize
     * @param  $add_attr_to_tag See description for sanitize
     * @return                  Array with modified attributes.
     */
    function fixatts($tagname,
                     $attary,
                     $rm_attnames,
                     $bad_attvals,
                     $add_attr_to_tag
                     ){
        $me = 'fixatts';
        $this->spew("$me: Fixing attributes\n");
        if (!is_array($rm_attnames)){
            $rm_attnames = array();
        }
        if (!is_array($bad_attvals)){
            $bad_attvals = array();
        }
        if (!is_array($add_attr_to_tag)){
            $add_attr_to_tag = array();
        }
        // foreach walks a copy, so removing entries from $attary is safe.
        foreach ($attary as $attname => $attvalue){
            /**
             * See if this attribute should be removed.
             */
            $removed = false;
            foreach ($rm_attnames as $matchtag => $matchattrs){
                if (!preg_match($matchtag, $tagname)){
                    continue;
                }
                foreach ($matchattrs as $matchattr){
                    if (preg_match($matchattr, $attname)){
                        $this->spew("$me: Attribute '$attname' defined as bad.\n");
                        $this->spew("$me: Removing.\n");
                        unset($attary[$attname]);
                        $removed = true;
                        break 2;
                    }
                }
            }
            if ($removed){
                // Must not reach the value checks below: they would put the
                // attribute straight back into $attary.
                continue;
            }
            /**
             * Remove any entities.
             */
            $attvalue = $this->deent($attvalue);

            /**
             * Now let's run checks on the attvalues.
             */
            foreach ($bad_attvals as $matchtag => $matchattrs){
                if (!preg_match($matchtag, $tagname)){
                    continue;
                }
                foreach ($matchattrs as $matchattr => $valary){
                    if (!preg_match($matchattr, $attname)){
                        continue;
                    }
                    /**
                     * There are two arrays in valary.
                     * First is matches.
                     * Second one is replacements
                     */
                    list($valmatch, $valrepl) = $valary;
                    $newvalue = preg_replace($valmatch, $valrepl, $attvalue);
                    if ($newvalue === null){
                        // PCRE error: the value could not be checked, so
                        // drop the attribute instead of trusting it.
                        $this->spew("$me: could not check '$attname'. Removing.\n");
                        unset($attary[$attname]);
                        continue 3;
                    }
                    if ($newvalue !== $attvalue){
                        $this->spew("$me: attvalue is now $newvalue\n");
                        $attary[$attname] = $newvalue;
                        // Later rules have to see the cleaned value, or they
                        // would overwrite this fix with one computed from
                        // the original value.
                        $attvalue = $newvalue;
                    }
                }
            }
        }
        /**
         * See if we need to append any attributes to this tag.
         */
        foreach ($add_attr_to_tag as $matchtag => $addattary){
            if (preg_match($matchtag, $tagname)){
                $attary = array_merge($attary, $addattary);
                $this->spew("$me: Added attributes to this tag\n");
            }
        }
        return $attary;
    }

    /**
     * This is the main function and the one you should actually be calling.
     * There are several variables you should be aware of an which need
     * special description.
     *
     * $tag_list
     * ----------
     * This is a simple one-dimentional array of strings, except for the
     * very first one. The first member should be either false or true.
     * In case it's FALSE, the following list will be considered a list of
     * tags that should be explicitly REMOVED from the body, and all
     * others that did not match the list will be allowed.  If the first
     * member is TRUE, then the list is the list of tags that should be
     * explicitly ALLOWED -- any tag not matching this list will be
     * discarded.
     *
     * Examples:
     * $tag_list = Array(
     *                   false,
     *                   "blink",
     *                   "link",
     *                   "object",
     *                   "meta",
     *                   "marquee",
     *                   "html"
     *                          );
     *
     * This will allow all tags except for blink, link, object, meta, marquee,
     * and html.
     *
     * $tag_list = Array(
     *                   true,
     *                   "b",
     *                   "a",
     *                   "i",
     *                   "img",
     *                   "strong",
     *                   "em",
     *                   "p"
     *                  );
     *
     * This will remove all tags from the body except b, a, i, img, strong, em and
     * p.
     *
     * $rm_tags_with_content
     * ---------------------
     * This is a simple one-dimentional array of strings, which specifies the
     * tags to be removed with any and all content between the beginning and
     * the end of the tag. A content-less form of such a tag
     * (e.g. <script src="x"/>) is removed as well, and so is everything
     * after an opening tag that is never closed.
     * Example:
     * $rm_tags_with_content = Array(
     *                               "script",
     *                               "style",
     *                               "applet",
     *                               "embed"
     *                              );
     *
     * This will remove the following structure:
     * <script>
     *  window.alert("Isn't cross-site-scripting fun?!");
     * </script>
     *
     * $self_closing_tags
     * ------------------
     * This is a simple one-dimentional array of strings, which specifies which
     * tags contain no content and should not be forcefully closed if this option
     * is turned on (see further).
     * Example:
     * $self_closing_tags =  Array(
     *                             "img",
     *                             "br",
     *                             "hr",
     *                             "input"
     *                            );
     *
     * The three tag lists above are matched case-insensitively.
     *
     * $force_tag_closing
     * ------------------
     * Set it to true to forcefully close any tags opened within the document.
     * This is good if you want to take care of people who like to screw up
     * the pages by leaving unclosed tags like <a>, <b>, <i>, etc.
     *
     * $rm_attnames
     * -------------
     * Now we come to parameters that are more obscure. This parameter is
     * a nested array which is used to specify which attributes should be
     * removed. It goes like so:
     *
     * $rm_attnames = Array(
     *   "PCRE regex to match tag name" =>
     *     Array(
     *           "PCRE regex to match attribute name"
     *           )
     *   );
     *
     * Example:
     * $rm_attnames = Array(
     *   "|.*|" =>
     *     Array(
     *           "|target|i",
     *           "|^on.*|i"
     *          )
     *   );
     *
     * This will match all tags (.*), and specify that all attributes
     * named "target" and starting with "on" should be removed. This will take
     * care of the following problem:
     * <em onmouseover="window.alert('muahahahaha')">
     * The "onmouseover" will be removed.
     *
     * $bad_attvals
     * ------------
     * This is where it gets ugly. This is a nested array with many levels.
     * It goes like so:
     *
     * $bad_attvals = Array(
     *   "pcre regex to match tag name" =>
     *     Array(
     *           "pcre regex to match attribute name" =>
     *             Array(
     *                   Array(
     *                         "pcre regex to match attribute value"
     *                        ),
     *                   Array(
     *                         "pcre regex replace a match from above with"
     *                        )
     *                  )
     *          )
     *   );
     *
     * An extensive example:
     *
     * $bad_attvals = Array(
     *   "|.*|" =>
     *      Array(
     *            "/^src|background|href|action/i" =>
     *                Array(
     *                      Array(
     *                            "/^([\'\"])\s*\S+script\s*:.*([\'\"])/si"
     *                            ),
     *                      Array(
     *                            "\\1http://veryfunny.com/\\2"
     *                            )
     *                      ),
     *            "/^style/i" =>
     *                Array(
     *                      Array(
     *                            "/expression/si",
     *                            "/url\(([\'\"])\s*https*:.*([\'\"])\)/si",
     *                            "/url\(([\'\"])\s*\S+script:.*([\'\"])\)/si"
     *                           ),
     *                      Array(
     *                            "idiocy",
     *                            "url(\\1http://veryfunny.com/\\2)",
     *                            "url(\\1http://veryfynny.com/\\2)"
     *                           )
     *                      )
     *            )
     *  );
     *
     * This will take care of nearly all known cross-site scripting exploits,
     * plus some (see my filter sample at
     * http://www.mricon.com/html/phpfilter.html for a working version).
     *
     * $add_attr_to_tag
     * ----------------
     * This is a useful little feature which lets you add attributes to
     * certain tags. It is a nested array as well, but not at all like
     * the previous one. It goes like so:
     *
     * $add_attr_to_tag = Array(
     *   "PCRE regex to match tag name" =>
     *     Array(
     *           "attribute name"=>'"attribute value"'
     *          )
     *   );
     *
     * Note: don't forget quotes around attribute value.
     *
     * Example:
     *
     * $add_attr_to_tag = Array(
     *   "/^a$/si" =>
     *     Array(
     *           'target'=>'"_new"'
     *          )
     *   );
     *
     * This will add target="_new" to <a> tags so links open in a new window.
     * Note that $rm_attnames, $bad_attvals and $add_attr_to_tag are only
     * applied to tags that carry at least one attribute.
     *
     * @param $body                 the string with HTML you wish to filter
     * @param $tag_list             see description above
     * @param $rm_tags_with_content see description above
     * @param $self_closing_tags    see description above
     * @param $force_tag_closing    see description above
     * @param $rm_attnames          see description above
     * @param $bad_attvals          see description above
     * @param $add_attr_to_tag      see description above
     * @return                      sanitized html safe to show on your pages.
     */
    function sanitize($body,
                      $tag_list,
                      $rm_tags_with_content,
                      $self_closing_tags,
                      $force_tag_closing,
                      $rm_attnames,
                      $bad_attvals,
                      $add_attr_to_tag
                      ){
        $me = 'sanitize';
        /**
         * See if tag_list is of tags to remove or tags to allow.
         * false  means remove these tags
         * true   means allow these tags
         */
        $rm_tags = array_shift($tag_list);
        $allow_listed = ($rm_tags == true);
        /**
         * Normalize the tag lists: parsed tag names are always lowercase.
         * The mode flag has to be shifted off $tag_list before this.
         */
        if (!is_array($rm_tags_with_content)){
            $rm_tags_with_content = array();
        }
        if (!is_array($self_closing_tags)){
            $self_closing_tags = array();
        }
        array_walk($tag_list, array($this, 'casenormalize'));
        array_walk($rm_tags_with_content, array($this, 'casenormalize'));
        array_walk($self_closing_tags, array($this, 'casenormalize'));

        $curpos = 0;
        $open_tags = array();
        $trusted = '';
        // false, or the name of the tag whose content is being dropped.
        $skip_content = false;
        /**
         * Take care of netscape's stupid javascript entities like
         * &{alert('boo')};
         */
        $escaped = preg_replace('/&(\{.*?\};)/si', '&amp;\\1', $body);
        if ($escaped !== null){
            $body = $escaped;
        }
        $this->spew("$me: invoking the loop\n");
        while (($curtag = $this->getnxtag($body, $curpos)) !== false){
            list($tagname, $attary, $tagtype, $lt, $gt) = $curtag;
            $this->spew("$me: grabbing free-standing content\n");
            $free_content = substr($body, $curpos, $lt - $curpos);
            $this->spew("$me: " . strlen($free_content) . " chars grabbed\n");
            if ($skip_content === false){
                $this->spew("$me: appending free content to trusted.\n");
                $trusted .= $free_content;
            } else {
                $this->spew("$me: Skipping free content.\n");
            }
            // Loose check on purpose: invalid tags come back as false and
            // "<>" comes back with an empty name; both are dropped.
            if ($tagname != false){
                $this->spew("$me: tagname is '$tagname'\n");
                if ($tagtype == 2){
                    $this->spew("$me: This is a closing tag\n");
                    if ($skip_content === $tagname){
                        /**
                         * Got to the end of tag we needed to remove.
                         */
                        $this->spew("$me: Finished removing tag with content\n");
                        $tagname = false;
                        $skip_content = false;
                    } else if ($skip_content === false){
                        if (isset($open_tags[$tagname]) && $open_tags[$tagname] > 0){
                            $this->spew("$me: popping '$tagname' from open_tags\n");
                            $open_tags[$tagname]--;
                        } else {
                            $this->spew("$me: '$tagname' was never opened\n");
                            $this->spew("$me: removing\n");
                            $tagname = false;
                        }
                    } else {
                        $this->spew("$me: Skipping this tag\n");
                    }
                } else if ($skip_content === false){
                    /**
                     * See if this is a self-closing type and change
                     * tagtype appropriately.
                     */
                    if ($tagtype == 1 && in_array($tagname, $self_closing_tags, true)){
                        $this->spew("$me: Self-closing tag. Changing tagtype.\n");
                        $tagtype = 3;
                    }
                    /**
                     * See if we should skip this tag and any content
                     * inside it.
                     */
                    if (in_array($tagname, $rm_tags_with_content, true)){
                        if ($tagtype == 1){
                            $this->spew("$me: removing this tag with content\n");
                            $skip_content = $tagname;
                        } else {
                            // Content-less form (<script src="x"/>): there is
                            // no content to skip, but the tag itself must
                            // still go.
                            $this->spew("$me: Removing this tag.\n");
                            $tagname = false;
                        }
                    } else if (in_array($tagname, $tag_list, true) !== $allow_listed){
                        // Listed on a remove-list, or missing from an
                        // allow-list.
                        $this->spew("$me: Removing this tag.\n");
                        $tagname = false;
                    } else {
                        if ($tagtype == 1){
                            $this->spew("$me: adding '$tagname' to open_tags\n");
                            if (isset($open_tags[$tagname])){
                                $open_tags[$tagname]++;
                            } else {
                                $open_tags[$tagname] = 1;
                            }
                        }
                        /**
                         * This is where we run other checks.
                         */
                        if (is_array($attary) && sizeof($attary) > 0){
                            $attary = $this->fixatts($tagname,
                                                     $attary,
                                                     $rm_attnames,
                                                     $bad_attvals,
                                                     $add_attr_to_tag);
                        }
                    }
                } else {
                    $this->spew("$me: Skipping this tag\n");
                }
                if ($tagname != false && $skip_content === false){
                    $this->spew("$me: Appending tag to trusted.\n");
                    $trusted .= $this->tagprint($tagname, $attary, $tagtype);
                }
            } else {
                $this->spew("$me: Removing invalid tag\n");
            }
            $curpos = $gt + 1;
        }
        /**
         * Whatever follows the last tag is kept, unless we are still inside
         * a tag that is removed with its content and was never closed.
         */
        if ($skip_content === false && $curpos < strlen($body)){
            $this->spew("$me: Appending any leftover content\n");
            $trusted .= substr($body, $curpos);
        }
        if ($force_tag_closing == true){
            foreach ($open_tags as $tagname => $opentimes){
                while ($opentimes > 0){
                    $this->spew("$me: '$tagname' left open. Closing by force.\n");
                    $trusted .= '</' . $tagname . '>';
                    $opentimes--;
                }
            }
            $trusted .= "\n";
        }
        return $trusted;
    }
// class end
}
?>
titre
Description
Corps
titre
Description
Corps
titre
Description
Corps
titre
Corps
Généré le : Sun Feb 25 17:20:01 2007 | par Balluche grâce à PHPXref 0.7 |