A function to close HTML tags in PHP (Brute Force)
02 November 2010
This is the result of several hours of hard work, so please keep my name in the code if possible when you use it.
Test case:
// Demo driver: every fragment below is repaired with closeHtmlTags() and the
// result is printed on its own line. htmlEscape() is a helper that escapes
// "<" to "&lt;" etc., so the repaired markup is displayed literally instead
// of being rendered by the browser.
$testCases = array(
    "<a>a", // should be closed.
    "<_a>a", // _a is not a valid html tag.
    "<a_>a", // a_ is a possible html tag.
    "a",
    "<a>a<", // Recognize last '<' as close tag.
    "<a>a</",
    "<a>a</a",
    "<a>a</a>",
    "<",
    "<a><img href=\"<aaa c='s>.jpg\"><img><br ><br><br/><br//>a",
    '<p href=">"><div><p><div><p>aa</p r="2>"><div>arr',
    '<p1><p2><x><p3><p4>m</x>aa',
    '<p1><p2><x><p3><p4>m<',
    '<p1><p2><x><p3><p4>m</',
    '<p1><p2><x><p3><p4>m</p',
    '<p1><p2><x><p3><p4>m</p4 rel=">\'>',
    '<p1><p2><x><p3><p4>m</x',
    '<p1><p2><xx><p3><p4>m</x',
    '<p1><script><xx>r',
    '<p1><script><xx>r<',
    '<p1><script><xx>r</',
    '<p1><script><xx>r</scr',
    '<p1><script><xx>r</script m="5>"',
    '<p1><script><xx>r</script m="5>">',
);
foreach ($testCases as $testCase) {
    echo htmlEscape(closeHtmlTags($testCase)).'<br />';
}
// By David <david@24k.com.sg>
/**
 * Escape a string so that it is displayed literally inside an HTML page.
 *
 * Replacements performed (each input character is replaced exactly once,
 * the output is never re-scanned):
 *   "\n" => "<br/>"   (newlines become visible line breaks)
 *   "\r" => ""        (carriage returns are dropped)
 *   "&"  => "&amp;"
 *   "<"  => "&lt;"
 *   ">"  => "&gt;"
 *   "\"" => "&quot;"
 *
 * @param mixed $src Value to escape; it is converted to a string first.
 * @return string The escaped text.
 */
function htmlEscape($src) {
    $htmlReplaceTable = array(
        "\n" => "<br/>",
        "&"  => "&amp;",
        "<"  => "&lt;",
        ">"  => "&gt;",
        "\r" => "",
        "\"" => "&quot;",
    );
    // With an array argument strtr() substitutes each key in a single pass and
    // never rewrites text it has already produced, which is exactly what the
    // original character-by-character loop did.
    return strtr((string) $src, $htmlReplaceTable);
}
The function:
- <?php
// By David <david@24k.com.sg>
/**
 * Brute-force repair of a (possibly truncated) HTML fragment.
 *
 * The input is scanned once, byte by byte, with a small state machine that
 * tracks whether the cursor is inside a tag and inside a quoted attribute
 * value. Every opening tag that is not a void element is remembered; closing
 * tags remove the nearest matching remembered tag. When the input ends:
 *   - an unterminated attribute quote is closed,
 *   - an unterminated opening tag is finished with "/>",
 *   - an unterminated closing tag ("<", "</", "</di", ...) is completed to
 *     the nearest open tag whose name starts with what was typed so far,
 *   - every tag that is still open is closed, innermost first.
 * The content of <script> is not parsed: the scanner jumps straight to the
 * next "</script" (or completes/appends "</script>" if there is none).
 *
 * Text is only ever appended; the original input is never modified.
 *
 * @param mixed $html HTML fragment; it is converted to a string first.
 * @return string The fragment with the missing closers appended.
 */
function closeHtmlTags($html) {
    // Elements that never take a closing tag. '!' stands for declarations
    // and comments such as <!DOCTYPE ...> or <!-- ... -->.
    $arr_single_tags = array(
        'meta', 'img', 'br', 'link', 'area', 'hr', 'input', '!',
        'base', 'col', 'embed', 'param', 'source', 'track', 'wbr',
    );
    $html = (string) $html;
    $at = 0;
    // Length of the ORIGINAL input: text appended below is never re-scanned.
    $end = strlen($html);
    $isInQuote1 = false;      // inside "..." within a tag
    $isInQuote2 = false;      // inside '...' within a tag
    $isInTag = false;         // between '<' and the matching '>'
    $isInOpeningTag = false;  // the current tag is <x ...>, not </x ...>
    $isReadingTag = false;    // still collecting the tag name
    $tagCurr = '';            // name of the tag currently being read
    $tagClosing = array();    // stack of open tag names, innermost last

    while ($at < $end) {
        $char = $html[$at];
        $isInQuote = $isInQuote1 || $isInQuote2;

        if ($char === '<') {
            if (!$isInQuote && !$isInTag) {
                if ($at === $end - 1) {
                    // The input ends with a lone '<'.
                    if (count($tagClosing) > 0) {
                        // Treat it as the start of a closing tag; the name is
                        // filled in by the end-of-input handling below.
                        $html .= "/";
                        $isInTag = true;
                        $isInOpeningTag = false;
                        $isReadingTag = true;
                        $tagCurr = '';
                    } else {
                        $html .= " />";
                    }
                    break;
                }
                $charNext = $html[++$at];
                $isAlpha = ($charNext >= 'a' && $charNext <= 'z') || ($charNext >= 'A' && $charNext <= 'Z');
                if ($isAlpha || $charNext === '!') {
                    $isInTag = true;
                    $isInOpeningTag = true;
                    $isReadingTag = $charNext !== '!';
                    $tagCurr = $charNext;
                } elseif ($charNext === '/') {
                    if ($at === $end - 1) {
                        // The input ends with "</": closing tag without a name.
                        $isInTag = true;
                        $isInOpeningTag = false;
                        $isReadingTag = true;
                        $tagCurr = '';
                        break;
                    }
                    $charNext = $html[++$at];
                    $isAlpha = ($charNext >= 'a' && $charNext <= 'z') || ($charNext >= 'A' && $charNext <= 'Z');
                    if ($isAlpha) {
                        $isInTag = true;
                        $isInOpeningTag = false;
                        $isReadingTag = true;
                        $tagCurr = $charNext;
                    }
                }
                // Anything else after '<' is not a tag; the characters looked
                // at so far are consumed as plain text.
            }
        } elseif ($char === '>') {
            if (!$isInQuote && $isInTag) {
                $isInTag = false;
                $isReadingTag = false;
                $tagCurr = strtolower($tagCurr);
                if (!$isInOpeningTag) {
                    // Closing tag: forget the nearest open tag of that name.
                    // A closer that matches nothing is ignored.
                    for ($i = count($tagClosing) - 1; $i >= 0; $i--) {
                        if ($tagClosing[$i] === $tagCurr) {
                            array_splice($tagClosing, $i, 1);
                            break;
                        }
                    }
                } elseif ($tagCurr === "script") {
                    $pos = stripos($html, "</script", $at);
                    if ($pos === false) {
                        // No terminator at all. If the input stops in the
                        // middle of one ("<", "</", "</s", ...), finish it;
                        // otherwise append a complete "</script>".
                        $completion = "</script>";
                        for ($len = 8; $len >= 1; $len--) {
                            if (strtolower(substr($html, -$len)) === substr("</script", 0, $len)) {
                                $completion = substr("</script>", $len);
                                break;
                            }
                        }
                        $html .= $completion;
                        break;
                    }
                    // Skip the script body and resume inside its closing tag.
                    $tagClosing[] = "script";
                    $isInTag = true;
                    $isInOpeningTag = false;
                    $isReadingTag = false;
                    $tagCurr = "script";
                    // Resume exactly at the character that follows "</script"
                    // (usually '>'). Falling through to the $at++ below would
                    // skip that character and make the NEXT tag's '>' end the
                    // script element, so that tag would never be tracked.
                    $at = $pos + 8;
                    continue;
                } elseif (!in_array($tagCurr, $arr_single_tags, true)) {
                    $tagClosing[] = $tagCurr;
                }
            }
        } elseif ($char === '"') {
            if ($isInQuote1) {
                $isInQuote1 = false;
            } elseif (!$isInQuote2 && $isInTag) {
                $isReadingTag = false;
                $isInQuote1 = true;
            }
        } elseif ($char === "'") {
            if ($isInQuote2) {
                $isInQuote2 = false;
            } elseif (!$isInQuote1 && $isInTag) {
                $isReadingTag = false;
                $isInQuote2 = true;
            }
        } elseif (!$isInQuote && $isInTag) {
            $isNameChar = ($char >= 'a' && $char <= 'z')
                || ($char >= 'A' && $char <= 'Z')
                || ($char === "_")
                || ($char >= '0' && $char <= '9');
            if (!$isNameChar) {
                // Whitespace, '/', '=', ... ends the tag name.
                $isReadingTag = false;
            } elseif ($isReadingTag) {
                $tagCurr .= $char;
            }
        }
        $at++;
    }

    // The input ended inside an attribute value: close the quote.
    if ($isInQuote1) {
        $html .= '"';
    }
    if ($isInQuote2) {
        $html .= "'";
    }

    if ($isInTag) {
        if ($isInOpeningTag) {
            // Unterminated opening tag: finish it as a self-closed tag.
            $html .= "/>";
        } else {
            // Unterminated closing tag: complete it to the nearest open tag
            // whose name starts with what has been read so far (an empty
            // name matches the innermost open tag).
            $tagCurr = strtolower($tagCurr);
            $tagCurrLen = strlen($tagCurr);
            for ($i = count($tagClosing) - 1; $i >= 0; $i--) {
                if (strncmp($tagClosing[$i], $tagCurr, $tagCurrLen) === 0) {
                    $html .= (string) substr($tagClosing[$i], $tagCurrLen) . ">";
                    array_splice($tagClosing, $i, 1);
                    break;
                }
            }
        }
    }

    // Close whatever is still open, innermost first.
    for ($i = count($tagClosing) - 1; $i >= 0; $i--) {
        $html .= "</{$tagClosing[$i]}>";
    }
    return $html;
}
- ?>