Use DOM To Remove HTML Tag

It is better to parse HTML content with the DOM than with regular expressions. For example, suppose we have content like this:

<p><img class="aligncenter size-full wp-image-3172" src="//wordpress.org/news/files/2014/04/theme1.jpg" alt="theme" width="1003" height="558">
<p class="wp-caption-text">Wordpress Theme</p><br>
Looking for a new theme should be easy and fun. Lose yourself in the boundless supply of free WordPress.org themes with the beautiful new theme browser.</p>

We want to strip out the p tag with class="wp-caption-text" together with its content ("Wordpress Theme"). The tag is right below the image. I wrote a function to accomplish the task. Here is the code.

Source code

<?php
/**
 * Remove every element with the given tag name (optionally narrowed by an
 * attribute test) from an HTML string, together with the element's children.
 *
 * The markup is parsed with DOMDocument, the matching elements are located
 * with the XPath expression "//<tag>" or "//<tag>[@<attribute>]", detached
 * from the tree, and the document is serialised again.
 *
 * @param string $content   HTML markup to clean.
 * @param string $tag       Element name to remove, e.g. "p" or "img".
 * @param string $attribute XPath attribute test without the leading "@",
 *                          e.g. "class='wp-caption-text'" or just "class".
 *                          Pass "" to remove every $tag element.
 *
 * @return string The result of DOMDocument::saveHTML() on the modified
 *                document (a whole document, not just the fragment that was
 *                passed in), or "" when $content is empty.
 */
function remove_content_in_tag($content,$tag,$attribute){
    // DOMDocument::loadHTML() does not accept an empty string, and there is
    // nothing to remove from it anyway.
    if ((string) $content === '') {
        return '';
    }

    // Build the expression once instead of querying twice.
    $expression = '//' . $tag;
    if ((string) $attribute !== '') {
        $expression .= '[@' . $attribute . ']';
    }

    // Keep parser warnings for sloppy real-world HTML out of the output, but
    // only for the duration of this call: the setting is process-wide.
    $previous = libxml_use_internal_errors(true);
    try {
        $doc = new DOMDocument();
        $doc->loadHTML($content);

        $xpath = new DOMXPath($doc);
        $nodes = $xpath->query($expression);

        // query() yields false for a malformed expression; in that case the
        // document is returned unmodified.
        if ($nodes !== false) {
            foreach ($nodes as $node) {
                // A matched node without a parent (e.g. the document itself
                // when $tag is "..") cannot be detached, so skip it.
                if ($node->parentNode !== null) {
                    $node->parentNode->removeChild($node);
                }
            }
        }

        return $doc->saveHTML();
    } finally {
        // Hand the caller's setting back. Switching it off also discards the
        // buffered errors; if the caller had it on, their buffer is left alone.
        libxml_use_internal_errors($previous);
    }
}
?>

To call the function and remove the tags, you can use it like this.

Source code

<?php
    // Print $content with every <p> whose class attribute equals "wp-caption-text" removed.
    echo remove_content_in_tag($content, 'p', "class='wp-caption-text'");
?>

Here is the complete code.

Source code

<?php
// Sample markup: an image, its caption paragraph, and some body text.
// A nowdoc is used because the text needs no variable interpolation.
$content = <<<'EOF'
<p><img class="aligncenter size-full wp-image-3172" src="//wordpress.org/news/files/2014/04/theme1.jpg" alt="theme" width="1003" height="558"><p class="wp-caption-text">Wordpress Theme</p><br>
Looking for a new theme should be easy and fun. Lose yourself in the boundless supply of free WordPress.org themes with the beautiful new theme browser.</p>
EOF;

// Remove only the caption paragraph and print the resulting document.
echo remove_content_in_tag($content, 'p', "class='wp-caption-text'");

// Other calls to try:
//   echo $content;                                     (the untouched markup)
//   echo remove_content_in_tag($content, 'p', '');     (drops every <p>)
//   echo remove_content_in_tag($content, 'img', '');   (drops every <img>)
 
/**
 * Remove every element with the given tag name (optionally narrowed by an
 * attribute test) from an HTML string, together with the element's children.
 *
 * The markup is parsed with DOMDocument, the matching elements are located
 * with the XPath expression "//<tag>" or "//<tag>[@<attribute>]", detached
 * from the tree, and the document is serialised again.
 *
 * @param string $content   HTML markup to clean.
 * @param string $tag       Element name to remove, e.g. "p" or "img".
 * @param string $attribute XPath attribute test without the leading "@",
 *                          e.g. "class='wp-caption-text'" or just "class".
 *                          Pass "" to remove every $tag element.
 *
 * @return string The result of DOMDocument::saveHTML() on the modified
 *                document (a whole document, not just the fragment that was
 *                passed in), or "" when $content is empty.
 */
function remove_content_in_tag($content,$tag,$attribute){
    // DOMDocument::loadHTML() does not accept an empty string, and there is
    // nothing to remove from it anyway.
    if ((string) $content === '') {
        return '';
    }

    // Build the expression once instead of querying twice.
    $expression = '//' . $tag;
    if ((string) $attribute !== '') {
        $expression .= '[@' . $attribute . ']';
    }

    // Keep parser warnings for sloppy real-world HTML out of the output, but
    // only for the duration of this call: the setting is process-wide.
    $previous = libxml_use_internal_errors(true);
    try {
        $doc = new DOMDocument();
        $doc->loadHTML($content);

        $xpath = new DOMXPath($doc);
        $nodes = $xpath->query($expression);

        // query() yields false for a malformed expression; in that case the
        // document is returned unmodified.
        if ($nodes !== false) {
            foreach ($nodes as $node) {
                // A matched node without a parent (e.g. the document itself
                // when $tag is "..") cannot be detached, so skip it.
                if ($node->parentNode !== null) {
                    $node->parentNode->removeChild($node);
                }
            }
        }

        return $doc->saveHTML();
    } finally {
        // Hand the caller's setting back. Switching it off also discards the
        // buffered errors; if the caller had it on, their buffer is left alone.
        libxml_use_internal_errors($previous);
    }
}
?>

My Adventures

My adventures in Programming World

Use DOM To Remove HTML Tag