Skip to content

Instantly share code, notes, and snippets.

@akkunchoi
Last active February 14, 2017 14:06
Show Gist options
  • Select an option

  • Save akkunchoi/7787284 to your computer and use it in GitHub Desktop.

Select an option

Save akkunchoi/7787284 to your computer and use it in GitHub Desktop.
php tidy dom xpath
<html>
<section>hoge</section>
</html>
<?php
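/**
* For HTML fragments: use loadXML().
* loadHTML() garbles Japanese (multibyte) text.
* For full web pages that declare their charset in a meta tag, use loadHTML().
* loadXML() apparently cannot parse IE conditional comments?
*
*
*/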
//$html = file_get_contents('https://github.com/');
//$html = file_get_contents('http://localhost/');
//$html = '<a><div>hoge</div><![if !IE]><div id="legal"><![endif]></a>';
//$html = '<html><div><b><!--[if lt IE 9]><script src="http://html5shiv.googlecode.com/svn/trunk/html5.js"></script><![endif]-->あいうえお</b></div></html>';
// Command-line entry point: read the file named by the first argument,
// convert it with dom() and dump the resulting XML.
if (!isset($argv[1])) {
    // Without this guard a missing argument only surfaces as an obscure
    // "undefined offset" notice followed by a failed read.
    fwrite(STDERR, 'Usage: php ' . basename(__FILE__) . " <html-file>\n");
    exit(1);
}
$html = file_get_contents($argv[1]);
if ($html === false) {
    // file_get_contents() returns false when the file cannot be read;
    // do not pass that on to dom() as if it were markup.
    fwrite(STDERR, "Could not read {$argv[1]}\n");
    exit(1);
}
$dom = dom($html);
var_dump($dom->saveXML());
/*
$xpath = new DOMXPath($dom);
foreach ($xpath->query('//a') as $e){
var_dump($e->nodeValue);
}
*/
/**
 * Parse an HTML document or fragment into a DOMDocument.
 *
 * The markup is first converted to XML by tidy, downlevel-revealed IE
 * conditional sections (<![if ...]> / <![endif]>) are rewritten as ordinary
 * comments, and the result is loaded with DOMDocument::loadXML() under an
 * explicit UTF-8 XML declaration.
 *
 * @param string $html Markup to parse; handed to tidy with the 'UTF8' encoding.
 * @return DOMDocument The document loaded from the tidied markup. When
 *                     loadXML() rejects the markup it emits its usual
 *                     warnings and the returned document is left empty.
 * @throws RuntimeException If tidy cannot parse the input at all.
 */
function dom($html){
    $config = array(
        'output-xml' => true,
        'numeric-entities' => true,
        // Declares HTML5 element names to tidy as block-level tags.
        'new-blocklevel-tags' => 'article,aside,bdi,command,details,summary,figure,figcaption,footer,header,hgroup,mark,meter,nav,progress,ruby,rt,rp,section,time,wbr,audio,video,source,embed,track,canvas,datalist,keygen,output'
    );

    $tidy = tidy_parse_string($html, $config, 'UTF8');
    if ($tidy === false) {
        // tidy_parse_string() returns false on failure; calling a method on
        // that value would be a fatal error.
        throw new RuntimeException('tidy could not parse the given markup');
    }
    $tidy->cleanRepair();
    $xml = tidy_get_output($tidy);

    // "<![if !IE]>"-style sections make loadXML() fail with
    // "StartTag: invalid element name", so turn them into regular comments.
    // The lookahead keeps real CDATA sections ("<![CDATA[...]]>") untouched;
    // without it they would be rewritten into comments as well.
    $converted = preg_replace('/<!\[(?!CDATA\[)(.*?)\]>/u', '<!--[$1]-->', $xml);
    if ($converted !== null) {
        // preg_replace() yields null on error (e.g. invalid UTF-8 under /u);
        // in that case keep the unmodified markup instead of losing it.
        $xml = $converted;
    }

    // If the tidied markup already begins with an XML declaration, drop it so
    // the UTF-8 declaration added below is the only one in the document.
    $withoutDecl = preg_replace('/^<\?xml\s.*?\?>\s*/s', '', $xml, 1);
    if ($withoutDecl !== null) {
        $xml = $withoutDecl;
    }

    // loadXML() (rather than loadHTML()) keeps multibyte strings intact even
    // when only partial HTML is given; the declaration pins the encoding.
    $xml = '<?xml version="1.0" encoding="UTF-8"?>' . "\n" . $xml;

    $doc = new DOMDocument();
    $doc->loadXML($xml);
    return $doc;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment