// Fetch the page with the curl() helper from the full example; it hands back
// the response body as a string, or false when the transfer fails.
$html = curl($url);
if ($html === false || $html === '') {
    // DOMDocument::loadHTML() cannot parse a failed or empty response.
    throw new RuntimeException('Failed to fetch ' . $url);
}

// Buffer libxml parse errors internally instead of emitting a PHP warning
// for every piece of markup the parser does not accept.
libxml_use_internal_errors(true);
$dom = new DOMDocument();
$dom->loadHTML($html);
<?php
// Print the <title> of a remote HTML page using DOMDocument and XPath.

// Buffer libxml parse errors instead of emitting a PHP warning for each one.
libxml_use_internal_errors(true);

$doc = new DOMDocument();
if (!$doc->loadHTMLFile('http://netkiller.github.io/')) {
    // Without a parsed document the XPath below could only yield an empty title.
    throw new RuntimeException('Unable to load http://netkiller.github.io/');
}

$xpath = new DOMXPath($doc);
// string(...) makes evaluate() return the title text rather than a node list.
$title = $xpath->evaluate('string(/html/head/title)');
echo "Document title is: " . $title . "\n";
?>
<?php
/**
 * Fetch a URL with cURL and return the response body.
 *
 * Sends a plain GET request by default; when $fields is non-empty the request
 * becomes a URL-encoded POST of those fields. The requested URL is also sent
 * as the Referer header.
 *
 * @param string $url    Address to request.
 * @param array  $fields POST fields; an empty array means no POST body.
 * @param mixed  $auth   Currently ignored; kept so callers that pass it keep working.
 * @return string|false  Response body, or false when the handle cannot be
 *                       created or the transfer fails.
 */
function curl($url, $fields = array(), $auth = false)
{
    $curl = curl_init($url);
    if ($curl === false) {
        // Same failure value that curl_exec() reports below.
        return false;
    }

    curl_setopt($curl, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.1) Gecko/20061204 Firefox/2.0.0.1");
    // Return the body from curl_exec() instead of printing it.
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($curl, CURLOPT_VERBOSE, 0);
    // Do not include the response headers in the returned string.
    curl_setopt($curl, CURLOPT_HEADER, 0);
    curl_setopt($curl, CURLOPT_REFERER, $url);
    // NOTE(review): peer certificate verification is switched off, so HTTPS
    // responses are not authenticated. Kept for backward compatibility;
    // confirm whether it can be enabled before using this outside a demo.
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);

    if ($fields) {
        curl_setopt($curl, CURLOPT_POST, true);
        curl_setopt($curl, CURLOPT_POSTFIELDS, http_build_query($fields));
    }

    $response = curl_exec($curl);
    curl_close($curl);

    return $response;
}

// Example: fetch a page and dump every <div class="section"> element.
$url = "http://netkiller.github.io/journal/index.html";
$html = curl($url);
if ($html === false || $html === '') {
    // DOMDocument::loadHTML() cannot parse a failed or empty response.
    throw new RuntimeException('Failed to fetch ' . $url);
}

// Buffer libxml parse errors instead of emitting a PHP warning for each one.
libxml_use_internal_errors(true);
$dom = new DOMDocument();
$dom->loadHTML($html);

$xpath = new DOMXPath($dom);
$xml = $xpath->query('//div[@class="section"]');
foreach ($xml as $result_object) {
    // To print only the text of the first child node instead, use:
    // echo $result_object->childNodes->item(0)->nodeValue;
    print_r($result_object);
}
下面的例子演示如何从某个网站的页面中提取一个 HTML 块:
// Fetch a page and print the HTML of its first <div class="section"> element.
$url = "http://netkiller.github.io/journal/index.html";
$html = curl($url);
if ($html === false || $html === '') {
    // DOMDocument::loadHTML() cannot parse a failed or empty response.
    throw new RuntimeException('Failed to fetch ' . $url);
}

// Buffer libxml parse errors instead of emitting a PHP warning for each one.
libxml_use_internal_errors(true);
$dom = new DOMDocument();
$dom->loadHTML($html);

$xpath = new DOMXPath($dom);
$xml = $xpath->query('//div[@class="section"]');

$first = $xml->item(0);
if ($first === null) {
    // saveHTML() without a node serialises the whole document, which would
    // silently defeat the point of extracting a single block.
    throw new RuntimeException('No div with class "section" found in ' . $url);
}

$xhtml = $dom->saveHTML($first);
print_r($xhtml);
原文出处:Netkiller 系列 手札
本文作者:陈景峯
转载请与作者联系,同时请务必标明文章原始出处和作者信息及本声明。