说下思路:利用 PHP 的 curl 函数存储 cookie。Google 搜索页面无法直接用 file_get_contents 打开,必须完全模拟浏览器才行;百度就不同了,直接用 file_get_contents 抓取页面,然后用正则处理一下就行了,这里就不列举百度了。代码如下,有问题请联系我。
- <?php
- // Declare the response as UTF-8 HTML; header() has to run before any output is sent.
- header("Content-Type: text/html;charset=utf-8");
- /**
-  * Looks for $url_s in Google's result pages for $keyword and echoes its rank.
-  *
-  * Starting at $page, result pages are fetched one at a time (10 results per
-  * page) until a result whose heading markup contains $url_s is found or page
-  * 10 has been checked. The outcome is echoed as HTML; nothing is returned.
-  *
-  * @param string $url_s   Text searched for inside each result's <h3 class="r"><a ...> markup.
-  * @param string $keyword Search phrase sent to Google.
-  * @param int    $page    First result page to inspect (1-based).
-  */
- function ggsearch($url_s, $keyword, $page = 1) {
-     $enKeyword = urlencode($keyword);
-     // The keyword comes from the form, so escape it before echoing it into HTML.
-     $safeKeyword = htmlspecialchars($keyword, ENT_QUOTES, 'UTF-8');
-     $cookie_file = dirname(__FILE__) . "/temp/google.txt"; // cookie storage
-     // Iterate instead of recursing: one pass per result page, at most up to page 10.
-     for ($page = (int) $page; $page <= 10; $page++) {
-         $page_num = ($page - 1) * 10;
-         $interface = "eth0:" . rand(1, 4); // pick one of four interface aliases to reduce the chance of an IP ban
-         $url = "http://www.google.com/search?q=$enKeyword&hl=en&prmd=imvns&ei=JPnJTvLFI8HlggeXwbRl&start=$page_num&sa=N";
-         $ch = curl_init();
-         curl_setopt($ch, CURLOPT_URL, $url);
-         curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2 GTB5");
-         curl_setopt($ch, CURLOPT_INTERFACE, $interface); // outgoing interface for this request
-         curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
-         curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
-         // COOKIEFILE sends previously stored cookies; COOKIEJAR alone only writes them.
-         curl_setopt($ch, CURLOPT_COOKIEFILE, $cookie_file);
-         curl_setopt($ch, CURLOPT_COOKIEJAR, $cookie_file);
-         $contents = curl_exec($ch);
-         curl_close($ch);
-         // curl_exec() returns false on failure; treat that page as having no results.
-         if (!is_string($contents)) {
-             continue;
-         }
-         if (!preg_match_all("!<div\s*id=\"search\">(.*)</div>\s+<\!--z-->!", $contents, $line)) {
-             continue;
-         }
-         // foreach replaces each(), which was removed in PHP 8.
-         foreach ($line[0] as $v) {
-             preg_match_all("!<h3\s+class=\"r\"><a[^>]+>(.*?)</a>!", $v, $title);
-             foreach ($title[0] as $i => $heading) {
-                 if (strpos($heading, $url_s) === false) {
-                     continue;
-                 }
-                 $j = $i + 1;                    // position on this page
-                 $sum = $j + ($page - 1) * 10;   // overall rank across pages
-                 // NOTE(review): $heading is markup taken from the remote page and is echoed as-is.
-                 echo "关键字" . $safeKeyword . "<br>" . "排名:" . '<font color="red" size="20" style="">' . $sum . '</font>' . "####" . "第" . '<font color="#00FFFF" size="18" style="">' . $page . '</font>' . " 页" . "第" . '<font color="#8000FF" size="15" style="">' . $j . '</font>' . "名" . $heading . "<br>";
-                 echo "<a href='" . htmlspecialchars($url, ENT_QUOTES, 'UTF-8') . "'>" . "点击搜索结果" . "</a>" . "<br>";
-                 echo "<hr>";
-                 return; // first hit is the rank; stop searching
-             }
-         }
-     }
-     echo '关键字' . $safeKeyword . '10页之内没有该网站排名' . '<br>';
-     echo "<hr>";
- }
- // Handle a submitted form: split the textarea into keywords, look up the
- // rank of the given URL for each keyword, then print the elapsed time.
- if (!empty($_POST['submit'])) {
-     $start = microtime(true);
-     // Guard against missing or non-string fields before trimming.
-     $more_key = (isset($_POST['textarea']) && is_string($_POST['textarea'])) ? trim($_POST['textarea']) : '';
-     $url_s = (isset($_POST['url']) && is_string($_POST['url'])) ? trim($_POST['url']) : '';
-     if ($more_key !== '' && $url_s !== '') {
-         // Keywords may be separated by "|" and/or line breaks (LF or CRLF);
-         // blank entries are dropped so no empty search is issued.
-         $exkey = array();
-         foreach (preg_split('/[\r\n|]+/', $more_key) as $candidate) {
-             $candidate = trim($candidate);
-             if ($candidate !== '') {
-                 $exkey[] = $candidate;
-             }
-         }
-         // The original code built a "www."-prefixed copy of the URL here but never
-         // used it; matching has always been done against the raw input, as below.
-         foreach ($exkey as $keyword) {
-             ggsearch($url_s, $keyword);
-         }
-         $end = microtime(true);
-         echo '<hr>';
-         echo '程序运行时间: ';
-         echo $end - $start;
-     }
- }
- ?>
- <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
- <html xmlns="http://www.w3.org/1999/xhtml">
- <head>
- <title>抓取排名</title>
- </head>
- <body>
- <!-- Rank lookup form; the empty action posts back to this same page. -->
- <form action="" method="post">
- <!-- The format hint sits outside the textarea so it is not submitted as keywords. -->
- <p>格式例如:keyword1|keyword2|keyword3<br />或者: keyword1<br />keyword2<br />keyword3</p>
- <span>关键字:</span> <textarea name="textarea" rows="20" cols="40" wrap="off"></textarea>
- <span>url地址:</span><input type="text" name="url">
- <input type="submit" name="submit" value="搜索">
- </form>
- </body>
- </html>
本文转自 mcshell 51CTO博客,原文链接:http://blog.51cto.com/mcshell/728050,如需转载请自行联系原作者