IT资讯--------抓取各个博客上的文章

简介: 该App已经上传到百度应用市场:http://shouji.baidu.com/soft/item?docid=8928185&from=as&f=search_app_it%E8%B5%84%E8%AE%AF%40list_1_image%402%40header_all_input 有兴趣的可以下载看看。接下来我会公布源代码,不过该App并没有使用代码混淆,所以可以通过反编译清楚地看到源码。



该App已经上传到百度应用市场:http://shouji.baidu.com/soft/item?docid=8928185&from=as&f=search_app_it%E8%B5%84%E8%AE%AF%40list_1_image%402%40header_all_input

有兴趣的可以下载看看。接下来我会公布源代码,不过该App并没有使用代码混淆,所以可以通过反编译清楚地看到源码。



一    CSDN 的博客文章:

1.文章的信息:

/**
 * Mutable data holder for one article entry shown in a news list.
 *
 * <p>Every property is exposed through a plain getter/setter pair; no
 * validation or conversion is performed.
 */
public class NewsItem {
    /** Numeric identifier of the entry. */
    private int id;
    /** Article title. */
    private String title;
    /** Link to the article. */
    private String link;
    /** Publication time, kept as text. */
    private String date;
    /** Link to the article picture. */
    private String picLink;
    /** Article content text. */
    private String content;
    /** Category/type code of the entry. */
    private int newsType;

    /** Creates an entry whose fields all hold their Java default values. */
    public NewsItem() {
    }

    /** @return the numeric identifier */
    public int getId() {
        return this.id;
    }

    /** @param id the numeric identifier to store */
    public void setId(int id) {
        this.id = id;
    }

    /** @return the article title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the article title to store */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the article link */
    public String getLink() {
        return this.link;
    }

    /** @param link the article link to store */
    public void setLink(String link) {
        this.link = link;
    }

    /** @return the publication time text */
    public String getDate() {
        return this.date;
    }

    /** @param date the publication time text to store */
    public void setDate(String date) {
        this.date = date;
    }

    /** @return the picture link */
    public String getPicLink() {
        return this.picLink;
    }

    /** @param picLink the picture link to store */
    public void setPicLink(String picLink) {
        this.picLink = picLink;
    }

    /** @return the content text */
    public String getContent() {
        return this.content;
    }

    /** @param content the content text to store */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the category/type code */
    public int getNewsType() {
        return this.newsType;
    }

    /** @param newsType the category/type code to store */
    public void setNewsType(int newsType) {
        this.newsType = newsType;
    }
}


抓取文章的:

**
 * Created by Administrator on 2015/11/13.
 * 处理NewItem的业务类
 */
public class NewsItemBiz {

    public List<NewsItem> getNewsItems(int newTypes, int currentPage) throws CommonExecption {
        String urlStr = URLUtil.generateUrl(newTypes, currentPage);
        String htmlStr = DataUtil.doGet(urlStr, "UTF-8");

        List<NewsItem> newsItems = new ArrayList<NewsItem>();
        NewsItem newsItem = null;
        Document doc = Jsoup.parse(htmlStr);
        Elements units = doc.getElementsByClass("unit");
        for (int i = 0; i < units.size(); i++) {
            newsItem = new NewsItem();
            newsItem.setNewsType(newTypes);

            Element unit_ele = units.get(i);
            Element h1_ele = unit_ele.getElementsByTag("h1").get(0);
            Element h1_a_ele = h1_ele.child(0);
            String title = h1_a_ele.text();
            title = AppUtil.encoding(title,"utf-8");
            String hred = h1_a_ele.attr("href");
            hred = AppUtil.encoding(hred,"utf-8");
            newsItem.setLink(hred);
            newsItem.setTitle(title);

            Element h4_ele = unit_ele.getElementsByTag("h4").get(0);
            Element ago_ele = h4_ele.getElementsByClass("ago").get(0);
            String date = ago_ele.text();
            date = AppUtil.encoding(date,"utf-8");
            newsItem.setDate(date);

            Element d1_ele = unit_ele.getElementsByTag("dl").get(0);
            Element dt_ele = d1_ele.child(0);

            try {
                Element img_ele = dt_ele.child(0);
                String imgLink = img_ele.child(0).attr("src");
                //    System.out.println("link--------"+imgLink);
                imgLink = AppUtil.encoding(imgLink,"utf-8");
                newsItem.setPicLink(imgLink);
            } catch (IndexOutOfBoundsException e) {
                e.printStackTrace();
            }
            Element content_ele = d1_ele.child(1);
            String content = content_ele.text();
            content = AppUtil.encoding(content,"utf-8");
            newsItem.setContent(content);
            newsItems.add(newsItem);
        }
        return newsItems;
    }

    public NewsDto getNews(String urlStr) throws CommonExecption {
        NewsDto newsDto = new NewsDto();
        List<News> newses = new ArrayList<>();
        String htmlStr = DataUtil.doGet(urlStr, "UTF-8");
        Document doc = Jsoup.parse(htmlStr);

        // 获得文章中的第一个detail
    //    System.out.println(htmlStr);
        Element detailEle = doc.select(".left .detail").get(0);
        // 标题
        Element titleEle = detailEle.select("h1.title").get(0);
        News news = new News();
        String title = titleEle.text();
        title = AppUtil.encoding(title,"utf-8");
        news.setTitle(title);
        news.setType(Constant.TITLE);
        newses.add(news);
        // 摘要
        Element summaryEle = detailEle.select("div.summary").get(0);
        news = new News();
        String summary = summaryEle.text();
        summary = AppUtil.encoding(summary,"utf-8");
        news.setSumary(summary);
        news.setType(Constant.SUMMARY);
        newses.add(news);
        // 内容
        Element contentEle = detailEle.select("div.con.news_content").get(0);
        Elements childrenEle = contentEle.children();

        for (Element child : childrenEle) {
            Elements imgEles = child.getElementsByTag("img");
            // 图片
            if (imgEles.size() > 0) {
                for (Element imgEle : imgEles) {
                    if (imgEle.attr("src").equals(""))
                        continue;
                    news = new News();
                    String imgLink = imgEle.attr("src");
                    imgLink = AppUtil.encoding(imgLink,"utf-8");
                    news.setImageLink(imgLink);
                    news.setType(Constant.IMG);
                    newses.add(news);
                }
            }
            // 移除图片
            imgEles.remove();

            if (child.text().equals(""))
                continue;

            news = new News();
            news.setType(Constant.CONTENT);

            try {
                if (child.children().size() == 1) {
                    Element cc = child.child(0);
                    if (cc.tagName().equals("b")) {
                        news.setType(Constant.BOLD_TITLE);
                    }
                }

            } catch (IndexOutOfBoundsException e) {
                e.printStackTrace();
            }
            String content = child.outerHtml();
            content = AppUtil.encoding(content,"utf-8");
            news.setContent(content);
            newses.add(news);
        }
        newsDto.setNewses(newses);
        return newsDto;
    }

   


}


URL的处理:

/**
 * Builds listing-page URLs for the CSDN news categories.
 */
public class URLUtil {

    public static final String NEWS_LIST_URL = "http://www.csdn.net/headlines.html";
    public static final String NEWS_LIST_URL_YIDONG = "http://mobile.csdn.net/mobile";
    public static final String NEWS_LIST_URL_YANFA = "http://sd.csdn.net/sd";
    public static final String NEWS_LIST_URL_YUNJISUAN = "http://cloud.csdn.net/cloud";
    public static final String NEWS_LIST_URL_ZAZHI = "http://programmer.csdn.net/programmer";
    public static final String NEWS_LIST_URL_YEJIE = "http://news.csdn.net/news";

    /**
     * Returns the category's base URL followed by {@code "/"} and the page number.
     *
     * @param newType     category code; unknown codes fall back to {@link #NEWS_LIST_URL}
     * @param currentPage requested page; values below 1 are treated as 1
     * @return the listing URL for that category and page
     */
    public static String generateUrl(int newType,int currentPage){
        int page = currentPage > 0 ? currentPage : 1;
        return baseUrlFor(newType) + "/" + page;
    }

    /** Maps a category code to the base URL of its listing. */
    private static String baseUrlFor(int newType) {
        switch (newType) {
            case Constant.NEW_TYPE_YEJIE:
                return NEWS_LIST_URL_YEJIE;
            case Constant.NEW_TYPE_YANFA:
                return NEWS_LIST_URL_YANFA;
            case Constant.NEW_TYPE_YUNJISUAN:
                return NEWS_LIST_URL_YUNJISUAN;
            case Constant.NEW_TYPE_YIDONG:
                return NEWS_LIST_URL_YIDONG;
            case Constant.NEW_TYPE_CHENGXUYUAN:
                return NEWS_LIST_URL_ZAZHI;
            default:
                return NEWS_LIST_URL;
        }
    }
}

访问网络:


/**
 * HTTP helper that downloads a page with commons-httpclient and returns its body as text.
 */
public class DataUtil {
    /**
     * Issues a GET request for the given URL and returns the response body as text.
     *
     * <p>The body is collected as raw bytes and decoded once at the end. Decoding each read
     * chunk separately would corrupt any multi-byte character that straddles a chunk boundary.
     *
     * @param urlstr address to fetch
     * @param uncode name of the charset used to decode the response body
     * @return the decoded response body
     * @throws CommonExecption when the status code is not 200 or any error occurs while
     *                         requesting, reading or decoding; the underlying exception, if any,
     *                         is attached as the cause
     */
    public static String doGet(String urlstr, String uncode) throws CommonExecption {
        GetMethod getMethod = null;
        try {
            HttpClient client = new HttpClient();
            getMethod = new GetMethod(urlstr);

            getMethod.addRequestHeader("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
            getMethod.addRequestHeader("Accept-Language","zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3");
            getMethod.addRequestHeader("Host","www.csdn.net");
            getMethod.addRequestHeader("User-Agent","Mozilla/5.0 (Windows NT 6.1; rv:43.0) Gecko/20100101 Firefox/43.0");
            getMethod.addRequestHeader("Connection","keep-alive");

            int state = client.executeMethod(getMethod);
            if (state != 200) {
                throw new CommonExecption("访问网络失败");
            }

            InputStream is = getMethod.getResponseBodyAsStream();
            java.io.ByteArrayOutputStream body = new java.io.ByteArrayOutputStream();
            byte[] buf = new byte[1024];
            int len;
            while ((len = is.read(buf)) != -1) {
                body.write(buf, 0, len);
            }
            return body.toString(uncode);
        } catch (CommonExecption e) {
            // Already the exception type callers expect; do not wrap it again.
            throw e;
        } catch (Exception e) {
            throw new CommonExecption("访问网络失败", e);
        } finally {
            // Releasing the connection also closes the response stream, on success and on failure.
            if (getMethod != null) {
                getMethod.releaseConnection();
            }
        }
    }
}

关于异常处理:


/**
 * Checked exception thrown by the download and parsing helpers when an operation fails.
 *
 * <p>It mirrors the four standard {@link Exception} constructors.
 */
public class CommonExecption extends Exception{

    /** Creates an exception without message or cause. */
    public CommonExecption(){
        super();
    }

    /**
     * @param message description of the failure
     * @param cause   underlying exception that triggered the failure
     */
    public CommonExecption(String message,Throwable cause){
        super(message,cause);
    }

    /** @param message description of the failure */
    public CommonExecption(String message){
        super(message);
    }

    /** @param cause underlying exception that triggered the failure */
    public CommonExecption(Throwable cause){
        super(cause);
    }
}

二  博客园


访问网络:

/**
 * HTTP helper that posts the paging form for the cnblogs listing and returns the response text.
 */
public class BlogHouseDataUtil {

    /**
     * Posts the listing form for the given category and page and returns the response body
     * decoded as UTF-8.
     *
     * <p>Despite its name this method issues a POST request. The body is collected as raw bytes
     * and decoded once, so multi-byte characters that straddle a read boundary are not corrupted.
     *
     * @param urlStr      address the form is posted to
     * @param currentPage value sent as {@code PageIndex}
     * @param newType     category code selecting {@code CategoryType}, {@code CategoryId} and
     *                    {@code ItemListActionName}; for any other code only the paging
     *                    parameters are sent
     * @return the decoded response body
     * @throws CommonExecption when the status code is not 200 or any error occurs; the
     *                         underlying exception, if any, is attached as the cause
     */
    public static String doGet(String urlStr,int currentPage,int newType) throws CommonExecption
    {
        PostMethod post = null;
        try
        {
            HttpClient client = new HttpClient();
            post = new PostMethod(urlStr);
            switch (newType) {
                case Constant.NEWS_TYPE_HOME:
                    post.addParameter("CategoryType", "SiteHome");
                    post.addParameter("CategoryId",String.valueOf(808));
                    post.addParameter("ItemListActionName", "PostList");
                    break;
                case Constant.NEWS_TYPE_PICK:
                    post.addParameter("CategoryType", "Picked");
                    post.addParameter("CategoryId",String.valueOf(-2));
                    post.addParameter("ItemListActionName", "PostList");
                    break;
                case Constant.NEWS_TYPE_CANDIDATE:
                    post.addParameter("CategoryType", "HomeCandidate");
                    post.addParameter("CategoryId",String.valueOf(108697));
                    post.addParameter("ItemListActionName", "PostList");
                    break;
                case Constant.NEWS_TYPE_NEWS:
                    post.addParameter("CategoryType", "News");
                    post.addParameter("CategoryId",String.valueOf(-1));
                    post.addParameter("ItemListActionName", "NewsList");
                    break;
                default:
                    // Unknown category: only the paging parameters below are sent.
                    break;
            }

            post.addParameter("PageIndex",String.valueOf(currentPage));
            post.addParameter("ParentCategoryId",String.valueOf(0));

            int state = client.executeMethod(post);
            if (state != 200)
            {
                throw new CommonExecption("访问网络失败!");
            }

            InputStream is = post.getResponseBodyAsStream();
            java.io.ByteArrayOutputStream body = new java.io.ByteArrayOutputStream();
            byte[] buf = new byte[1024];
            int len;
            while ((len = is.read(buf)) != -1)
            {
                body.write(buf, 0, len);
            }
            return body.toString("UTF-8");
        } catch (CommonExecption e)
        {
            // Already the exception type callers expect; do not wrap it again.
            throw e;
        } catch (Exception e)
        {
            throw new CommonExecption("访问网络失败!", e);
        } finally
        {
            // Releasing the connection also closes the response stream, on success and on failure.
            if (post != null)
            {
                post.releaseConnection();
            }
        }
    }
}


URL处理

/**
 * Builds listing URLs for the cnblogs categories.
 */
public class Blog_URLUtil {

    /** Home page listing. */
    public static final String HOME_URL="http://www.cnblogs.com/#p";
    /** Picked (best-of) listing. */
    public static final String PICK_URL="http://www.cnblogs.com/pick/#p";
    /** Candidate listing. */
    public static final String CANDIDATE_URL="http://www.cnblogs.com/candidate/#p";
    /** News listing. */
    public static final String NEWS_URL="http://www.cnblogs.com/news/#p";

    /**
     * Returns the category's URL prefix immediately followed by the page number.
     *
     * @param newsType    category code; for an unknown code the prefix is empty, so only the
     *                    page number is returned
     * @param currentPage requested page; values below 1 are treated as 1
     * @return the generated URL string
     */
    public static String generateUrl(int newsType, int currentPage)
    {
        int page = currentPage > 0 ? currentPage : 1;
        return prefixFor(newsType) + page;
    }

    /** Maps a category code to its URL prefix, or to the empty string when unknown. */
    private static String prefixFor(int newsType)
    {
        switch (newsType)
        {
            case Constant.NEWS_TYPE_HOME:
                return HOME_URL;
            case Constant.NEWS_TYPE_PICK:
                return PICK_URL;
            case Constant.NEWS_TYPE_CANDIDATE:
                return CANDIDATE_URL;
            case Constant.NEWS_TYPE_NEWS:
                return NEWS_URL;
            default:
                return "";
        }
    }

}

抓取文章 :


/**
 * Turns a cnblogs listing response into {@link NewsItem} objects.
 */
public class NewItemBlogHouse {

    /**
     * Requests one listing page and builds a {@link NewsItem} for every element with class
     * {@code post_item_body}.
     *
     * @param newsType    category code; used for the URL, the request and stored on each item
     * @param currentPage page number; used for the URL and the request
     * @return the parsed items in document order
     * @throws CommonExecption           when {@code BlogHouseDataUtil.doGet} fails
     * @throws IndexOutOfBoundsException when an entry lacks the expected h3/div/p structure
     */
    public List<NewsItem> getNewsItems(int newsType, int currentPage) throws CommonExecption {
        String pageUrl = Blog_URLUtil.generateUrl(newsType, currentPage);
        String html = BlogHouseDataUtil.doGet(pageUrl, currentPage, newsType);
        System.out.println("htmlStr------" + html);

        List<NewsItem> result = new ArrayList<NewsItem>();
        Elements posts = Jsoup.parse(html).getElementsByClass("post_item_body");
        for (Element post : posts) {
            NewsItem item = new NewsItem();
            item.setNewsType(newsType);

            // The first child of the <h3> heading supplies both title text and link.
            Element anchor = post.getElementsByTag("h3").get(0).child(0);
            String title = AppUtil.encoding(anchor.text(), "utf-8");
            String href = AppUtil.encoding(anchor.attr("href"), "utf-8");
            item.setLink(href);
            item.setTitle(title);

            // The text of the second <div> match is stored as the date.
            String date = post.getElementsByTag("div").get(1).text();
            item.setDate(AppUtil.encoding(date, "utf-8"));

            // The picture is optional; a missing element surfaces as IndexOutOfBoundsException.
            try {
                Element firstDiv = post.getElementsByTag("div").get(0);
                Element picAnchor = firstDiv.getElementsByTag("a").get(1);
                String imgLink = picAnchor.child(0).attr("src");
                item.setPicLink(AppUtil.encoding(imgLink, "utf-8"));
            } catch (IndexOutOfBoundsException e) {
                System.out.println("没有图片");
            }

            // The first <p> inside the first <div> match holds the summary text.
            Element summary = post.getElementsByTag("div").get(0).getElementsByTag("p").get(0);
            item.setContent(AppUtil.encoding(summary.text(), "utf-8"));
            result.add(item);
        }
        return result;
    }

}

三    51CTO

网络请求:


/**
 * HTTP helper that downloads a page with commons-httpclient and returns its body as text.
 */
public class DataUtil {
    /**
     * Issues a GET request for the given URL and returns the response body as text.
     *
     * <p>The body is collected as raw bytes and decoded once at the end. Decoding each read
     * chunk separately would corrupt any multi-byte character that straddles a chunk boundary.
     *
     * @param urlstr address to fetch
     * @param uncode name of the charset used to decode the response body
     * @return the decoded response body
     * @throws CommonExecption when the status code is not 200 or any error occurs while
     *                         requesting, reading or decoding; the underlying exception, if any,
     *                         is attached as the cause
     */
    public static String doGet(String urlstr, String uncode) throws CommonExecption {
        GetMethod getMethod = null;
        try {
            HttpClient client = new HttpClient();
            getMethod = new GetMethod(urlstr);

            getMethod.addRequestHeader("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
            getMethod.addRequestHeader("Accept-Language","zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3");
            getMethod.addRequestHeader("Host","www.csdn.net");
            getMethod.addRequestHeader("User-Agent","Mozilla/5.0 (Windows NT 6.1; rv:43.0) Gecko/20100101 Firefox/43.0");
            getMethod.addRequestHeader("Connection","keep-alive");

            int state = client.executeMethod(getMethod);
            if (state != 200) {
                throw new CommonExecption("访问网络失败");
            }

            InputStream is = getMethod.getResponseBodyAsStream();
            java.io.ByteArrayOutputStream body = new java.io.ByteArrayOutputStream();
            byte[] buf = new byte[1024];
            int len;
            while ((len = is.read(buf)) != -1) {
                body.write(buf, 0, len);
            }
            return body.toString(uncode);
        } catch (CommonExecption e) {
            // Already the exception type callers expect; do not wrap it again.
            throw e;
        } catch (Exception e) {
            throw new CommonExecption("访问网络失败", e);
        } finally {
            // Releasing the connection also closes the response stream, on success and on failure.
            if (getMethod != null) {
                getMethod.releaseConnection();
            }
        }
    }
}

URL处理


/**
 * Builds listing URLs for the 51CTO blog recommendation categories.
 */
public class CTO_URLUtil {

    // NOTE(review): FIRST_URL is not referenced by generateUrl — confirm whether it was meant
    // to be the fallback for unknown category codes.
    public static final String FIRST_URL="http://blog.51cto.com/artcommend";
    /** Network development. */
    public static final String NETWORK_URL="http://blog.51cto.com/artcommend/14";
    /** Development technology. */
    public static final String DEVELOP_URL="http://blog.51cto.com/artcommend/8";
    /** IT management. */
    public static final String ADMIN_URL="http://blog.51cto.com/artcommend/9";
    /** IT life. */
    public static final String LIFE_URL="http://blog.51cto.com/artcommend/12";

    /**
     * Returns the category's base URL followed by {@code "/"} and the page number.
     *
     * @param newsType    category code; for an unknown code the base is empty, so the result
     *                    is just {@code "/"} plus the page number
     * @param currentPage requested page; values below 1 are treated as 1
     * @return the generated URL string
     */
    public static String generateUrl(int newsType, int currentPage)
    {
        int page = currentPage > 0 ? currentPage : 1;
        return baseFor(newsType) + "/" + page;
    }

    /** Maps a category code to its base URL, or to the empty string when unknown. */
    private static String baseFor(int newsType)
    {
        switch (newsType)
        {
            case Constant.NEWS_TYPE_NETWORK:
                return NETWORK_URL;
            case Constant.NEWS_TYPE_DEVELOPMENT:
                return DEVELOP_URL;
            case Constant.NEWS_TYPE_IT_ADMIN:
                return ADMIN_URL;
            case Constant.NEWS_TYPE_IT_LIFE:
                return LIFE_URL;
            default:
                return "";
        }
    }

}

抓取文章 :


/**
 * Turns a 51CTO recommendation listing page into {@link NewsItem} objects.
 */
public class NewItem51CTO {
    /**
     * Downloads one listing page (development, network management, IT life, ...) and builds a
     * {@link NewsItem} for every element with class {@code r_li}.
     *
     * @param newsType    category code; used for the URL and stored on each item
     * @param currentPage page number used for the URL
     * @return the parsed items in document order
     * @throws CommonExecption           when {@code DataUtil.doGet} fails
     * @throws IndexOutOfBoundsException when an entry lacks the expected h4/div/p/span structure
     */
    public List<NewsItem> getNewsItems(int newsType, int currentPage) throws CommonExecption {
        String pageUrl = CTO_URLUtil.generateUrl(newsType, currentPage);
        // The page is decoded as GB2312.
        String html = DataUtil.doGet(pageUrl, "GB2312");

        List<NewsItem> result = new ArrayList<>();
        Elements entries = Jsoup.parse(html).getElementsByClass("r_li");
        for (Element entry : entries) {
            NewsItem item = new NewsItem();
            item.setNewsType(newsType);

            // The first child of the <h4> heading supplies both title text and link.
            Element anchor = entry.getElementsByTag("h4").get(0).child(0);
            String title = AppUtil.encoding(anchor.text(), "utf-8");
            String href = AppUtil.encoding(anchor.attr("href"), "utf-8");
            item.setLink(href);
            item.setTitle(title);

            // Date: first <span> of the first <p> inside the third <div> match.
            Element dateParagraph = entry.getElementsByTag("div").get(2).getElementsByTag("p").get(0);
            String dateText = dateParagraph.getElementsByTag("span").get(0).text();
            item.setDate(AppUtil.encoding(dateText, "utf-8"));

            // The picture is optional; a missing element surfaces as IndexOutOfBoundsException.
            try {
                Element picDiv = entry.getElementsByTag("div").get(1);
                Element picAnchor = picDiv.getElementsByTag("a").get(0);
                String imgLink = picAnchor.child(0).attr("src");
                item.setPicLink(AppUtil.encoding(imgLink, "utf-8"));
            } catch (IndexOutOfBoundsException e) {
                System.out.println("数组边界异常");
            }

            // Summary: first <p> inside the second <div> match.
            Element summary = entry.getElementsByTag("div").get(1).getElementsByTag("p").get(0);
            item.setContent(AppUtil.encoding(summary.text(), "utf-8"));
            result.add(item);
        }
        return result;
    }

}

四      ITeye


网络请求 :


/**
 * HTTP helpers that download ITeye pages and return their HTML as text.
 */
public class ITeyeDataUtil {

    /**
     * Downloads a page with {@code HttpURLConnection} and returns the body decoded as UTF-8.
     *
     * <p>All request headers are set before the response code is read. Reading the response
     * code opens the connection, and setting a header afterwards throws
     * {@code IllegalStateException}. The body is collected as raw bytes and decoded once, so
     * multi-byte characters that straddle a read boundary are not corrupted.
     *
     * @param urlStr      address to fetch
     * @param currentPage not used by this method
     * @param newType     not used by this method
     * @param useAgentNum 14, 15, 16 or 17 selects one of four User-Agent strings; any other
     *                    value leaves the User-Agent header unset
     * @return the decoded response body
     * @throws CommonExecption when the status is not HTTP 200 or any error occurs; the
     *                         underlying exception, if any, is attached as the cause
     */
    public static String doGet1(String urlStr,int currentPage,int newType,int useAgentNum) throws CommonExecption
    {
        try
        {
            URL url = new URL(urlStr);
            HttpURLConnection conn = (HttpURLConnection) url.openConnection();
            conn.setRequestMethod("GET");
            conn.setConnectTimeout(8000);
            conn.setDoInput(true);
            // setDoOutput(true) is deliberately not called: a GET carries no body, and on
            // Android enabling output switches the request to POST.

            // Rotating User-Agent strings.
            // NOTE(review): the first entry lacks a closing parenthesis and the last one starts
            // with a space; kept byte-identical — confirm whether that is intended.
            String[] useAgent = new String[]{"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0",
                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
                    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
                    " Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999"};
            if (useAgentNum >= 14 && useAgentNum <= 17)
            {
                conn.setRequestProperty("User-Agent", useAgent[useAgentNum - 14]);
            }

            int code = conn.getResponseCode();
            System.out.println("code-----" + code);
            if (code != HttpURLConnection.HTTP_OK)
            {
                throw new CommonExecption("访问网络失败!");
            }

            InputStream is = conn.getInputStream();
            try
            {
                return new String(readFully(is), "UTF-8");
            } finally
            {
                is.close();
            }
        } catch (CommonExecption e)
        {
            // Already the exception type callers expect; do not wrap it again.
            throw e;
        } catch (Exception e)
        {
            throw new CommonExecption("访问网络失败!", e);
        }
    }

    /**
     * Downloads a page with commons-httpclient, sending browser-like headers, and returns the
     * body with every line terminated by {@code "\n"}.
     *
     * @param urlStr address to fetch
     * @return the response text, line endings normalised to {@code "\n"}
     * @throws CommonExecption when the status code is not 200 or any error occurs; the
     *                         underlying exception, if any, is attached as the cause
     */
    public static String doGet(String urlStr) throws CommonExecption
    {
        GetMethod getMethod = null;
        try
        {
            HttpClient client = new HttpClient();
            getMethod = new GetMethod(urlStr);

            getMethod.addRequestHeader("User-Agent","Mozilla/5.0 (Windows NT 6.1; rv:43.0) Gecko/20100101 Firefox/43.0");
            getMethod.addRequestHeader("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
            getMethod.addRequestHeader("Accept-Language","zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3");
            getMethod.addRequestHeader("Host","www.iteye.com");
            getMethod.addRequestHeader("Connection","keep-alive");
            getMethod.addRequestHeader("Referer","http://www.iteye.com/news");

            client.getParams().setParameter("http.protocol.cookie-policy", CookiePolicy.BROWSER_COMPATIBILITY);
            int state = client.executeMethod(getMethod);

            System.out.println("state2-----" + state);
            if (state != 200)
            {
                throw new CommonExecption("访问网络失败");
            }

            // Decode explicitly as UTF-8 (the charset doGet1 uses) instead of the platform default.
            BufferedReader reader = new BufferedReader(
                    new InputStreamReader(getMethod.getResponseBodyAsStream(), "UTF-8"));
            try
            {
                StringBuilder sb = new StringBuilder();
                String line;
                while ((line = reader.readLine()) != null)
                {
                    sb.append(line).append("\n");
                }
                return sb.toString();
            } finally
            {
                reader.close();
            }
        } catch (CommonExecption e)
        {
            // Already the exception type callers expect; do not wrap it again.
            throw e;
        } catch (Exception e)
        {
            throw new CommonExecption("访问网络失败", e);
        } finally
        {
            // Return the connection to the client on success and on failure.
            if (getMethod != null)
            {
                getMethod.releaseConnection();
            }
        }
    }

    /** Reads the stream to its end and returns all bytes; the caller closes the stream. */
    private static byte[] readFully(InputStream is) throws java.io.IOException
    {
        java.io.ByteArrayOutputStream body = new java.io.ByteArrayOutputStream();
        byte[] buf = new byte[1024];
        int len;
        while ((len = is.read(buf)) != -1)
        {
            body.write(buf, 0, len);
        }
        return body.toByteArray();
    }
}

URL处理:

/**
 * Builds paged listing URLs for the ITeye sections.
 */
public class ITEYE_URLUtil {
    /** News section. */
    public static final String NEWS_URL="http://www.iteye.com/news?page=";
    /** Magazines (best-of) section. */
    public static final String MAGAZINES_URL="http://www.iteye.com/magazines?page=";
    /** Blogs section. */
    public static final String BLOG_URL="http://www.iteye.com/blogs?page=";
    /** Blog columns (subjects) section. */
    public static final String SUBJECTS_URL="http://www.iteye.com/blogs/subjects?page=";

    /**
     * Returns the section's URL prefix immediately followed by the page number.
     *
     * @param newsType    section code; for an unknown code the prefix is empty, so only the
     *                    page number is returned
     * @param currentPage requested page; values below 1 are treated as 1
     * @return the generated URL string
     */
    public static String generateUrl(int newsType, int currentPage)
    {
        int page = currentPage > 0 ? currentPage : 1;
        return prefixFor(newsType) + page;
    }

    /** Maps a section code to its URL prefix, or to the empty string when unknown. */
    private static String prefixFor(int newsType)
    {
        switch (newsType)
        {
            case Constant.NEWS_TYPE_NEW:
                return NEWS_URL;
            case Constant.NEWS_TYPE_MAGAZINES:
                return MAGAZINES_URL;
            case Constant.NEWS_TYPE_BLOGS:
                return BLOG_URL;
            case Constant.NEWS_TYPE_SUBJECTS:
                return SUBJECTS_URL;
            default:
                return "";
        }
    }

}

抓取文章 :


public class NewItemITeye {

    /** Site root prepended to the relative links read from news and magazine listings. */
    private static final String ITEYE_ROOT = "http://www.iteye.com";

    /**
     * Fetches one ITeye listing page and turns every element carrying the CSS
     * class "content" into a {@link NewsItem}.
     *
     * <p>For each such element the title, link, optional picture link, date and
     * content text are read from fixed positions inside the element; the
     * positions depend on {@code newsType}. Every extracted string is passed
     * through {@code AppUtil.encoding(..., "utf-8")} before it is stored.
     *
     * <p>If the span expected to hold the date is not present, the item is
     * still returned but its date is left unset.
     *
     * @param newsType    one of the {@code Constant.NEWS_TYPE_*} values
     * @param currentPage page number handed to the URL generator and the fetcher
     * @param useAgentNum not read by this method (see the review note in the body)
     * @return the parsed items in document order; empty when the page contains
     *         no element with class "content"
     * @throws CommonExecption if the page fetch fails, or if {@code newsType} is
     *         not one of the four supported types and the page contains at
     *         least one "content" element
     * @throws IndexOutOfBoundsException if a "content" element lacks the h3,
     *         anchor or div expected at the positions used below
     */
    public List<NewsItem> getNewsItems(int newsType, int currentPage, int useAgentNum)
            throws CommonExecption {
        String urlStr = ITEYE_URLUtil.generateUrl(newsType, currentPage);
        // NOTE(review): useAgentNum is never used; the last argument has always been the
        // literal 12. Confirm whether useAgentNum was meant to be passed here instead.
        String htmlStr = ITeyeDataUtil.doGet1(urlStr, newsType, currentPage, 12);

        List<NewsItem> newsItems = new ArrayList<NewsItem>();
        Document doc = Jsoup.parse(htmlStr);
        Elements units = doc.getElementsByClass("content");

        for (int i = 0; i < units.size(); i++) {
            Element unit = units.get(i);
            NewsItem newsItem = new NewsItem();
            newsItem.setNewsType(newsType);

            Element heading = unit.getElementsByTag("h3").get(0);
            // The DOM is not modified while parsing, so the div list is looked up once.
            Elements divs = unit.getElementsByTag("div");
            Element dateSpan;

            switch (newsType) {
                case Constant.NEWS_TYPE_NEW:
                    fillTitleAndLink(newsItem, heading, 1, true);
                    dateSpan = pickDateSpan(divs.get(3), 2, 1);
                    fillHeadingImage(newsItem, heading);
                    break;
                case Constant.NEWS_TYPE_BLOGS:
                    fillTitleAndLink(newsItem, heading, 1, false);
                    // NOTE(review): index 4 was originally guarded only by "size >= 3".
                    dateSpan = pickDateSpan(divs.get(4), 4, 1);
                    fillThumbnailImage(newsItem, divs);
                    break;
                case Constant.NEWS_TYPE_MAGAZINES:
                    fillTitleAndLink(newsItem, heading, 0, true);
                    dateSpan = pickDateSpan(divs.get(3), 2, 1);
                    fillHeadingImage(newsItem, heading);
                    break;
                case Constant.NEWS_TYPE_SUBJECTS:
                    fillTitleAndLink(newsItem, heading, 1, false);
                    // NOTE(review): these indices are the reverse of the other types (1 when
                    // there are >= 3 spans, otherwise 2). Index 2 cannot exist with fewer than
                    // 3 spans, so the fallback never yields a date. Kept as written; confirm
                    // against the real page markup.
                    dateSpan = pickDateSpan(divs.get(4), 1, 2);
                    fillThumbnailImage(newsItem, divs);
                    break;
                default:
                    // Previously an unknown type fell through with no date element selected
                    // and failed with a NullPointerException a few lines later.
                    throw new CommonExecption("不支持的文章类型: " + newsType);
            }

            if (dateSpan != null) {
                String date = AppUtil.encoding(dateSpan.text(), "utf-8");
                newsItem.setDate("发布于" + " " + date);
            }

            String content = AppUtil.encoding(divs.get(1).text(), "utf-8");
            newsItem.setContent(content);
            newsItems.add(newsItem);
        }
        return newsItems;
    }

    /**
     * Reads title and link from the anchor at {@code anchorIndex} inside the heading.
     * When {@code relativeLink} is true the href is prefixed with the site root.
     */
    private static void fillTitleAndLink(NewsItem item, Element heading, int anchorIndex,
                                         boolean relativeLink) throws CommonExecption {
        Element anchor = heading.getElementsByTag("a").get(anchorIndex);
        item.setTitle(AppUtil.encoding(anchor.text(), "utf-8"));
        String href = AppUtil.encoding(anchor.attr("href"), "utf-8");
        item.setLink(relativeLink ? ITEYE_ROOT + href : href);
    }

    /**
     * Chooses the span expected to hold the date: {@code richIndex} when the container
     * has at least three spans, {@code sparseIndex} otherwise. Returns null instead of
     * throwing when the chosen index is not present.
     */
    private static Element pickDateSpan(Element container, int richIndex, int sparseIndex) {
        Elements spans = container.getElementsByTag("span");
        int index = spans.size() >= 3 ? richIndex : sparseIndex;
        return index < spans.size() ? spans.get(index) : null;
    }

    /** Stores the "src" attribute of the heading's first child as picture link, if there is a child. */
    private static void fillHeadingImage(NewsItem item, Element heading) throws CommonExecption {
        Elements children = heading.children();
        if (children.isEmpty()) {
            System.out.println("没有图片");
            return;
        }
        item.setPicLink(AppUtil.encoding(children.get(0).attr("src"), "utf-8"));
    }

    /** Stores the "src" of the first img inside the first anchor of the third div, if all exist. */
    private static void fillThumbnailImage(NewsItem item, Elements divs) throws CommonExecption {
        Element img = null;
        if (divs.size() > 2) {
            Elements anchors = divs.get(2).getElementsByTag("a");
            if (!anchors.isEmpty()) {
                Elements imgs = anchors.get(0).getElementsByTag("img");
                if (!imgs.isEmpty()) {
                    img = imgs.get(0);
                }
            }
        }
        if (img == null) {
            System.out.println("没有图片");
            return;
        }
        item.setPicLink(AppUtil.encoding(img.attr("src"), "utf-8"));
    }




目录
相关文章
|
7月前
|
开发框架 .NET Java
程序员必知:发个原创小工具,下载autohome论坛帖子离线浏览
程序员必知:发个原创小工具,下载autohome论坛帖子离线浏览
32 0
|
存储 前端开发 JavaScript
从零开始搭建博客02----发表博客个人中心
由于shiro标签不是html的原生标签,所有我们需要先引入一个额外的依赖,shiro的标签库(thymeleaf的拓展标签)。
137 0
从零开始搭建博客02----发表博客个人中心
|
数据采集 前端开发 Python
python爬虫案例 ---- 爬取小说
python爬虫案例 ---- 爬取小说
566 0
|
数据采集 Python
python爬虫抓取富贵论坛
本人是个爬虫小萌新,看了网上教程学着做了一些,如果有什么问题请大佬们反馈,谢谢。
284 0
python爬虫抓取富贵论坛
|
Web App开发 iOS开发 数据格式
|
前端开发
【全网最全的博客美化系列教程】02.添加QQ交谈链接
全网最全的博客美化系列教程相关文章目录 【全网最全的博客美化系列教程】01.添加Github项目链接 【全网最全的博客美化系列教程】02.添加QQ交谈链接 【全网最全的博客美化系列教程】03.给博客添加一只萌萌哒的小仓鼠 【全网最全的博客美化系列教程】04.
1316 0
|
数据采集 Python 数据格式
学习了《python网络爬虫实战》第一个爬虫,爬取新浪新闻
请安装anaconda,其中附带的spyder方便运行完查看变量 1.进入cmd控制台, 输入 pip install BeautifulSoup4 pip install requests 2.
1096 0
|
数据采集 Python
python爬虫-抓取腾讯招聘信息页面
本爬虫主要使用了requests、json、bs4(BeautifulSoup)等相关模块,不完善之处请大家不吝赐教!:) 出处:https://github.
1182 0
|
数据采集 Python Windows
python爬虫-抓取百度贴吧帖子图片
本爬虫可以爬取百度贴吧帖子中的图片,代码有待完善,欢迎大家指教! 出处:https://github.com/jingsupo/python-spider/blob/master/day03/07tieba.
1052 0