Java爬虫第三篇:使用Jsoup 抓取文章
概述
本章讲解如何在使用 Selenium 模拟登录 CSDN 之后，将浏览器渲染出的页面内容交给 Jsoup 解析，从而抓取文章。
1. Jsoup maven配置
<!-- Jsoup HTML parser; coordinates from https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.11.3</version>
</dependency>
2. 测试
public class Test{ public static void main(String[] args) { String username = "xxxx@163.com"; String password = "***********"; String url = "https://passport.csdn.net/login"; System.setProperty("webdriver.chrome.driver", SeleniumUtil.CHROMEDRIVERPATH );// chromedriver localPath ChromeOptions chromeOptions = new ChromeOptions(); chromeOptions.addArguments("–no-sandbox"); //--start-maximized WebDriver driver = new ChromeDriver(chromeOptions); driver.get(url); SeleniumUtil.sleep(1000); WebElement mainSelectE = driver.findElement(By.cssSelector(".main-select")); List<WebElement> loginType = mainSelectE.findElements(By.tagName("a")); bgm:for (int i = 0; i < loginType.size(); i++) { WebElement aelement = loginType.get(i); if( aelement.getText().indexOf("帐号登录") != -1){ aelement.click(); System.out.println("切换到帐号登录...."); break bgm; } } System.out.println("继续操作...."); SeleniumUtil.sleep(1000); WebElement formE = driver.findElement(By.cssSelector("form")); WebElement nameE = formE.findElement(By.id("all")); WebElement passwordE = formE.findElement(By.id("password")); nameE.sendKeys(username); formE.findElement(By.id("password-number")).sendKeys(password); formE.findElement(By.cssSelector("button")).click(); driver.get("https://blog.csdn.net/forezp/column/info/15197/3"); SeleniumUtil.sleep(1000); //Selenium获取网页内容 WebElement body = driver.findElement(By.cssSelector("body")); //转化为Jsoup文档处理 Document doc = Jsoup.parse( body.getAttribute("outerHTML") ); Elements es = doc.select("ul.column_article_list>li"); for ( Element e : es ){ String absHref = e.select("a").attr("abs:href"); // "http://www.open-open.com/" String titile = e.select("h2.title").text(); String desc = e.select("div.column_article_desc").html(); System.out.println( "absHref:" + absHref ); Document detailDoc = Jsoup.connect(absHref).get(); Elements detaile = detailDoc.select("div#content_views"); String content = detaile.html(); System.out.println( "titile:" + titile ); System.out.println( "desc:" + desc ); 
System.out.println( "content:" + content ); System.out.println( ); } } }
3. 结果