<dependency>
<groupId>com.microsoft.playwright</groupId>
<artifactId>playwright</artifactId>
<version>1.20.0</version>
</dependency>
官方文档
出现问题首要的是查看官方文档
简单的页面抓取
/**
 * Opens a new browser context configured for page crawling.
 *
 * <p>The context ignores HTTPS errors, keeps JavaScript enabled, uses a 1920x1080 viewport and
 * sends a fixed desktop Chrome user-agent string.
 *
 * @param browser the browser in which the context is created
 * @return the newly created context
 */
public static BrowserContext createContext(Browser browser) {
    Browser.NewContextOptions options = new Browser.NewContextOptions();
    options.setIgnoreHTTPSErrors(true);
    options.setJavaScriptEnabled(true);
    options.setViewportSize(1920, 1080);
    options.setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.130 Safari/537.36");
    return browser.newContext(options);
}
/**
 * Launches the browser engine identified by {@code name}.
 *
 * <p>Supported names are {@code "firefox"}, {@code "chromium"} and {@code "webkit"}. Chromium is
 * launched with a visible window (headless disabled) and a launch timeout of 120 * 1000; the
 * other engines use Playwright's default launch options.
 *
 * <p>Failures are reported to the caller instead of being swallowed: the previous behaviour of
 * printing the stack trace and returning {@code null} only postponed the failure to a
 * {@code NullPointerException} at the first use of the browser.
 *
 * @param name       engine name, one of {@code "firefox"}, {@code "chromium"}, {@code "webkit"}
 * @param playwright the Playwright instance used to launch the browser
 * @return the launched browser, never {@code null}
 * @throws IllegalArgumentException if {@code name} is {@code null} or not a supported engine
 */
public static Browser createBrowser(String name, Playwright playwright) {
    if (name == null) {
        throw new IllegalArgumentException("Browser name must not be null");
    }
    switch (name) {
        case "firefox":
            return playwright.firefox().launch();
        case "chromium":
            return playwright.chromium().launch(
                    new BrowserType.LaunchOptions().setHeadless(false).setTimeout(120 * 1000));
        case "webkit":
            return playwright.webkit().launch();
        default:
            throw new IllegalArgumentException("Unsupported browser: " + name);
    }
}
/**
 * Opens a new page in a freshly launched Chromium browser.
 *
 * @return the new page
 * @see #initPage(String)
 */
public static Page initPage() {
    return initPage("chromium");
}

/**
 * Creates a new Playwright instance, launches the named browser and opens a page in it.
 *
 * <p>The Playwright instance created here is not handed to the caller, so it stays alive after
 * the page, context and browser have been closed. Callers that need full cleanup should create
 * the Playwright instance themselves and use {@link #initPage(String, Playwright)}.
 *
 * @param browserName engine name understood by {@code createBrowser}
 * @return the new page
 */
public static Page initPage(String browserName) {
    Playwright playwright = Playwright.create();
    try {
        return initPage(browserName, playwright);
    } catch (RuntimeException e) {
        // No page is returned, so nobody else could ever release this instance.
        playwright.close();
        throw e;
    }
}

/**
 * Opens a new page in a Chromium browser launched from the given Playwright instance.
 *
 * @param playwright caller-owned Playwright instance; the caller is responsible for closing it
 * @return the new page
 */
public static Page initPage(Playwright playwright) {
    return initPage("chromium", playwright);
}

/**
 * Launches the named browser from the given Playwright instance and opens a page in it.
 *
 * @param browserName engine name understood by {@code createBrowser}
 * @param playwright  caller-owned Playwright instance; the caller is responsible for closing it
 * @return the new page
 * @throws IllegalStateException if no browser could be launched
 */
public static Page initPage(String browserName, Playwright playwright) {
    Browser browser = createBrowser(browserName, playwright);
    if (browser == null) {
        throw new IllegalStateException("Could not launch browser: " + browserName);
    }
    try {
        return createContext(browser).newPage();
    } catch (RuntimeException e) {
        // Do not leave a browser process behind when the context or page cannot be created.
        browser.close();
        throw e;
    }
}
抓取一个页面,获取内容
抓个使用了异步加载数据的页面,以 https://www.taobao.com/ 为例。
/**
 * Loads {@code url} in Chromium and returns the page's HTML content.
 *
 * <p>The Playwright instance and the browser are owned by this method and are always released:
 * the Playwright instance through try-with-resources, the browser (together with its contexts
 * and pages) in the {@code finally} block.
 *
 * @param url the address to load; it is normalised with {@code DomainUtils.formatUrl} first
 * @return the page content, or an empty string if loading the page failed
 */
public static String getContent(String url) {
    String trueUrl = DomainUtils.formatUrl(url);
    try (Playwright playwright = Playwright.create()) {
        Browser browser = createBrowser("chromium", playwright);
        try {
            Page page = createContext(browser).newPage();
            page.navigate(trueUrl);
            return page.content();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Closing the browser also closes every context and page that belongs to it.
            if (browser != null) {
                browser.close();
            }
        }
    }
    return "";
}
有时候,页面会加载很久,然后超时,所以我们需要考虑超时的问题。
// Navigate with an explicit timeout of 120 * 1000 (Playwright timeouts are given in milliseconds).
page.navigate(trueUrl, new Page.NavigateOptions().setTimeout(120 * 1000));
Page.navigate 官方地址:该函数的具体使用方法,请参考官方文档的写法。
拦截请求和响应
/**
 * Registers listeners on {@code page} that print every request and every response to standard
 * output.
 *
 * <p>A request is printed as its method and URL; a response is printed as the method and URL of
 * the request it answers, followed by the response status.
 *
 * @param page the page whose traffic is logged
 */
public static void handleOn(Page page) {
    page.onRequest(req -> {
        String line = "请求 " + req.method() + " " + req.url();
        System.out.println(line);
    });
    page.onResponse(resp -> {
        String line = "响应 " + resp.request().method() + " " + resp.request().url() + " " + resp.status();
        System.out.println(line);
    });
}
将该函数放到调用 page.navigate 之前,得到的部分日志结果如下:
现在对handleOn函数进行函数化,使之能够传入函数作为参数。
函数式编程入门
修改后的handleOn函数如下:
/**
 * Registers caller-supplied callbacks for the requests and responses of {@code page}.
 *
 * @param page             the page to observe
 * @param requestConsumer  invoked with every request reported by the page
 * @param responseConsumer invoked with every response reported by the page
 */
public static void handleOn(Page page, Consumer<Request> requestConsumer, Consumer<Response> responseConsumer) {
    // The consumers are looked up when an event fires, not when the listener is registered.
    page.onRequest(req -> requestConsumer.accept(req));
    page.onResponse(resp -> responseConsumer.accept(resp));
}
调用方式如下:
// Example call: both callbacks are empty lambdas, so nothing happens for requests or responses.
handleOn(page,
request -> {
}, response -> {
});
获取页面中所有的链接
- 使用selector的方式抓取所有结构化数据中的链接
- playwright支持的selector的方式:传送门
- [href]
- [src]
- 利用响应拦截器,获取请求链接和响应头里面的链接
- 对页面进行正则匹配解析
代码如下:
/**
 * Loads {@code url} in Chromium and collects every URL that can be found for the page.
 *
 * <p>URLs are gathered from four sources: the URLs of all requests and responses observed while
 * the page loads, URLs extracted from response header values, the {@code href} and {@code src}
 * properties of the matching elements, and URLs extracted from the page's HTML content.
 *
 * <p>The Playwright instance and the browser are owned by this method and are always released:
 * the Playwright instance through try-with-resources, the browser (together with its contexts
 * and pages) in the {@code finally} block.
 *
 * @param url the address to load; it is normalised with {@code DomainUtils.formatUrl} first
 * @return the collected URLs, or an empty set if loading the page failed
 */
public static Set<String> getAllUrls(String url) {
    String trueUrl = DomainUtils.formatUrl(url);
    try (Playwright playwright = Playwright.create()) {
        Browser browser = createBrowser("chromium", playwright);
        try {
            Page page = createContext(browser).newPage();
            Set<String> urls = new HashSet<>();
            // Listeners must be registered before navigating, otherwise early traffic is missed.
            handleOn(page,
                    request -> urls.add(request.url()),
                    response -> {
                        urls.add(response.request().url());
                        for (String headerValue : response.allHeaders().values()) {
                            urls.addAll(HtmlUtil.getUrls(headerValue));
                        }
                    });
            page.navigate(trueUrl, new Page.NavigateOptions().setTimeout(120 * 1000));
            List<String> hrefUrls = evalAndGetValue(page, "[href]", PlaywrightFunc.HREF.getFuncStr());
            List<String> srcUrls = evalAndGetValue(page, "[src]", PlaywrightFunc.SRC.getFuncStr());
            List<String> htmlUrls = HtmlUtil.getUrls(page.content());
            urls.addAll(hrefUrls);
            urls.addAll(htmlUrls);
            urls.addAll(srcUrls);
            return urls;
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Closing the browser also closes every context and page that belongs to it.
            if (browser != null) {
                browser.close();
            }
        }
    }
    return new HashSet<>();
}
这里使用到了 page.evalOnSelectorAll,我对此进行封装,封装的函数为evalAndGetValue,代码如下:
/**
 * Evaluates {@code functionStr} against all elements matching {@code selector} and returns the
 * non-blank strings contained in the result.
 *
 * <p>The script result is inspected element by element instead of being cast to
 * {@code List<String>}: entries that are {@code null}, blank or not strings (for example the
 * object a script returns for the {@code href} of an SVG element) are skipped rather than
 * causing a {@code ClassCastException}. A result that is not a list yields an empty list.
 *
 * @param page        the page to evaluate the script in
 * @param selector    selector choosing the elements passed to the script
 * @param functionStr JavaScript function receiving the matched elements
 * @return the non-blank string values produced by the script, never {@code null}
 */
public static List<String> evalAndGetValue(Page page, String selector, String functionStr) {
    Object raw = page.evalOnSelectorAll(selector, functionStr);
    List<String> res = new ArrayList<>();
    if (!(raw instanceof List)) {
        return res;
    }
    for (Object item : (List<?>) raw) {
        if (item instanceof String && StringUtils.hasText((String) item)) {
            res.add((String) item);
        }
    }
    return res;
}
其实这个函数不封装也没啥问题,这里封装了一遍主要是为了去除为空的内容。该函数的目的是对该selector注入一个函数表达式进行执行,我这边目前积累的函数如下:
/**
 * JavaScript snippets that are injected into a page through Playwright.
 *
 * <p>{@link #HREF}, {@link #INNER_TEXT}, {@link #INNER_HTML} and {@link #SRC} are functions that
 * receive an array of elements and return an array holding one property of each element.
 * {@link #DELETE_WEBDRIVER} is a statement that redefines {@code navigator.webdriver} so that it
 * yields {@code undefined}.
 *
 * <p>Note: the setters are kept for compatibility. Enum constants are shared by the whole
 * program, so calling a setter changes the value seen by every user of that constant.
 */
public enum PlaywrightFunc {
    HREF(1, collectProperty("href"), "获取链接"),
    INNER_TEXT(2, collectProperty("innerText"), "获取内容"),
    // The DOM property is spelled "innerHTML"; "innerHtml" does not exist and yields undefined.
    INNER_HTML(3, collectProperty("innerHTML"), "获取里面的代码"),
    SRC(4, collectProperty("src"), "获取链接"),
    DELETE_WEBDRIVER(5, "Object.defineProperties(navigator, {webdriver:{get:()=>undefined}});", "去除webdriver属性"),
    ;

    private String funcStr;
    private String info;
    private Integer code;

    PlaywrightFunc(Integer code, String funcStr, String info) {
        this.funcStr = funcStr;
        this.code = code;
        this.info = info;
    }

    /**
     * Builds the JavaScript function that collects {@code property} from every element of the
     * array it is called with.
     *
     * @param property name of the element property to read
     * @return the source text of the function
     */
    private static String collectProperty(String property) {
        return "function do123(as) {\n"
                + " var ans = [];\n"
                + " for (var i =0;i<as.length;i++){\n"
                + " ans.push(as[i]." + property + ");\n"
                + " }\n"
                + " return ans;\n"
                + "}";
    }

    /** @return the human-readable description of the snippet */
    public String getInfo() {
        return info;
    }

    /** @return the numeric code of the snippet */
    public Integer getCode() {
        return code;
    }

    /** @return the JavaScript source of the snippet */
    public String getFuncStr() {
        return funcStr;
    }

    /** @param info the new description; affects every user of this constant */
    public void setInfo(String info) {
        this.info = info;
    }

    /** @param code the new numeric code; affects every user of this constant */
    public void setCode(Integer code) {
        this.code = code;
    }

    /** @param funcStr the new JavaScript source; affects every user of this constant */
    public void setFuncStr(String funcStr) {
        this.funcStr = funcStr;
    }
}
再次对函数进行抽象
写到这里我发现每个地方都得写打开关闭浏览器的操作,这部分代码是重复代码,所以考虑使用函数式编程的方式,将这部分独立出来,代码如下:
/**
 * Runs {@code a} against a freshly opened page and releases all browser resources afterwards.
 *
 * <p>The method normalises {@code url}, creates a Playwright instance, opens a page and passes
 * the page together with the normalised URL to {@code a}. Exceptions thrown by {@code a} are
 * printed and not propagated. The page, its context and its browser are closed in the
 * {@code finally} block, and the Playwright instance is closed by try-with-resources, so it is
 * released even when opening the page fails.
 *
 * @param url the address to work on; it is normalised with {@code DomainUtils.formatUrl} first
 * @param a   the action to run; receives the page and the normalised URL
 */
public static void getAns(String url, BiConsumer<Page, String> a) {
    String trueUrl = DomainUtils.formatUrl(url);
    try (Playwright playwright = Playwright.create()) {
        Page page = initPage(playwright);
        try {
            a.accept(page, trueUrl);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            BrowserContext context = page.context();
            Browser browser = context.browser();
            try {
                page.close();
                context.close();
            } finally {
                // The browser must be closed even if closing the page or context failed.
                browser.close();
            }
        }
    }
}
不过这里目前没有设置返回值,所以需要借助外部的数据结构来存储结果。那么刚才的方法可以改写为如下形式。
// getAns returns nothing, so the result is collected in a set declared outside the callback.
Set<String> urls1 = new HashSet<>();
getAns(url, (Page page, String trueUrl) -> {
// URLs gathered during this run; copied into urls1 at the very end.
Set<String> tmp = new HashSet<>();
// Record the URL of every request and response, plus any URLs found in response header values.
handleOn(page,
request -> {
tmp.add(request.url());
}, response -> {
tmp.add(response.request().url());
Map<String, String> allHeaders = response.allHeaders();
for (Map.Entry<String, String> header : allHeaders.entrySet()) {
List<String> tmpUrls = HtmlUtil.getUrls(header.getValue());
tmp.addAll(tmpUrls);
}
});
page.navigate(trueUrl, new Page.NavigateOptions().setTimeout(120 * 1000));
// Pause after navigation before reading the page (presumably to let asynchronously loaded data arrive).
page.waitForTimeout(3*1000);
// Collect the href and src properties of all matching elements.
List<String> hrefUrls = evalAndGetValue(page, "[href]", PlaywrightFunc.HREF.getFuncStr());
List<String> srcUrls = evalAndGetValue(page, "[src]", PlaywrightFunc.SRC.getFuncStr());
// Extract URLs from the page's HTML content.
String content = page.content();
List<String> htmlUrls = HtmlUtil.getUrls(content);
// Print how many URLs each source contributed.
System.out.println("1 - 1 " + hrefUrls.size());
System.out.println("1 - 2 " + srcUrls.size());
System.out.println("1 - 3 " + htmlUrls.size());
System.out.println("1 - 4 " + tmp.size());
tmp.addAll(hrefUrls);
tmp.addAll(htmlUrls);
tmp.addAll(srcUrls);
// Hand the merged result to the outer set.
urls1.addAll(tmp);
});
获取页面中所有图片资源
获取图片资源这个问题本质上是上一个问题的子集。我们以上面抽象的方法来实现这么一个方法。
/**
 * Collects the URLs of all image resources that can be found for the page at {@code url}.
 *
 * <p>A URL is kept when {@code HtmlUtil.isImage} accepts it. Candidates come from the request
 * URLs of all observed responses, the {@code href} and {@code src} properties of the matching
 * elements, and the URLs extracted from the page's HTML content.
 *
 * @param url the address of the page to inspect
 * @return the image URLs found; empty if none were found
 */
public static Set<String> getAllImages(String url) {
    Set<String> images = new HashSet<>();
    getAns(url, (Page page, String trueUrl) -> {
        handleOn(page,
                request -> { },
                response -> {
                    String requestedUrl = response.request().url();
                    if (HtmlUtil.isImage(requestedUrl)) {
                        images.add(requestedUrl);
                    }
                });
        page.navigate(trueUrl, new Page.NavigateOptions().setTimeout(120 * 1000));
        page.waitForTimeout(3 * 1000);

        for (String href : evalAndGetValue(page, "[href]", PlaywrightFunc.HREF.getFuncStr())) {
            if (HtmlUtil.isImage(href)) {
                images.add(href);
            }
        }

        for (String src : evalAndGetValue(page, "[src]", PlaywrightFunc.SRC.getFuncStr())) {
            if (HtmlUtil.isImage(src)) {
                images.add(src);
            }
        }

        for (String embedded : HtmlUtil.getUrls(page.content())) {
            if (HtmlUtil.isImage(embedded)) {
                images.add(embedded);
            }
        }
    });
    return images;
}
滚动到底
对于淘宝这种页面,很有可能需要滚动,所以我们需要实现一个滚动方法。由于滚动有可能是局部组件的滚动,所以我们不能简单地使用窗口滚动的方式。
/**
* 滚轮向下滚动
* 需要调整滚动量 和 检测的频率范围
* 400 8
* 这个滚动下降率和检测频率范围应该是变化的
* 这里采取Log函数来做下降算法
*
* @param cnt
* @param page
* @param selector
*/
public static void scroll(AtomicInteger cnt, Page page, String selector) {
int lastCnt = cnt.get();
int flag = 0;
int cntFlag = 2;
for (int i = 0; i < 100; i++) {
page.evalOnSelector(selector, "(node,i)=>{
{\n" +
" node.scrollTo((i-1)*400,i*400)\n" +
" }}", i);
page.waitForTimeout(1000);
flag++;
if (flag * 1.0 >= MathUtils.getLogAN(2, cntFlag) * 8.0) {
int nowCnt = cnt.get();
if (lastCnt == nowCnt) {
break;
}
flag = 0;
lastCnt = nowCnt;
cntFlag++;
}
}
}
这里的实现原理是每次利用Log函数进行次数下降,然后判断滚动这么多次之后,是否产生了新的请求,如果没有产生新的请求,就代表滚动无效,有可能滚动到底了,如果有新的请求就说明滚动有效,就继续滚动。
使用的方式如下:
// Counter of requests observed on the page; scroll() reads it to detect new requests.
AtomicInteger cnt = new AtomicInteger(0);
// Then register a listener through handleOn that counts every request
handleOn(page, request -> {
cnt.incrementAndGet();
}, response -> {
});
// Finally, call the scroll method
... ...
scroll(cnt, page, "html");
有了这些基础的知识点之后,我们就具备利用playwright实现web漏洞扫描器的前奏步骤的基础能力了。