xpath+jpath+re单字段测试
最近测试发现这三个每次用都要重复写的东西太多了,然后封装了一下,做了一个单字段测试的类和接口,方便以后测试使用,只需要把类和包导入然后就可以直接使用了,简单方便
import json, re from jsonpath import jsonpath import chardet from lxml import etree class Spiders(object): def jpath(self, html,regex): body = str(html) # 可能有乱码问题 if isinstance(body, str) or isinstance(body, str): body = json.loads(body) try: result = jsonpath(body, regex) except Exception as e: result = [] print (e) if not result: result = [] return result def rpath(self, html,regex): try: body = html detector = chardet.detect(str(body)) if detector.get("encoding") == "ascii": body = str(body).decode("unicode-escape") except Exception as e: body = str(html) result = re.findall(regex, body) return result def xpath(self, html,regex): parse_data = [""] if regex: regexs = regex.split("&") try: for i in range(len(regexs)): xml = etree.HTML(html) result = xml.xpath(regexs[i].strip()) if result: return result return [] except Exception as e: print (e) return parse_data
接口用的是post传参方式,根据困难度选择式调用requests或是phantomjs
def spider_xpath(request): result = {} # datas = [] spider = Spiders() try: data = request.body data = json.loads(data) url = data['url'] #url rule = data['rule'] #抓取规则 ru_type = data['type'] #xpath,re,jpath difficult = data['difficult'] #普通(a),特殊(b) if difficult == 'a': res = requests.get(url,headers={"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"}).text else: browser = webdriver.PhantomJS(executable_path=r"../phantomjs_w/bin/phantomjs.exe") browser.get(url) time.sleep(2) res = browser.page_source if ru_type == 'xpath': datas = spider.xpath(res,rule) elif ru_type == 'rpath': datas = spider.rpath(res,rule) else: datas = spider.jpath(res,rule) detector = chardet.detect(str(datas)) if detector.get("encoding") == "ascii": datas = str(datas).decode("unicode-escape") print datas result['page'] = datas result['message'] = '测试成功' result['code'] = 1 print '成功' except Exception as e: print e result['message'] = '测试失败' result['code'] = -1 return HttpResponse(json.dumps(result, cls=JSONEncoder), content_type="application/json")