http://www.baidu.com/s?wd=python
wd后面的参数就是在百度搜索引擎里面输入的关键字。
分析页面:
获取每一页的链接。
代码:
root@kali:~/py# more table.py import urllib import urllib2 from lxml import etree #输入python关键字进行查询 text = "python" starurl = "http://www.baidu.com/s?wd=%s" % text html = urllib.urlopen(starurl).read() PageUrlList = [] page = etree.HTML(html.lower().decode('utf-8')) #crapy pageurl list #解析出id为page的所有div下的a标签的href属性,如果要显示a标签的内容则把“@href”替换成“text()”即可 hrefs = page.xpath("//div[@id='page']//a/@href") for href in hrefs: hrefurl = "http://www.baidu.com"+href PageUrlList.append(hrefurl) print "list:" print PageUrlList |
运行结果
root@kali:~/py# python table.py
list:
['http://www.baidu.com/s?wd=python&pn=10&oq=python&ie=utf-8&usm=4&rsv_pq=897a8df20000da9b&rsv_t=075dbfoz2dplnlb7ts%2boyopf06je%2bi1j1whmgcrvjurdkieecwvsl%2bhdvum', 'http://www.baidu.com/s?wd=python&pn=20&oq=python&ie=utf-8&usm=4&rsv_pq=897a8df20000da9b&rsv_t=075dbfoz2dplnlb7ts%2boyopf06je%2bi1j1whmgcrvjurdkieecwvsl%2bhdvum', 'http://www.baidu.com/s?wd=python&pn=30&oq=python&ie=utf-8&usm=4&rsv_pq=897a8df20000da9b&rsv_t=075dbfoz2dplnlb7ts%2boyopf06je%2bi1j1whmgcrvjurdkieecwvsl%2bhdvum', 'http://www.baidu.com/s?wd=python&pn=40&oq=python&ie=utf-8&usm=4&rsv_pq=897a8df20000da9b&rsv_t=075dbfoz2dplnlb7ts%2boyopf06je%2bi1j1whmgcrvjurdkieecwvsl%2bhdvum', 'http://www.baidu.com/s?wd=python&pn=50&oq=python&ie=utf-8&usm=4&rsv_pq=897a8df20000da9b&rsv_t=075dbfoz2dplnlb7ts%2boyopf06je%2bi1j1whmgcrvjurdkieecwvsl%2bhdvum', 'http://www.baidu.com/s?wd=python&pn=60&oq=python&ie=utf-8&usm=4&rsv_pq=897a8df20000da9b&rsv_t=075dbfoz2dplnlb7ts%2boyopf06je%2bi1j1whmgcrvjurdkieecwvsl%2bhdvum', 'http://www.baidu.com/s?wd=python&pn=70&oq=python&ie=utf-8&usm=4&rsv_pq=897a8df20000da9b&rsv_t=075dbfoz2dplnlb7ts%2boyopf06je%2bi1j1whmgcrvjurdkieecwvsl%2bhdvum', 'http://www.baidu.com/s?wd=python&pn=80&oq=python&ie=utf-8&usm=4&rsv_pq=897a8df20000da9b&rsv_t=075dbfoz2dplnlb7ts%2boyopf06je%2bi1j1whmgcrvjurdkieecwvsl%2bhdvum', 'http://www.baidu.com/s?wd=python&pn=90&oq=python&ie=utf-8&usm=4&rsv_pq=897a8df20000da9b&rsv_t=075dbfoz2dplnlb7ts%2boyopf06je%2bi1j1whmgcrvjurdkieecwvsl%2bhdvum', 'http://www.baidu.com/s?wd=python&pn=10&oq=python&ie=utf-8&usm=4&rsv_pq=897a8df20000da9b&rsv_t=075dbfoz2dplnlb7ts%2boyopf06je%2bi1j1whmgcrvjurdkieecwvsl%2bhdvum&rsv_page=1']