"
今天回忆廖大的多线程的时候,看到下面有人写了个多线程的爬虫点进去看了下,分析的很仔细,写了接近200行代码吧
然后我就研究了一下这个网站,emmmm,selenium + PhantomJS不就直接搞定了嘛,然后
就写了段code:然后发现,哇,selenium不支持PhantomJS了,因为chrome和firefox自带了headless的访问,然后就去各个blog看,最后爬下了这个网站:
1 import unittest
2 import requests
3 import time
4 import re
5 from random import randint
6 from selenium import webdriver
7 from selenium.webdriver.chrome.options import Options
8 from selenium.webdriver.common.keys import Keys
9
class ooxx_spider(unittest.TestCase):
    """Scrape image URLs from a paginated comment list using headless Chrome.

    Written as a ``unittest.TestCase`` so that ``setUp``/``tearDown``
    manage the browser lifecycle around the single scraping "test".
    Headless Chrome is used because Selenium dropped PhantomJS support.
    """

    def setUp(self):
        # Start Chrome in headless mode; --disable-gpu is the classic
        # companion flag for headless on Windows.
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        # NOTE(review): hard-coded Windows driver path — adjust per machine.
        self.driver = webdriver.Chrome('E:/chromedriver.exe',
                                       chrome_options=chrome_options)

    def test_spider(self):
        # NOTE(review): the site's base URL was stripped from the original
        # paste — '' + 'page-1' is not a valid address; fill it in before
        # running.
        for i in range(1, 80):
            url = '' + 'page-' + str(i)
            self.driver.get(url)
            print(url)
            # XPath: every <img> under the comment list container.
            elems = self.driver.find_elements_by_xpath(
                '//*[@class="commentlist"]/li/div/div/div/p/img')
            for elem in elems:
                self.save_img(elem.get_attribute('src'))
            print('第{}页爬取成功'.format(i))

    def save_img(self, res):
        """Download the image at URL *res* into the picture/ directory."""
        # File extension taken from the last dot-separated URL segment.
        suffix = res.split('.')[-1]
        # Two random ints give only ~1e6 names — collisions silently
        # overwrite earlier downloads; a counter or uuid4 would be safer.
        destination = ('picture/' + str(randint(1, 1000))
                       + str(randint(1, 1000)) + '.' + suffix)
        r = requests.get(res)
        with open(destination, 'wb') as f:
            f.write(r.content)

    def tearDown(self):
        # Close the browser window opened in setUp.
        self.driver.close()
# Bug fix: the dunder underscores were stripped by the blog's markdown —
# `if name == 'main'` raises NameError; the entry guard must compare
# __name__ against '__main__'.
if __name__ == '__main__':
    unittest.main()
补上多线程的代码
核心代码:
def test_multiscraping(self):
    """Fan the scraping work out across a multiprocessing pool."""
    # Pool() defaults to os.cpu_count() workers; pass a number
    # (e.g. Pool(2)) to pin the pool size explicitly.
    p = Pool()
    # Submit five jobs (i = 0..4); the original comment claimed "4
    # processes" but range(5) queues five tasks — the pool size is set
    # above, independent of the number of submitted jobs.
    # NOTE(review): `scraping` must be defined elsewhere in the file;
    # it is not visible in this excerpt.
    for i in range(5):
        p.apply_async(scraping, args=(i,))
    p.close()   # no further tasks may be submitted
    p.join()    # block until all queued jobs have finished
cpu太垃圾了,晚上回去用同学的cpu测试一下(留下了穷人的眼泪)
不忘初心,方得始终
"
![image.png](https://ucc.alicdn.com/pic/developer-ecology/hnrk7epeorhrk_9c27b6a59f314dcd8cadd21f5fb06575.png)