from gevent import monkey
monkey.patch_socket()
import gevent
import gevent.pool
import requests
from bs4 import BeautifulSoup
import time
t1 = time.time()

# Collect the profile URL of every professor from the faculty index page.
web_links = 'http://www.cmu.edu/engineering/materials/people/faculty/index.html'
index_page = requests.get(web_links)
index_soup = BeautifulSoup(index_page.text, 'lxml')
bio_index = index_soup.find('div', class_='bioIndex')
rows = bio_index.find_all('div', class_='row')

# One relative href per name entry; prefix with the site root to get absolute links.
links = [
    'http://www.cmu.edu/engineering/materials/people/faculty/' + str(name_div.find('a')['href'])
    for row in rows
    for name_div in row.find('div', class_='names').find_all('div', class_='name')
]
# Parse a single professor's profile page.
def gettext(links):
    """Fetch one professor's profile page and print name, title, contacts and bio.

    ``links`` is a single profile URL (parameter name kept for backward
    compatibility with existing callers). Any network or parsing error is
    printed and swallowed so that one bad page does not abort the whole
    crawl (best-effort, matching the original behaviour).
    """
    try:
        pro = requests.get(links).text
        # BUG FIX: the original printed links[i] with a local i always 0,
        # i.e. only the first character of the URL — print the full URL.
        print('#########################################\n',
              '教授链接: ' + links + '\n',
              )
        soup = BeautifulSoup(pro, 'lxml')
        detail = soup.find('div', class_='content')
        name = detail.find('h1').text
        title = detail.find('h2')
        contacts = detail.find_all('div', class_='bioContactInfo')
        bio = detail.find('p')
        # BUG FIX: 'orginzation' typo in the output literal corrected.
        print('教授姓名: ' + str(name) + '\n',
              'title ' + title.text + '\n'
              'organization' + ' Carnegie Mellon University MECHANICAL ENGINEERING')
        # Skips the first contact block (original behaviour; presumably a
        # duplicate of the header info — TODO confirm against a live page).
        for contact in contacts[1:]:
            print(contact.text)
        print(bio.text)
    except Exception as e:
        # Best-effort: report the failure and let the other greenlets continue.
        print(e)
# Cap concurrency at 10 simultaneous requests.
pool = gevent.pool.Pool(10)
for i in links:
    pool.add(gevent.spawn(gettext, i))
pool.join()
# BUG FIX: the original formatted the loop variable `i` (the last URL)
# into the summary instead of the actual number of scholars.
print("共计{}名学者".format(len(links)))
t2 = time.time()
print('用时', t2 - t1)