"
#
# Code effect reference: https://v.youku.com/v_show/id_XNjQwMDE0OTIyOA==.html
import os
import re
from urllib.parse import parse_qs, urlparse

import requests
from bs4 import BeautifulSoup


def get_url_content(url, timeout=10):
    """Fetch the source of the page at *url*.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout the request can block indefinitely.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise ``False``.

    Raises:
        requests.RequestException: Propagated unchanged from
            ``requests.get`` (connection failure, timeout, malformed URL).
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code == 200:
        return response.text
    # The script's main loop compares the result against False, so that
    # sentinel is kept rather than switching to None or an exception.
    return False


def parse_Web_Content(content):
    """Parse a listing page into one record per film.

    Args:
        content: HTML source of the page, as text.

    Returns:
        A list of dicts with the keys ``'fileName'``, ``'filmCast'``,
        ``'filmIntro'`` and ``'filmurl'``. The four per-field lists are
        combined positionally; if they differ in length the result is
        truncated to the shortest one instead of raising ``IndexError``.
    """
    soup = BeautifulSoup(content, 'html.parser')

    names = get_film_name(soup)
    casts = get_film_cast(soup)
    intros = get_film_introduction(soup)
    urls = get_film_url(soup)

    film = []
    for name, cast, intro, url in zip(names, casts, intros, urls):
        film.append({
            # The key is spelled 'fileName' (not 'filmName'); writeTofile
            # reads it under exactly this spelling.
            'fileName': name,
            'filmCast': cast,
            'filmIntro': intro,
            # NOTE(review): the empty prefix looks like a placeholder for a
            # site base URL - confirm before relying on these links.
            'filmurl': '' + url,
        })
    # Code effect reference: https://v.youku.com/v_show/id_XNjQwNjg1MjEyNA==.html
    return film


def get_film_name(Soup):
    """Collect the film titles found on a parsed page.

    Args:
        Soup: Parsed document exposing ``select`` (a BeautifulSoup object
            when called from ``parse_Web_Content``).

    Returns:
        A list holding, for each element matching the CSS selector
        ``.play_info``, the ``string`` of that element's ``a`` child, in
        document order. Empty when nothing matches.
    """
    return [entry.a.string for entry in Soup.select('.play_info')]


def get_film_cast(Soup):
    """Collect the cast text of every film on a parsed page.

    Args:
        Soup: Parsed document exposing ``find_all`` (a BeautifulSoup object
            when called from ``parse_Web_Content``).

    Returns:
        A list with the ``text`` of each ``<p class="space">`` element, in
        document order. Empty when nothing matches.
    """
    return [tag.text for tag in Soup.find_all('p', attrs={'class': 'space'})]


def get_film_introduction(Soup):
    """Collect the introduction text of every film on a parsed page.

    Args:
        Soup: Parsed document exposing ``find_all`` (a BeautifulSoup object
            when called from ``parse_Web_Content``).

    Returns:
        A list with the ``text`` of each ``<p class="content">`` element,
        in document order. Empty when nothing matches.
    """
    return [tag.text for tag in Soup.find_all('p', attrs={'class': 'content'})]


def get_film_url(Soup):
    """Collect the link target of every film on a parsed page.

    Args:
        Soup: Parsed document exposing ``select`` (a BeautifulSoup object
            when called from ``parse_Web_Content``).

    Returns:
        A list holding, for each element matching the CSS selector
        ``.play_info``, the ``href`` attribute of that element's ``a``
        child, in document order. Empty when nothing matches.
    """
    return [entry.a['href'] for entry in Soup.select('.play_info')]


def writeTofile(parsedWebcontent, path='film.txt'):
    """Append film records to a tab-separated text file.

    Args:
        parsedWebcontent: Iterable of dicts carrying the string values
            ``'fileName'``, ``'filmCast'``, ``'filmIntro'`` and
            ``'filmurl'``.
        path: File to append to; defaults to ``film.txt`` in the current
            working directory.

    Each record becomes one line: the four fields in the order above, every
    field followed by a tab, then a newline. The file is opened in append
    mode as UTF-8, so earlier contents are kept.
    """
    # The with-statement closes the file; no explicit close() is needed.
    with open(path, 'a', encoding='utf-8') as f:
        for record in parsedWebcontent:
            f.write(
                record['fileName'] + '\t'
                + record['filmCast'] + '\t'
                + record['filmIntro'] + '\t'
                + record['filmurl'] + '\t'
                + '\n'
            )


# Script entry: download listing pages 1.html to 3.html below the base URL and
# append every film found on them to the output file.
# NOTE(review): the base URL is an empty string here, so the requests cannot
# succeed as written - fill in the site address before running.
link = ''
for i in range(1, 4):
    url = link + str(i) + '.html'
    webContent = get_url_content(url)

    # get_url_content returns False for any non-200 response; test identity so
    # an empty page body is still treated as a successful download.
    if webContent is not False:
        Content = parse_Web_Content(webContent)
        writeTofile(Content)
"