Crawling Dianying Tiantang (电影天堂) with Python and the lxml Library

Preface

Open any movie detail page on Dianying Tiantang and you can see at a glance that the page structure is clean and regular, so we will parse the page and extract its data with the lxml library.

(Although I usually reach for the requests toolchain for everything, a page structured this cleanly was too tempting not to parse with the basic lxml library…)

Approach

Looking at the page links, it is easy to see that every movie detail-page URL takes the form https://www.dytt8.net + relative link.

So we can parse the listing (parent) page, extract each detail-page link, append it to https://www.dytt8.net, then visit the detail page and extract the movie information.
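As a quick sketch of that join (the href below is a made-up example of the link pattern, not a real page), Python's standard urljoin gives the same result as plain string concatenation here:

from urllib.parse import urljoin

href = "/html/gndy/dyzz/20200101/12345.html"  # hypothetical relative link
print(urljoin("https://www.dytt8.net", href))
# -> https://www.dytt8.net/html/gndy/dyzz/20200101/12345.html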

With the approach settled, let's start coding.

Code Walkthrough

  • Import the required libraries, set the site domain as a global variable, and prepare the request headers for later use (a quick smoke test follows the code).
import requests
from lxml import etree

BASE_DOMAIN = "https://www.dytt8.net"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36 Edg/83.0.478.37",
    "Referer": "https://www.dytt8.net/html/gndy/dyzz/list_23_3.html"
}
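As an optional smoke test (assuming the site is reachable from your machine), you can confirm these headers get a normal response before writing any parsing code:

resp = requests.get(BASE_DOMAIN, headers=HEADERS)
print(resp.status_code)  # expect 200 if the request went through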
  • Fetch the movie detail-page URLs (a usage sketch follows the code).
def get_detail_urls(url):
    response = requests.get(url, headers=HEADERS)
    # The page is served as GBK; illegal characters may appear while
    # decoding, so "ignore" is passed to skip them instead of raising
    text = response.content.decode("gbk", "ignore")
    # Parse the listing page
    html = etree.HTML(text)
    # Extract the detail-page links
    detail_urls = html.xpath("//table[@class='tbspan']//a/@href")
    # Prepend the domain to each relative link (map returns a lazy iterator)
    detail_urls = map(lambda url: BASE_DOMAIN + url, detail_urls)
    return detail_urls
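Since map() is lazy, spider() below can simply loop over the result; wrap it in list() if you want to inspect the links directly. A minimal usage sketch, using the page-1 listing URL from the same pattern spider() formats later:

page1 = "https://www.dytt8.net/html/gndy/dyzz/list_23_1.html"
links = list(get_detail_urls(page1))
print(len(links))  # how many detail pages this listing exposes
print(links[:3])   # first few absolute detail-page URLs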
  • Parse the movie detail page (a worked example of the parse_info helper follows the code).
def parse_detail_urls(url):
    # Per-movie sub-dict of the master list, holding this movie's info
    movie = {}
    response = requests.get(url, headers=HEADERS)
    # Fetch the page body and decode it
    text = response.content.decode("gbk", "ignore")
    # Parse the detail page
    html = etree.HTML(text)
    # (1) Get the title
    title = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
    # Fill it into the dict
    movie["title"] = title

    # All the movie details live under this tag, so grab it once
    zoomE = html.xpath("//div[@id='Zoom']")[0]
    # (2) Get the poster link
    # Some movies return two links; the second is a screenshot of the film
    # and not very useful, so only the poster thumbnail link is taken here
    post = zoomE.xpath(".//img/@src")[0]
    # Fill it into the dict
    movie["post"] = post

    # Helper for parsing the detail-info lines
    def parse_info(info, rule):
        # Remove the field label (rule), then strip leading/trailing whitespace
        return info.replace(rule, "").strip()

    # (3) Get the detail fields
    infos = zoomE.xpath(".//text()")
    for index, info in enumerate(infos):
        # If the line starts with "◎年  代", remove that label, strip the
        # whitespace, and fill the dict; the fields below follow the same pattern
        if info.startswith("◎年  代"):
            info = parse_info(info, "◎年  代")
            movie["year"] = info
        elif info.startswith("◎产  地"):
            info = parse_info(info, "◎产  地")
            movie["country"] = info
        elif info.startswith("◎类  别"):
            info = parse_info(info, "◎类  别")
            movie["category"] = info
        elif info.startswith("◎上映日期"):
            info = parse_info(info, "◎上映日期")
            movie["date"] = info
        elif info.startswith("◎豆瓣评分"):
            info = parse_info(info, "◎豆瓣评分")
            movie["score"] = info
        elif info.startswith("◎片  长"):
            info = parse_info(info, "◎片  长")
            movie["film length"] = info
        elif info.startswith("◎导  演"):
            info = parse_info(info, "◎导  演")
            movie["director"] = info
        elif info.startswith("◎编  剧"):
            info = parse_info(info, "◎编  剧")
            movie["story by"] = info
        elif info.startswith("◎主  演"):
            # First line of the cast
            info = parse_info(info, "◎主  演")
            # New list to hold the cast; info is the first line
            actors = [info]
            for i in range(index + 1, len(infos)):
                actor = infos[i].strip()
                # Stop as soon as the next field (e.g. the synopsis) appears
                if actor.startswith("◎"):
                    break
                # Append the actor to the list
                actors.append(actor)
            movie["actors"] = actors

    # (4) Get the download link
    download_url = html.xpath("//td[@bgcolor='#fdfddf']/a/@href")
    # Fill it into the dict
    movie["download_url"] = download_url

    # Return the per-movie dict
    return movie
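To make the helper's behavior concrete, here is parse_info reproduced standalone on a made-up detail line (the label must match exactly, wide spaces included):

def parse_info(info, rule):
    return info.replace(rule, "").strip()

print(parse_info("◎年  代 2020", "◎年  代"))  # -> "2020"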
  • Put it all together and run (an optional save-to-disk sketch follows the code).
# 3. Build the page-1 listing URL, collect the detail-page URLs from it,
# then parse each detail page and extract the movie info
def spider():
    # Master list holding every per-movie dict
    movies = []
    base_url = "https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html"
    # Build the URL of listing page 1 (widen the range to crawl more pages)
    for x in range(1, 2):
        url = base_url.format(x)
        # Collect the detail-page URLs
        detail_urls = get_detail_urls(url)
        # Walk the detail-page URLs, parse each page, and extract the movie info
        for detail_url in detail_urls:
            movie = parse_detail_urls(detail_url)
            # Append the per-movie dict to the master list
            movies.append(movie)
    # Print the result
    print(movies)

# Run
if __name__ == '__main__':
    spider()
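If you would rather keep the results than print them, a small optional extension (standard library only; the movies.json filename is an arbitrary choice) writes the master list to disk:

import json

def save_movies(movies, path="movies.json"):
    # ensure_ascii=False keeps the Chinese titles readable in the file
    with open(path, "w", encoding="utf-8") as f:
        json.dump(movies, f, ensure_ascii=False, indent=2)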

Complete Code

import requests
from lxml import etree


BASE_DOMAIN = "https://www.dytt8.net"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36 Edg/83.0.478.37",
    "Referer": "https://www.dytt8.net/html/gndy/dyzz/list_23_3.html"
}


def get_detail_urls(url):
    response = requests.get(url, headers=HEADERS)
    text = response.content.decode("gbk", "ignore")
    html = etree.HTML(text)
    detail_urls = html.xpath("//table[@class='tbspan']//a/@href")
    detail_urls = map(lambda url: BASE_DOMAIN + url, detail_urls)
    return detail_urls


def parse_detail_urls(url):
    movie = {}
    response = requests.get(url, headers=HEADERS)
    text = response.content.decode("gbk", "ignore")
    html = etree.HTML(text)
    # (1) Get the title
    title = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
    movie["title"] = title

    # Tag gathering all the movie details
    zoomE = html.xpath("//div[@id='Zoom']")[0]
    # (2) Get the poster link
    post = zoomE.xpath(".//img/@src")[0]
    movie["post"] = post

    def parse_info(info, rule):
        return info.replace(rule, "").strip()

    # (3) Get the detail fields
    infos = zoomE.xpath(".//text()")
    for index, info in enumerate(infos):
        if info.startswith("◎年  代"):
            info = parse_info(info, "◎年  代")
            movie["year"] = info
        elif info.startswith("◎产  地"):
            info = parse_info(info, "◎产  地")
            movie["country"] = info
        elif info.startswith("◎类  别"):
            info = parse_info(info, "◎类  别")
            movie["category"] = info
        elif info.startswith("◎上映日期"):
            info = parse_info(info, "◎上映日期")
            movie["date"] = info
        elif info.startswith("◎豆瓣评分"):
            info = parse_info(info, "◎豆瓣评分")
            movie["score"] = info
        elif info.startswith("◎片  长"):
            info = parse_info(info, "◎片  长")
            movie["film length"] = info
        elif info.startswith("◎导  演"):
            info = parse_info(info, "◎导  演")
            movie["director"] = info
        elif info.startswith("◎编  剧"):
            info = parse_info(info, "◎编  剧")
            movie["story by"] = info
        elif info.startswith("◎主  演"):
            info = parse_info(info, "◎主  演")
            actors = [info]
            for i in range(index + 1, len(infos)):
                actor = infos[i].strip()
                if actor.startswith("◎"):
                    break
                actors.append(actor)
            movie["actors"] = actors

    # (4) Get the download link
    download_url = html.xpath("//td[@bgcolor='#fdfddf']/a/@href")
    movie["download_url"] = download_url

    return movie


def spider():
    # Master list
    movies = []
    base_url = "https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html"
    for x in range(1, 2):
        url = base_url.format(x)
        detail_urls = get_detail_urls(url)
        for detail_url in detail_urls:
            movie = parse_detail_urls(detail_url)
            movies.append(movie)
    print(movies)


if __name__ == '__main__':
    spider()