def _first_or_none(results):
    """Return the first item of an XPath result list, or None if it is empty."""
    return results[0] if results else None


def parse_detail_urls(url, timeout=10):
    """Download one movie detail page and extract its fields into a dict.

    Args:
        url: Address of the detail page to fetch.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection cannot block the crawl indefinitely.

    Returns:
        A dict that always contains ``"title"``, ``"post"`` (poster image
        URL) and ``"download_url"`` (list of link hrefs).  ``"title"`` and
        ``"post"`` are ``None`` when the page lacks the matching element.
        The keys ``"year"``, ``"country"``, ``"category"``, ``"date"``,
        ``"score"``, ``"film length"``, ``"director"``, ``"story by"`` and
        ``"actors"`` are present only when the corresponding label is found
        in the page text.

    Exceptions raised by ``requests.get`` (for example on timeout) are not
    caught here and propagate to the caller.
    """
    # Label prefix -> output key for every field that holds a single value.
    # NOTE(review): labels are matched verbatim, including the whitespace
    # inside them -- confirm they match the text served by the site.
    single_value_fields = (
        ("◎年 代", "year"),
        ("◎产 地", "country"),
        ("◎类 别", "category"),
        ("◎上映日期", "date"),
        ("◎豆瓣评分", "score"),
        ("◎片 长", "film length"),
        ("◎导 演", "director"),
        ("◎编 剧", "story by"),
    )
    actors_label = "◎主 演"

    movie = {}
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    # Decode as GBK and drop undecodable bytes instead of failing on them.
    text = response.content.decode("gbk", "ignore")
    html = etree.HTML(text)

    # (1) Movie title.
    movie["title"] = _first_or_none(
        html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")
    )

    # Container element that holds the poster and all detail text.
    # Compare against None explicitly: the truth value of an lxml element
    # depends on whether it has children, not on whether it exists.
    zoom = _first_or_none(html.xpath("//div[@id='Zoom']"))

    # (2) Poster link.
    if zoom is not None:
        movie["post"] = _first_or_none(zoom.xpath(".//img/@src"))
        infos = zoom.xpath(".//text()")
    else:
        movie["post"] = None
        infos = []

    # (3) Individual detail fields.
    for index, info in enumerate(infos):
        if info.startswith(actors_label):
            # The cast spans several text nodes: the first name shares a
            # node with the label, the rest follow one per node until the
            # next "◎" label starts.
            actors = [parse_info(info, actors_label)]
            for raw in infos[index + 1:]:
                actor = raw.strip()
                if actor.startswith("◎"):
                    break
                actors.append(actor)
            movie["actors"] = actors
            continue

        for label, key in single_value_fields:
            if info.startswith(label):
                movie[key] = parse_info(info, label)
                break

    # (4) Download links.
    movie["download_url"] = html.xpath("//td[@bgcolor='#fdfddf']/a/@href")

    return movie
def spider(start_page=1, end_page=2):
    """Crawl listing pages and collect the details of every movie they link to.

    Args:
        start_page: Number of the first listing page to crawl.
        end_page: Page number at which to stop; this page itself is not
            crawled.  The defaults crawl page 1 only.

    Returns:
        The list of movie dicts produced by ``parse_detail_urls``, in crawl
        order.  The same list is also printed.
    """
    base_url = "https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html"

    # Accumulates the movies from all crawled pages.
    movies = []
    for page in range(start_page, end_page):
        detail_urls = get_detail_urls(base_url.format(page))
        movies.extend(parse_detail_urls(detail_url) for detail_url in detail_urls)

    print(movies)
    return movies