一、效果预览
二、基本步骤
1. 发送请求: 先确定url地址, 然后对其发送请求
2. 获取数据, 获取服务器返回的响应数据内容
3. 解析数据, 提取我们想要的内容(本文采用CSS选择器获取数据)
4. 保存数据
5. 多页数据爬取
三、完整代码
import requests
import parsel
import csv
import time
import pandas as pd
# Output locations and the CSV/Excel column order.
CSV_PATH = '../豆瓣Top250.csv'
EXCEL_PATH = '豆瓣Top250.xlsx'
FIELDNAMES = [
    '标题',
    '导演',
    '演员',
    '电影年份',
    '拍摄国家',
    '电影类型',
    '电影评分',
    '评论人数',
    '电影简介',
]

# Request headers are constant, so build them once instead of once per page.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'
}


def split_credits(line):
    """Split a credits line into ``(director, actors)``.

    The line is expected to look like ``导演: <director> 主演: <actors> /...``.
    Non-breaking spaces are treated as ordinary spaces, the ``导演:`` /
    ``主演:`` labels and the trailing ``/...`` marker are removed, and both
    values are stripped. When there is no ``主演:`` part (or it is empty) the
    actors value is the string ``'None'``.
    """
    text = line.replace('\xa0', ' ').strip()
    # Partition on the label itself rather than on a whitespace run, so the
    # split does not depend on which kind of space separates the two parts.
    director_part, marker, actor_part = text.partition('主演:')
    director = director_part.replace('导演:', '').strip()
    actors = actor_part.replace('/...', '').strip() if marker else ''
    return director, actors or 'None'


def split_details(line):
    """Split a ``year / country / genre`` line into a 3-tuple of strings.

    Only the first three ``/``-separated fields are used; missing fields are
    returned as empty strings instead of raising ``IndexError``.
    """
    text = line.replace('\xa0', ' ')
    parts = [part.strip() for part in text.split('/')]
    parts += [''] * (3 - len(parts))  # pad short input up to three fields
    return tuple(parts[:3])


def parse_movie(li):
    """Build one output row (a dict keyed by ``FIELDNAMES``) from a list item.

    ``li`` is a parsel selector for one entry of the ``.grid_view`` list.
    Every field is extracted for every item, so a row never carries values
    left over from the previously parsed movie.
    """
    title = li.css('.info .hd span.title:nth-child(1)::text').get()
    info_lines = li.css('.bd p:nth-child(1)::text').getall()
    credits_line = info_lines[0] if info_lines else ''
    details_line = info_lines[1] if len(info_lines) > 1 else ''

    director, actors = split_credits(credits_line)
    movie_year, movie_country, movie_type = split_details(details_line)

    # .get() returns None when nothing matches; fall back to '' before
    # calling .replace() so a missing vote count does not raise.
    votes_text = li.css('.star span:nth-child(4)::text').get() or ''

    return {
        '标题': title,
        '导演': director,
        '演员': actors,
        '电影年份': movie_year,
        '拍摄国家': movie_country,
        '电影类型': movie_type,
        '电影评分': li.css('.rating_num::text').get(),
        '评论人数': votes_text.replace('人评价', ''),
        '电影简介': li.css('.inq::text').get(),
    }


def main():
    """Scrape the ten Top 250 pages, append rows to the CSV, export to Excel."""
    data_list = []  # rows collected during this run, used for the Excel export

    # The context manager closes the CSV even if a page fails mid-run.
    with open(CSV_PATH, mode='a', encoding='utf-8', newline='') as f:
        csv_writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
        # The file is opened in append mode: write the header only when the
        # file is empty, so re-running the script does not insert extra
        # header rows between data rows.
        if f.tell() == 0:
            csv_writer.writeheader()

        for num, page in enumerate(range(0, 250, 25), start=1):
            print(f'正在爬取第{num}页数据内容')
            time.sleep(1)  # pause between requests
            url = f'https://movie.douban.com/top250?start={page}&filter='
            try:
                response = requests.get(url=url, headers=HEADERS, timeout=10)
                response.raise_for_status()
            except requests.RequestException as exc:
                # Report the failed page and keep going rather than hanging
                # forever or silently writing nothing for it.
                print(f'第{num}页请求失败: {exc}')
                continue

            selector = parsel.Selector(response.text)
            for li in selector.css('.grid_view li'):
                dit = parse_movie(li)
                csv_writer.writerow(dit)
                data_list.append(dit)

    # Save the rows collected in this run to an Excel file.
    df = pd.DataFrame(data_list)
    df.to_excel(EXCEL_PATH, index=False)
    print('数据已保存至excel中')


if __name__ == '__main__':
    main()