Python Web Scraping, Part 2

I honestly didn't expect there to be a Part 2, but scraping is just too much fun.
Without further ado, let's get started ~

A quick review ~

Scrape the Douban Top 250, printing each movie's rank, title, and rating:

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}

for page in range(0, 250, 25):
    url = f'https://movie.douban.com/top250?start={page}'
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    movies = soup.find_all('div', class_='info')
    for idx, movie in enumerate(movies, start=page+1):
        title = movie.find('span', class_='title').text
        rating = movie.find('span', class_='rating_num').text
        print(f'排名: {idx}\t电影名字: {title}\t评分: {rating}')
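One caveat before moving on: if Douban rejects a request (stale User-Agent, rate limiting), response.text is an error page, find() returns None, and the .text access blows up. A minimal guard, sketched as a standalone helper (the fetch name and the 10-second timeout are my own choices, not part of the original script):

import requests

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}

def fetch(url):
    # a timeout keeps one hung connection from stalling the whole run
    response = requests.get(url, headers=headers, timeout=10)
    # raise on 4xx/5xx instead of silently parsing an error page
    response.raise_for_status()
    return response.text

html = fetch('https://movie.douban.com/top250?start=0')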

Next, let's write the results into a spreadsheet instead of printing them. Once the script below finishes, a douban_top250.xlsx file appears in the directory you ran it from (the save uses a relative path, so it only lands on the desktop if that's where you run it), and you can open it to see the data:

import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}

# create a new Excel workbook
wb = Workbook()
# grab the default worksheet
ws = wb.active

for page in range(0, 250, 25):
    url = f'https://movie.douban.com/top250?start={page}'
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    movies = soup.find_all('div', class_='info')
    for idx, movie in enumerate(movies, start=page+1):
        title = movie.find('span', class_='title').text
        rating = movie.find('span', class_='rating_num').text
        ws.cell(row=idx, column=1, value=idx)     # write the rank
        ws.cell(row=idx, column=2, value=title)   # write the movie title
        ws.cell(row=idx, column=3, value=rating)  # write the rating

# save the Excel file
wb.save('douban_top250.xlsx')
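One detail: rating_num is scraped as a string like '9.7', so Excel stores the rating column as text. Converting it before writing makes the column sort as numbers; this one-line tweak is my own, not part of the original:

from openpyxl import Workbook

wb = Workbook()
ws = wb.active
rating = '9.7'  # example value, exactly as scraped (a string)
ws.cell(row=1, column=3, value=float(rating))  # float() makes Excel treat it as a number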

The sheet is still missing column headers. ws['A1'] = '排名' writes the rank header into cell A1, ws['B1'] = '电影名字' writes the movie-title header into B1, and ws['C1'] = '评分' writes the rating header into C1.

import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}

# create a new Excel workbook
wb = Workbook()
# grab the default worksheet
ws = wb.active

# add the column headers
ws['A1'] = '排名'
ws['B1'] = '电影名字'
ws['C1'] = '评分'

for page in range(0, 250, 25):
    url = f'https://movie.douban.com/top250?start={page}'
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    movies = soup.find_all('div', class_='info')
    # start=page+2 shifts every row down by one, leaving row 1 for the headers
    for idx, movie in enumerate(movies, start=page+2):
        title = movie.find('span', class_='title').text
        rating = movie.find('span', class_='rating_num').text
        ws.cell(row=idx, column=1, value=idx-1)   # write the rank (undo the header offset)
        ws.cell(row=idx, column=2, value=title)   # write the movie title
        ws.cell(row=idx, column=3, value=rating)  # write the rating

# save the Excel file
wb.save('douban_top250.xlsx')
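As a side note, openpyxl's ws.append() always writes to the next free row, which would remove the start=page+2 / idx-1 bookkeeping entirely. A minimal sketch of that alternative (the example row values are placeholders of mine):

from openpyxl import Workbook

wb = Workbook()
ws = wb.active
ws.append(['排名', '电影名字', '评分'])  # header row lands in row 1
ws.append([1, '示例电影', '9.7'])        # each append() lands on the next empty row
wb.save('douban_top250.xlsx')

Inside the scraping loop, the three ws.cell() calls would collapse into a single ws.append([rank, title, rating]).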

Picking up from last time: we can already dump the scraper's text data into an Excel sheet, but text isn't the only thing we need; we also want some image data.

Scraping the Douban movie posters

import requests
from bs4 import BeautifulSoup

url = 'https://movie.douban.com/top250'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

def get_movies():
    movies = []
    for page in range(10):  # the Top 250 spans 10 pages, 25 movies each
        params = {
            'start': page * 25
        }
        response = requests.get(url, headers=headers, params=params)
        soup = BeautifulSoup(response.text, 'html.parser')
        items = soup.find_all('div', class_='item')
        for item in items:
            movie = {}
            movie['rank'] = item.find('em').text
            movie['title'] = item.find('span', class_='title').text
            movie['score'] = item.find('span', class_='rating_num').text
            movie['image'] = item.find('img')['src']
            movies.append(movie)
    return movies

movies = get_movies()
for movie in movies:
    print(movie['rank'], movie['title'], movie['score'])
    response = requests.get(movie['image'], headers=headers)  # send the UA here too, or the image CDN may refuse
    with open(f"{movie['rank']}_{movie['title']}.jpg", 'wb') as f:
        f.write(response.content)

We added a movie['image'] field that stores the URL of each movie's poster. Then we use requests.get() to fetch the image data and open() in binary-write mode to save it to a local file, named after the movie's rank and title.
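Downloading 250 posters in a tight loop is also where runs tend to break: one slow or blocked response and the whole script dies. A defensive download helper, as a sketch (the retry count, delay, and timeout are arbitrary choices of mine, not from the original):

import time
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

def download(url, path, retries=3):
    # try a few times before giving up on a single poster
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            with open(path, 'wb') as f:
                f.write(response.content)
            return True
        except requests.RequestException:
            time.sleep(1)  # brief pause before retrying
    return False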

Setting a target path

Sometimes we want the scraped images and files stored under a specific directory. Below is the modified code, which creates the target directory before downloading into it:

import os
import requests
from bs4 import BeautifulSoup

url = 'https://movie.douban.com/top250'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# target directory for the posters; point this anywhere you like,
# e.g. save_path = 'C:/Users/ASUS/Desktop/img'
save_path = os.path.expanduser('~/Desktop/img')

def get_movies():
    movies = []
    for page in range(10):  # the Top 250 spans 10 pages, 25 movies each
        params = {
            'start': page * 25
        }
        response = requests.get(url, headers=headers, params=params)
        soup = BeautifulSoup(response.text, 'html.parser')
        items = soup.find_all('div', class_='item')
        for item in items:
            movie = {}
            movie['rank'] = item.find('em').text
            movie['title'] = item.find('span', class_='title').text
            movie['score'] = item.find('span', class_='rating_num').text
            movie['image'] = item.find('img')['src']
            movies.append(movie)
    return movies

# create the directory (this must run after save_path has its final value)
os.makedirs(save_path, exist_ok=True)

movies = get_movies()
for movie in movies:
    print(movie['rank'], movie['title'], movie['score'])
    response = requests.get(movie['image'], headers=headers)
    file_path = os.path.join(save_path, f"{movie['rank']}_{movie['title']}.jpg")
    with open(file_path, 'wb') as f:
        f.write(response.content)

OK, that wraps up the code. The end ~