为什么要这么做
我不知道空白大佬是怎么实现的,但是我想要的是把豆瓣图片全部爬到本地
但是问题来了 dou.img.lithub.cc
有防爬虫,然后我就想能不能通过获取到的豆瓣地址,去爬取电影的封面
事实告诉我豆瓣也有防爬虫,过一段时间还会有验证码预防机器人爬取 这就很尴尬了
怎么做
使用cookie 模仿登录豆瓣,获取到豆瓣的列表,然后通过豆瓣的id去获取电影的封面,但是这样还是会出现验证码.
如果使用 GitHub Action 定时获取,频繁爬取会触发风控。我的想法是在爬取图片前先检测本地目录是否已存在相同 ID 的图片,存在就跳过,这样能大大减少爬取的次数
代码
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
豆瓣封面爬取优化版(带本地文件检查)
- 自动跳过已存在的图片
- 减少不必要的下载请求
- 保留原有所有功能
"""
import os
import re
import time
import random
import json
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, quote
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
class OptimizedDoubanCrawler:
    """Douban movie-cover crawler.

    Skips covers that already exist on disk and throttles requests with
    randomized delays to reduce the chance of triggering Douban's
    anti-scraping checks.
    """

    def __init__(self, cookies=None):
        # cookies: optional dict of Douban cookies; falls back to random basics.
        self.session = self._create_session()
        self.cookies = cookies or self._generate_cookies()
        self.request_count = 0
        self.last_request_time = 0
        self.existing_files = set()

    def _generate_cookies(self):
        """Build minimal cookies with randomized tokens for anonymous requests."""
        alphabet = 'abcdefghijklmnopqrstuvwxyz1234567890'
        return {
            'bid': ''.join(random.choices(alphabet, k=11)),
            'll': '"118287"',
            'frodotk': ''.join(random.choices(alphabet, k=32)),
        }

    def _create_session(self):
        """Create a requests Session that retries transient failures with backoff."""
        session = requests.Session()
        retry = Retry(
            total=3,
            backoff_factor=1,
            status_forcelist=[403, 404, 500, 502, 503, 504, 429],
            allowed_methods=['GET', 'POST'],
            respect_retry_after_header=True,
        )
        adapter = HTTPAdapter(max_retries=retry)
        session.mount('http://', adapter)
        session.mount('https://', adapter)
        return session

    def _get_headers(self, referer=None):
        """Return browser-like headers with a randomly chosen User-Agent.

        A Douban referer is always set so the request looks like in-site
        navigation rather than a cold fetch.
        """
        user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
        ]
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
            'Connection': 'keep-alive',
            'DNT': '1',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': random.choice(user_agents),
        }
        headers['Referer'] = referer if referer else 'https://www.douban.com/'
        return headers

    def _request_delay(self):
        """Sleep a randomized interval since the previous request.

        Every 10th request takes a longer pause (10-20s vs 3-8s) to mimic a
        human browsing pattern.
        """
        self.request_count += 1
        elapsed = time.time() - self.last_request_time
        if self.request_count % 10 == 0:
            delay = random.uniform(10, 20)
        else:
            delay = random.uniform(3, 8)
        if elapsed < delay:
            time.sleep(delay - elapsed)
        self.last_request_time = time.time()

    def _scan_existing_files(self, save_dir):
        """Return the set of Douban IDs already downloaded into save_dir."""
        if not os.path.exists(save_dir):
            return set()
        pattern = re.compile(r'douban_(\d+)\.jpg$')
        existing_ids = set()
        for filename in os.listdir(save_dir):
            match = pattern.match(filename)
            if match:
                existing_ids.add(match.group(1))
        return existing_ids

    def _extract_cover_url(self, html):
        """Extract the cover image URL from a subject page, trying several layouts.

        Raises ValueError when no strategy yields a URL.
        """
        soup = BeautifulSoup(html, 'html.parser')
        # Strategy 1: the standard poster element; swap in the large-size variant.
        cover_elem = soup.find('img', {'rel': 'v:image'})
        if cover_elem:
            return cover_elem['src'].replace('s_ratio_poster', 'l_ratio_poster')
        # Strategy 2: JSON-LD structured data embedded in a script tag.
        script_data = soup.find('script', {'type': 'application/ld+json'})
        if script_data:
            try:
                data = json.loads(script_data.string)
                return data.get('image', '')
            except (TypeError, ValueError):
                # .string may be None, or the JSON may be malformed; fall through
                # to the next strategy instead of aborting. (Was a bare except.)
                pass
        # Strategy 3: Open Graph meta tag.
        meta_image = soup.find('meta', {'property': 'og:image'})
        if meta_image and meta_image.get('content'):
            return meta_image['content']
        # Strategy 4: inline background-image style on the poster container.
        bg_div = soup.find('div', {'class': 'pic'})
        if bg_div and bg_div.get('style'):
            match = re.search(r'url\((.*?)\)', bg_div['style'])
            if match:
                return match.group(1).strip('"\'')
        raise ValueError("无法从页面提取封面URL")

    def get_movie_cover(self, douban_url, save_dir='covers'):
        """Download one movie cover, skipping the fetch if it is already on disk.

        Returns the local file path on success, or None on any failure
        (errors are printed, never raised to the caller).
        """
        try:
            # Validate the URL before dereferencing the match: the original
            # called .group(1) unconditionally, which raised AttributeError on a
            # non-matching URL and made the "invalid URL" guard unreachable.
            id_match = re.search(r'subject/(\d+)', douban_url)
            if not id_match:
                raise ValueError("无效的豆瓣URL")
            movie_id = id_match.group(1)
            # Skip the network entirely when the cover was downloaded before.
            filename = f'douban_{movie_id}.jpg'
            save_path = os.path.join(save_dir, filename)
            if os.path.exists(save_path):
                print(f"⏩ 已存在,跳过: {filename}")
                return save_path
            self._request_delay()
            response = self.session.get(
                douban_url,
                headers=self._get_headers(),
                cookies=self.cookies,
                timeout=20,
            )
            if response.status_code == 403:
                raise Exception("403禁止访问,请检查Cookie或降低频率")
            response.raise_for_status()
            # Douban serves this text with a 200 when rate-limiting kicks in.
            if "检测到异常请求" in response.text:
                raise Exception("触发反爬机制,请更新Cookie")
            cover_url = self._extract_cover_url(response.text)
            if not cover_url.startswith(('http://', 'https://')):
                cover_url = 'https:' + cover_url  # protocol-relative URL
            # Second throttled request: the image itself, streamed to disk.
            self._request_delay()
            img_response = self.session.get(
                cover_url,
                headers=self._get_headers(referer=douban_url),
                cookies=self.cookies,
                stream=True,
                timeout=30,
            )
            img_response.raise_for_status()
            os.makedirs(save_dir, exist_ok=True)
            with open(save_path, 'wb') as f:
                for chunk in img_response.iter_content(8192):
                    f.write(chunk)
            print(f"✅ 成功下载: {filename}")
            return save_path
        except Exception as e:
            print(f"❌ 处理失败: {douban_url} | 错误: {str(e)}")
            return None
def batch_process(csv_path, url_column='url', output_dir='content/images/douban'):
    """Process a CSV of Douban URLs, downloading covers not already on disk.

    csv_path: path to the CSV file; url_column: name of the column holding
    Douban subject URLs; output_dir: directory for downloaded covers.
    Failed URLs are printed and appended to failed_urls.log for manual
    follow-up.
    """
    import pandas as pd
    # NOTE(review): hard-coded personal session cookies (dbcl2/frodotk_db
    # identify a logged-in account) — rotate or externalize before publishing.
    USER_COOKIES = {
        'll': '"118287"',
        'bid': 'jLsUuRC3yr0',
        'dbcl2': '"269547702:ZWJYaBTrUyM"',
        'ck': 'p4si',
        'frodotk_db': '"4075091eefd002b8c15f1b5ccf700828"',
        'frodotk': '"df131ed940ebccdc9d395c8bc2c36694"'
    }
    crawler = OptimizedDoubanCrawler(cookies=USER_COOKIES)
    df = pd.read_csv(csv_path)
    failed_urls = []
    # Pre-scan once so already-downloaded IDs are skipped with no HTTP request.
    print("🔍 扫描现有文件...")
    existing_ids = crawler._scan_existing_files(output_dir)
    print(f"发现 {len(existing_ids)} 个已存在的封面")
    for index, row in df.iterrows():
        url = row[url_column]
        if pd.notna(url):
            # Cheap check: extract the ID straight from the URL and compare
            # against the pre-scanned set before touching the crawler.
            match = re.search(r'subject/(\d+)', url)
            if match and match.group(1) in existing_ids:
                print(f"⏩ 跳过已存在: {match.group(1)}")
                continue
            result = crawler.get_movie_cover(url, output_dir)
            if not result:
                failed_urls.append(url)
    if failed_urls:
        print("\n以下URL需要手动处理:")
        for url in failed_urls:
            print(url)
        # Explicit utf-8: don't depend on the platform's locale encoding.
        with open('failed_urls.log', 'w', encoding='utf-8') as f:
            f.write("\n".join(failed_urls))
        print("\n失败记录已保存到 failed_urls.log")
if __name__ == '__main__':
    import argparse

    # CLI: the CSV path is required; column name and output dir are optional.
    cli = argparse.ArgumentParser(description='豆瓣电影封面爬取优化版')
    cli.add_argument('csv', help='CSV文件路径')
    cli.add_argument('--column', default='url', help='URL列名')
    cli.add_argument('--output', default='content/images/douban', help='输出目录')
    opts = cli.parse_args()

    print("🚀 开始处理豆瓣电影封面(优化版)...")
    batch_process(opts.csv, opts.column, opts.output)
    print("🎉 处理完成!")
运行
python download-douban-image.py assets/movie.csv --column url --output content/images/douban
结语
问题依然存在,这种方式只能降低一部分风险,剩下的少量图片仍需手动处理