Automatically Updating the Douban Movie-Watching Page with GitHub Actions
In the previous post, Fetching the Douban List and Movie Covers with GitHub Actions, we already implemented fetching the Douban list and the movie covers with GitHub Actions. That workflow is still incomplete, though: we want the Douban movie-watching page to update itself automatically, and on a regular schedule.
Why do this
I want the Douban movie-watching page to be refreshed regularly so that its content stays current. Automating the process also saves the time and effort of updating it by hand.
How to do it
We can use the scheduled-trigger feature of GitHub Actions to update the Douban movie-watching page on a regular basis, build the static pages with the hugo command, and push the result to GitHub Pages. The concrete steps are:
- Create a new GitHub Actions workflow file, e.g. `.github/workflows/update-douban-page.yml`.
- In the workflow file, define a scheduled trigger, e.g. one that fires once a day at midnight UTC (see the sketch after this list).
- In the workflow, run the Python scripts from the previous post to fetch the Douban list and the movie covers.
- Build the static pages with the hugo command.
- Push the generated pages to GitHub Pages.
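One detail worth keeping in mind before writing the schedule: GitHub Actions evaluates cron expressions in UTC, so `0 0 * * *` fires at 00:00 UTC, not at midnight Beijing time. A quick sketch to double-check what that means locally (the date below is arbitrary; only the time-zone conversion matters):

```python
from datetime import datetime, timedelta, timezone

# '0 0 * * *' fires at 00:00 UTC; convert one firing time to Beijing time (UTC+8).
fire_utc = datetime(2024, 1, 1, 0, 0, tzinfo=timezone.utc)  # arbitrary example date
beijing = timezone(timedelta(hours=8))
print(fire_utc.astimezone(beijing))  # 2024-01-01 08:00:00+08:00
```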
Code
The workflow file looks like this:
```yaml
name: Update Douban Data and Deploy Blog

on:
  schedule:
    - cron: '0 0 * * *'
  workflow_dispatch:
  push:
    branches:
      - main

permissions:
  contents: write

jobs:
  build-and-deploy:
    runs-on: ubuntu-latest
    steps:
      # 1. [Fix] Check out the code, including the submodule (the theme)
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          submodules: 'recursive' # make sure the theme is downloaded

      # 2. Fetch the Douban data
      - name: Sync Douban Data
        uses: lizheming/doumark-action@master
        with:
          id: imsunpw
          type: movie
          format: csv
          dir: ./data

      # 3. [Fixed check step] Check whether movie.csv has changed (including when it is a new file)
      - name: Check for CSV changes
        id: check-csv
        run: |
          # git status --porcelain lists every change:
          # '??' marks a new file, ' M' marks a modified file.
          # We grep the output for our target file.
          if git status --porcelain | grep -q 'data/movie.csv'; then
            echo "csv_changed=true" >> $GITHUB_OUTPUT
            echo "Douban CSV data is new or has changed."
          else
            echo "csv_changed=false" >> $GITHUB_OUTPUT
            echo "Douban CSV data has not changed."
          fi

      # 4. (Conditional) If the CSV changed, convert it to JSON
      - name: Convert CSV to JSON
        if: steps.check-csv.outputs.csv_changed == 'true'
        run: python csv2json.py

      # 5. (Conditional) If the CSV changed, set up Python and download the images
      - name: Set up Python
        if: steps.check-csv.outputs.csv_changed == 'true'
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      - name: Install dependencies
        if: steps.check-csv.outputs.csv_changed == 'true'
        run: pip install requests beautifulsoup4 pandas urllib3

      - name: Download movie images
        if: steps.check-csv.outputs.csv_changed == 'true'
        run: |
          python download-douban-image.py data/movie.csv --column url --output content/images/douban

      # 6. (Conditional) If the CSV changed, commit all the changes
      - name: Commit and push changes
        if: steps.check-csv.outputs.csv_changed == 'true'
        uses: stefanzweifel/git-auto-commit-action@v4
        with:
          commit_message: "chore: auto-update douban data and images"
          # Using a PAT for the commit is strongly recommended
          token: ${{ secrets.PERSONAL_TOKEN }}
          file_pattern: 'data/movie.csv data/movies.json content/images/douban/*'

      # 7. [New] Remove the original CSV file so that Hugo does not try to parse it
      - name: Remove original CSV file before build
        run: rm data/movie.csv

      # 8. Set up Hugo and deploy
      - name: Setup Hugo
        uses: peaceiris/actions-hugo@v2
        with:
          hugo-version: 'latest'
          extended: true

      - name: Build Hugo site
        run: hugo --minify

      - name: Deploy to GitHub Pages
        uses: peaceiris/actions-gh-pages@v3
        with:
          personal_token: ${{ secrets.PERSONAL_TOKEN }}
          external_repository: jkjoy/blog.loliko.cn
          publish_dir: ./public
          publish_branch: gh-pages
```
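If you want to reproduce the logic of the check-csv step outside of Actions, for example while debugging locally, the same idea can be sketched in Python (purely an illustration; the workflow itself uses the shell version above):

```python
import subprocess

# Ask git whether data/movie.csv shows up as new ('??') or modified (' M'),
# mirroring the `git status --porcelain | grep` check in step 3.
status = subprocess.run(
    ["git", "status", "--porcelain", "--", "data/movie.csv"],
    capture_output=True, text=True, check=True,
).stdout
print("csv_changed=true" if status.strip() else "csv_changed=false")
```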
`csv2json.py` looks like this:
```python
import csv
import json

# Path of the input CSV file
csv_file = "data/movie.csv"
# Path of the output JSON file
json_file = "data/movies.json"

movies = []

with open(csv_file, newline='', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        # Point poster at the local path, assuming the image was downloaded
        # to /images/douban/douban_{id}.jpg
        row['poster'] = f"/images/douban/douban_{row['id']}.jpg"
        movies.append({
            "id": row['id'],
            "title": row['title'],
            "intro": row['intro'],
            "poster": row['poster'],
            "pubdate": row['pubdate'],
            "url": row['url'],
            "rating": float(row['rating']) if row['rating'] else 0,
            "genres": row['genres'],
            "star": int(row['star']) if row['star'] else 0,
            "comment": row['comment'],
            "tags": row['tags'],
            "star_time": row['star_time'],
            "card": row['card']
        })

# Write the JSON file
data = {"movies": movies}
with open(json_file, 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print(f"Generated {json_file}")
```
`download-douban-image.py` looks like this:
```python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Douban cover crawler, optimized version (with local file check)
- Automatically skips images that already exist
- Avoids unnecessary download requests
- Keeps all of the original functionality
"""
import os
import re
import time
import random
import json
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, quote
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter


class OptimizedDoubanCrawler:
    def __init__(self, cookies=None):
        self.session = self._create_session()
        self.cookies = cookies or self._generate_cookies()
        self.request_count = 0
        self.last_request_time = 0
        self.existing_files = set()

    def _generate_cookies(self):
        """Generate a basic set of cookies"""
        return {
            'bid': ''.join(random.choices('abcdefghijklmnopqrstuvwxyz1234567890', k=11)),
            'll': '"118287"',
            'frodotk': ''.join(random.choices('abcdefghijklmnopqrstuvwxyz1234567890', k=32))
        }

    def _create_session(self):
        """Create a session with retry support"""
        session = requests.Session()
        retry = Retry(
            total=3,
            backoff_factor=1,
            status_forcelist=[403, 404, 500, 502, 503, 504, 429],
            allowed_methods=['GET', 'POST'],
            respect_retry_after_header=True
        )
        adapter = HTTPAdapter(max_retries=retry)
        session.mount('http://', adapter)
        session.mount('https://', adapter)
        return session

    def _get_headers(self, referer=None):
        """Build request headers dynamically"""
        user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
        ]
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
            'Connection': 'keep-alive',
            'DNT': '1',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': random.choice(user_agents)
        }
        if referer:
            headers['Referer'] = referer
        else:
            headers['Referer'] = 'https://www.douban.com/'
        return headers

    def _request_delay(self):
        """Throttle requests with a randomized delay"""
        self.request_count += 1
        elapsed = time.time() - self.last_request_time
        # Take a longer pause every 10 requests
        if self.request_count % 10 == 0:
            delay = random.uniform(10, 20)
        else:
            delay = random.uniform(3, 8)
        if elapsed < delay:
            time.sleep(delay - elapsed)
        self.last_request_time = time.time()

    def _scan_existing_files(self, save_dir):
        """Scan for covers that have already been downloaded"""
        if not os.path.exists(save_dir):
            return set()
        pattern = re.compile(r'douban_(\d+)\.jpg$')
        existing_ids = set()
        for filename in os.listdir(save_dir):
            match = pattern.match(filename)
            if match:
                existing_ids.add(match.group(1))
        return existing_ids

    def _extract_cover_url(self, html):
        """Extract the cover URL, trying several strategies"""
        soup = BeautifulSoup(html, 'html.parser')
        # Strategy 1: the standard cover element
        cover_elem = soup.find('img', {'rel': 'v:image'})
        if cover_elem:
            return cover_elem['src'].replace('s_ratio_poster', 'l_ratio_poster')
        # Strategy 2: the JSON-LD script data
        script_data = soup.find('script', {'type': 'application/ld+json'})
        if script_data:
            try:
                data = json.loads(script_data.string)
                return data.get('image', '')
            except Exception:
                pass
        # Strategy 3: the og:image meta tag
        meta_image = soup.find('meta', {'property': 'og:image'})
        if meta_image and meta_image.get('content'):
            return meta_image['content']
        # Strategy 4: the background image of the poster div
        bg_div = soup.find('div', {'class': 'pic'})
        if bg_div and bg_div.get('style'):
            match = re.search(r'url\((.*?)\)', bg_div['style'])
            if match:
                return match.group(1).strip('"\'')
        raise ValueError("Could not extract a cover URL from the page")

    def get_movie_cover(self, douban_url, save_dir='covers'):
        """Download a movie cover (skipping files that already exist)"""
        try:
            # Extract the movie ID from the URL
            id_match = re.search(r'subject/(\d+)', douban_url)
            if not id_match:
                raise ValueError("Invalid Douban URL")
            movie_id = id_match.group(1)
            # Skip the download if the cover already exists
            filename = f'douban_{movie_id}.jpg'
            save_path = os.path.join(save_dir, filename)
            if os.path.exists(save_path):
                print(f"⏩ Already exists, skipping: {filename}")
                return save_path
            # Throttle before requesting
            self._request_delay()
            # Fetch the subject page
            response = self.session.get(
                douban_url,
                headers=self._get_headers(),
                cookies=self.cookies,
                timeout=20
            )
            if response.status_code == 403:
                raise Exception("403 Forbidden: check the cookies or slow down the request rate")
            response.raise_for_status()
            # Detect Douban's anti-crawler page (the marker text is Chinese on purpose)
            if "检测到异常请求" in response.text:
                raise Exception("Anti-crawler check triggered, please refresh the cookies")
            # Extract the cover URL
            cover_url = self._extract_cover_url(response.text)
            if not cover_url.startswith(('http://', 'https://')):
                cover_url = 'https:' + cover_url
            # Download the cover
            self._request_delay()
            img_response = self.session.get(
                cover_url,
                headers=self._get_headers(referer=douban_url),
                cookies=self.cookies,
                stream=True,
                timeout=30
            )
            img_response.raise_for_status()
            # Save the image
            os.makedirs(save_dir, exist_ok=True)
            with open(save_path, 'wb') as f:
                for chunk in img_response.iter_content(8192):
                    f.write(chunk)
            print(f"✅ Downloaded: {filename}")
            return save_path
        except Exception as e:
            print(f"❌ Failed: {douban_url} | error: {str(e)}")
            return None


def batch_process(csv_path, url_column='url', output_dir='content/images/douban'):
    """Process a CSV file in batch (optimized version)"""
    import pandas as pd
    # Use a full set of cookies
    USER_COOKIES = {
        'll': '"1182875"',
        'bid': 'jLsUuRC3yr03',
        'dbcl2': '"269547702:ZWJYaBTrUyM4"',
        'ck': 'p4si',
        'frodotk_db': '"4075091eefd002b8c15f1b5ccf7008281"',
        'frodotk': '"df131ed940ebccdc9d395c8bc2c366942"'
    }
    crawler = OptimizedDoubanCrawler(cookies=USER_COOKIES)
    df = pd.read_csv(csv_path)
    failed_urls = []
    # Scan for existing files first
    print("🔍 Scanning existing files...")
    existing_ids = crawler._scan_existing_files(output_dir)
    print(f"Found {len(existing_ids)} existing covers")
    for index, row in df.iterrows():
        if pd.notna(row[url_column]):
            # Extract the ID straight from the URL and check it against existing files
            match = re.search(r'subject/(\d+)', row[url_column])
            if match and match.group(1) in existing_ids:
                print(f"⏩ Skipping existing cover: {match.group(1)}")
                continue
            result = crawler.get_movie_cover(row[url_column], output_dir)
            if not result:
                failed_urls.append(row[url_column])
    if failed_urls:
        print("\nThe following URLs need manual handling:")
        for url in failed_urls:
            print(url)
        with open('failed_urls.log', 'w') as f:
            f.write("\n".join(failed_urls))
        print("\nFailures have been written to failed_urls.log")


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='Optimized Douban movie cover downloader')
    parser.add_argument('csv', help='Path to the CSV file')
    parser.add_argument('--column', default='url', help='Name of the URL column')
    parser.add_argument('--output', default='content/images/douban', help='Output directory')
    args = parser.parse_args()
    print("🚀 Processing Douban movie covers (optimized version)...")
    batch_process(args.csv, args.column, args.output)
    print("🎉 Done!")
```
And that's it: the workflow now runs automatically every day at 00:00 UTC, fetches the Douban list, and, whenever there is an update, downloads the new cover images and commits them to the repository, then builds the static pages with Hugo and deploys them to GitHub Pages.