Initialize project

Commit d3834eb37e by ifui, 2025-06-08 16:25:53 +08:00
28 changed files with 5119 additions and 0 deletions

10
.gitignore vendored Normal file

@@ -0,0 +1,10 @@
.venv
venv
__pycache__
*.pyc
.env
data/videos.db
output

1
.version Normal file

@@ -0,0 +1 @@
0.2

5
.vscode/settings.json vendored Normal file

@@ -0,0 +1,5 @@
{
    "python.analysis.extraPaths": [
        "./scrapy_proj"
    ]
}

43
Dockerfile Normal file

@@ -0,0 +1,43 @@
# Official Python base image
FROM python:3.9-bookworm

# Working directory
WORKDIR /app

# Time zone (optional)
ENV TZ=Asia/Shanghai
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

# Use the Aliyun Debian mirrors
RUN echo "deb https://mirrors.aliyun.com/debian/ bookworm main contrib non-free non-free-firmware" > /etc/apt/sources.list && \
    echo "deb https://mirrors.aliyun.com/debian/ bookworm-updates main contrib non-free non-free-firmware" >> /etc/apt/sources.list && \
    echo "deb https://mirrors.aliyun.com/debian-security bookworm-security main contrib non-free non-free-firmware" >> /etc/apt/sources.list

# System dependencies
RUN apt-get update && apt-get install -y \
    gcc \
    python3-dev \
    python3-pip \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements.txt and install Python dependencies (via the Aliyun PyPI mirror)
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt \
    -i https://mirrors.aliyun.com/pypi/simple/ \
    --trusted-host mirrors.aliyun.com

# Copy the project files
COPY . .

# Create the data directory
RUN mkdir -p /app/data

# Expose the API port
EXPOSE 8000

# Environment variables
ENV PYTHONPATH=/app
ENV DATABASE_URL=sqlite:////app/data/videos.db

# Start command
CMD ["python", "main.py"]

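As a usage sketch (not part of the commit): building and running the image by hand, reusing the crawler_zgjs:0.1 tag and the volume/env-file settings that docker-compose.yml below declares; the container name is an arbitrary example.

```bash
# Build the image (tag mirrors the one referenced in docker-compose.yml)
docker build -t crawler_zgjs:0.1 .

# Run it standalone, persisting the SQLite database outside the container
docker run -d --name crawler_zgjs \
  -p 8000:8000 \
  -v "$(pwd)/data:/app/data" \
  --env-file .env \
  crawler_zgjs:0.1
```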
5
README.md Normal file

@@ -0,0 +1,5 @@
Create a virtual environment: `python -m venv .venv`

Activate it on Windows (PowerShell): `.venv\Scripts\Activate.ps1`

Activate it on Linux/macOS: `source ./.venv/bin/activate`

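A possible quick-start after activating the environment, based on the files added in this commit (the mirror flag is optional):

```bash
pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/
python init_db.py   # creates data/videos.db and the tables
python main.py      # serves the FastAPI app on http://0.0.0.0:8000
```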
18
docker-compose.yml Normal file

@@ -0,0 +1,18 @@
version: "3.8"
services:
app:
image: crawler_zgjs:0.1
ports:
- "${PORT:-8000}:8000"
volumes:
- ./data:/app/data
env_file:
- .env
restart: unless-stopped
networks:
- app-network
networks:
app-network:
driver: bridge

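Usage sketch, assuming the image tag above has already been built (for example with scripts/build-docker.sh):

```bash
# PORT can be overridden via .env; it defaults to 8000
docker compose up -d
docker compose logs -f app
```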
18
init_db.py Normal file

@@ -0,0 +1,18 @@
import os
from sqlalchemy import create_engine
from scrapy_proj.models import Base

# Make sure the data directory exists
os.makedirs("data", exist_ok=True)

# Create the database engine
DATABASE_URL = "sqlite:///data/videos.db"
engine = create_engine(DATABASE_URL)

def init_db():
    """Initialize the database and create all tables."""
    Base.metadata.create_all(bind=engine)
    print("数据库表创建成功!")

if __name__ == "__main__":
    init_db()

744
main.py Normal file

@@ -0,0 +1,744 @@
import os
import sys
import uuid
import logging
import time
from typing import Dict, List, Optional, Literal
from datetime import datetime
from fastapi import FastAPI, HTTPException, BackgroundTasks
from fastapi.staticfiles import StaticFiles
from fastapi.responses import FileResponse
from pydantic import BaseModel, HttpUrl
import uvicorn
from multiprocessing import Process
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.cron import CronTrigger
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from dotenv import load_dotenv
# 加载环境变量
load_dotenv()
# 添加scrapy项目到Python路径
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "scrapy_proj"))
# 导入爬虫
from scrapy_proj.spiders.zgjs import ZGJSSpider
from scrapy_proj.models import ScheduledTask
# 配置日志
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(message)s",
handlers=[
logging.StreamHandler()
]
)
logger = logging.getLogger(__name__)
# 定时任务相关的Pydantic模型
class ScheduledTaskCreate(BaseModel):
"""创建定时任务请求模型"""
name: str
cron_expression: str
spider_name: str
url: str
video_list: int
enabled: bool = True
class ScheduledTaskUpdate(BaseModel):
"""更新定时任务请求模型"""
name: Optional[str] = None
cron_expression: Optional[str] = None
spider_name: Optional[str] = None
url: Optional[str] = None
video_list: Optional[int] = None
enabled: Optional[bool] = None
class TaskStatus(BaseModel):
"""任务状态响应模型"""
status: Literal['pending', 'running', 'completed', 'failed']
message: Optional[str] = None
start_time: Optional[str] = None
end_time: Optional[str] = None
class ScheduledTaskResponse(BaseModel):
"""定时任务响应模型"""
id: int
name: str
cron_expression: str
spider_name: str
url: str
video_list: int
enabled: bool
create_time: str
update_time: str
# 存储任务状态
task_status_store: Dict[str, Dict] = {}
# 创建数据库引擎和会话
DATABASE_URL = os.getenv("DATABASE_URL", "sqlite:///data/videos.db")
engine = create_engine(DATABASE_URL)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
# 创建调度器
scheduler = BackgroundScheduler()
scheduler.start()
# 创建FastAPI应用
app = FastAPI(
title="Scrapy API",
description="影视资源爬虫API",
version="0.1.0",
)
# 存储爬虫任务状态
spider_tasks = {}
# Get a database session (callers are responsible for closing it)
def get_db():
return SessionLocal()
class SpiderRequest(BaseModel):
"""爬虫请求模型"""
url: Optional[HttpUrl] = None
spider_name: str = "example"
settings: Optional[Dict] = None
video_list: int
class SpiderResponse(BaseModel):
"""爬虫响应模型"""
task_id: str
status: str
spider_name: str
task_name: Optional[str] = None # 添加任务名称字段
message: str
start_time: Optional[str] = None
end_time: Optional[str] = None
class PaginatedSpiderResponse(BaseModel):
"""分页爬虫响应模型"""
items: List[SpiderResponse]
total: int
page: int
page_size: int
total_pages: int
def _run_spider_process(spider_name: str, url: Optional[str], video_list: int, settings: Dict):
"""实际运行爬虫的进程函数"""
try:
# 获取项目设置
crawler_settings = get_project_settings()
# 如果提供了自定义设置,则更新
if settings:
for key, value in settings.items():
crawler_settings.set(key, value)
# 创建爬虫进程
process = CrawlerProcess(settings=crawler_settings)
# 选择爬虫
if spider_name == "zgjs":
logger.info(f"启动爬虫 {spider_name}URL: {url}")
process.crawl(ZGJSSpider, url=url, video_list=video_list)
else:
raise ValueError(f"未知的爬虫: {spider_name}")
# 启动爬虫
process.start()
# 确保爬虫进程正确关闭
try:
if hasattr(process, '_active') and not process._active:
logger.info("爬虫进程已完成")
elif hasattr(process, 'bootstrap_stopped') and process.bootstrap_stopped: # type: ignore
logger.info("爬虫进程正常停止")
else:
logger.warning("爬虫进程未正常停止,强制关闭")
process.stop()
# 确保所有reactor线程都停止
from twisted.internet import reactor
if reactor.running: # type: ignore
logger.info("停止reactor")
reactor.stop() # type: ignore
except Exception as e:
logger.error(f"关闭爬虫进程时出错: {str(e)}")
sys.exit(1) # 非正常退出
# 确保进程退出
logger.info("爬虫进程正常退出")
sys.exit(0)
except Exception as e:
logger.error(f"爬虫进程运行错误: {str(e)}", exc_info=True)
sys.exit(1) # 非正常退出
def run_spider(task_id: str, spider_name: str, task_name: Optional[str] = None, url: Optional[str] = None, video_list: int = 0, settings: Optional[Dict] = None):
"""在后台运行爬虫
Args:
task_id: 任务ID
spider_name: 爬虫名称
task_name: 任务名称
url: 开始URL
video_list: 视频列表数量
settings: 爬虫设置
"""
try:
# 确保settings是字典
settings = settings or {}
# 初始化爬虫任务状态
if task_id not in spider_tasks:
spider_tasks[task_id] = {
"status": "pending",
"spider_name": spider_name,
"task_name": task_name,
"message": "爬虫任务初始化中",
"started_at": time.time(),
"finished_at": None
}
# 创建并启动新进程
p = Process(
target=_run_spider_process,
args=(spider_name, url, video_list, settings),
daemon=True # 设置为守护进程,确保主进程退出时子进程也会退出
)
p.start()
# 更新任务状态为运行中
spider_tasks[task_id].update({
"status": "running",
"message": "爬虫任务正在运行",
"process": p,
"started_at": time.time()
})
# 启动一个线程来监控进程状态
def monitor_process(process, task_id):
process.join() # 等待进程结束
if task_id in spider_tasks:
finish_time = time.time()
spider_tasks[task_id]["finished_at"] = finish_time
if process.exitcode == 0:
spider_tasks[task_id].update({
"status": "completed",
"message": "爬虫任务成功完成"
})
else:
spider_tasks[task_id].update({
"status": "failed",
"message": f"爬虫任务失败,退出码: {process.exitcode}"
})
# 同步更新定时任务状态(如果存在)
for scheduled_task_id, status_info in task_status_store.items():
if status_info.get("spider_task_id") == task_id:
status_info.update({
"status": spider_tasks[task_id]["status"],
"message": spider_tasks[task_id]["message"],
"end_time": datetime.fromtimestamp(finish_time).strftime("%Y-%m-%d %H:%M:%S")
})
from threading import Thread
monitor_thread = Thread(target=monitor_process, args=(p, task_id))
monitor_thread.daemon = True
monitor_thread.start()
except Exception as e:
error_time = time.time()
# 更新任务状态为失败
spider_tasks[task_id].update({
"status": "failed",
"message": f"启动爬虫进程错误: {str(e)}",
"finished_at": error_time
})
# 同步更新定时任务状态(如果存在)
for scheduled_task_id, status_info in task_status_store.items():
if status_info.get("spider_task_id") == task_id:
status_info.update({
"status": "failed",
"message": f"启动爬虫进程错误: {str(e)}",
"end_time": datetime.fromtimestamp(error_time).strftime("%Y-%m-%d %H:%M:%S")
})
logger.error(f"启动爬虫进程错误: {str(e)}")
def cleanup_finished_processes():
"""清理已完成的进程并更新状态"""
for task_id, task_info in list(spider_tasks.items()):
if "process" in task_info:
process = task_info["process"]
if not process.is_alive():
# 获取进程退出码
exitcode = process.exitcode
# 清理进程资源
process.join()
process.close()
# 当前时间
current_time = time.time()
# 根据退出码更新状态
if exitcode == 0:
task_info["status"] = "completed"
task_info["message"] = "爬虫任务成功完成"
else:
task_info["status"] = "failed"
task_info["message"] = f"爬虫任务失败,退出码: {exitcode}"
# 记录完成时间(如果还没有设置的话)
if "finished_at" not in task_info:
task_info["finished_at"] = current_time
# 确保有开始时间
if "started_at" not in task_info:
task_info["started_at"] = task_info.get("finished_at", current_time)
del spider_tasks[task_id]["process"]
@app.post("/api/spiders/run", response_model=SpiderResponse)
async def start_spider(spider_request: SpiderRequest, background_tasks: BackgroundTasks):
"""启动爬虫任务
Args:
spider_request: 爬虫请求参数
background_tasks: 后台任务
Returns:
SpiderResponse: 爬虫响应
"""
# 生成任务ID
task_id = str(uuid.uuid4())
if (spider_request.url is None):
raise HTTPException(status_code=400, detail="缺少url参数")
if (spider_request.video_list is None):
raise HTTPException(status_code=400, detail="缺少video_list参数")
# 当前时间戳
current_time = time.time()
# 记录任务信息
spider_tasks[task_id] = {
"status": "pending",
"spider_name": spider_request.spider_name,
"message": "爬虫任务已创建,等待执行",
"started_at": current_time, # 添加开始时间
"finished_at": None # 初始化结束时间为None
}
# 在后台运行爬虫
background_tasks.add_task(
run_spider,
task_id=task_id,
spider_name=spider_request.spider_name,
url=str(spider_request.url) if spider_request.url else None,
video_list=spider_request.video_list,
settings=spider_request.settings
)
# 格式化时间为ISO格式
start_time = datetime.fromtimestamp(current_time).isoformat()
return SpiderResponse(
task_id=task_id,
status="pending",
spider_name=spider_request.spider_name,
message="爬虫任务已创建,等待执行",
start_time=start_time,
end_time=None
)
@app.get("/api/spiders/status/{task_id}", response_model=SpiderResponse)
async def get_spider_status(task_id: str):
"""获取爬虫任务状态
Args:
task_id: 任务ID
Returns:
SpiderResponse: 爬虫响应
"""
# 先清理已完成进程
cleanup_finished_processes()
if task_id not in spider_tasks:
raise HTTPException(status_code=404, detail="任务不存在")
task_info = spider_tasks[task_id]
return SpiderResponse(
task_id=task_id,
status=task_info["status"],
spider_name=task_info["spider_name"],
task_name=task_info.get("task_name"),
message=task_info["message"],
start_time=datetime.fromtimestamp(task_info["started_at"]).strftime("%Y-%m-%d %H:%M:%S") if task_info.get("started_at") else None,
end_time=datetime.fromtimestamp(task_info["finished_at"]).strftime("%Y-%m-%d %H:%M:%S") if task_info.get("finished_at") else None
)
@app.post("/api/spiders/cleanup")
async def cleanup_spiders():
"""清理已完成的任务进程
Returns:
Dict: 清理结果
"""
cleanup_finished_processes()
return {"message": "已完成进程清理"}
@app.get("/api/spiders/list", response_model=PaginatedSpiderResponse)
async def list_spiders(page: int = 1, page_size: int = 10):
"""列出爬虫任务(分页)
Args:
page: 页码从1开始
page_size: 每页数量
Returns:
PaginatedSpiderResponse: 分页的爬虫任务列表
"""
# 先清理已完成进程,确保状态最新
cleanup_finished_processes()
# 计算分页参数
all_tasks = list(spider_tasks.items())
all_tasks.reverse() # 倒序
total = len(all_tasks)
total_pages = (total + page_size - 1) // page_size
page = max(1, min(page, total_pages))
# 获取当前页的数据
start = (page - 1) * page_size
end = start + page_size
paginated_tasks = all_tasks[start:end]
return PaginatedSpiderResponse(
items=[
SpiderResponse(
task_id=str(task_id),
status=task_info["status"],
spider_name=task_info["spider_name"],
task_name=task_info.get("task_name"),
message=task_info["message"],
start_time=datetime.fromtimestamp(task_info["started_at"]).strftime("%Y-%m-%d %H:%M:%S") if task_info.get("started_at") else None,
end_time=datetime.fromtimestamp(task_info["finished_at"]).strftime("%Y-%m-%d %H:%M:%S") if task_info.get("finished_at") else None
)
for task_id, task_info in paginated_tasks
],
total=total,
page=page,
page_size=page_size,
total_pages=total_pages
)
def add_job_to_scheduler(task: ScheduledTask):
"""添加任务到调度器"""
if task.enabled: # type: ignore
# 创建一个包装函数来处理定时任务的状态更新
def scheduled_spider_run():
# 生成唯一的爬虫任务ID
spider_task_id = str(uuid.uuid4())
task_id_str = str(task.id)
current_time_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# 记录定时任务状态
task_status_store[task_id_str] = {
"status": "running",
"message": "定时任务自动执行中",
"start_time": current_time_str,
"end_time": None,
"spider_task_id": spider_task_id
}
# 运行爬虫
run_spider(
task_id=spider_task_id,
spider_name=str(task.spider_name),
task_name=str(task.name),
url=str(task.url),
video_list=task.video_list # type: ignore
)
# 记录日志
logger.info(f"定时任务 {task.name} (ID: {task.id}) 已自动执行爬虫任务ID: {spider_task_id}")
# 添加到调度器
scheduler.add_job(
scheduled_spider_run,
CronTrigger.from_crontab(task.cron_expression),
id=str(task.id),
replace_existing=True
)
@app.post("/api/scheduled-tasks", response_model=ScheduledTaskResponse)
async def create_scheduled_task(task: ScheduledTaskCreate):
"""创建定时任务"""
db = get_db()
current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
db_task = ScheduledTask(
name=task.name,
cron_expression=task.cron_expression,
spider_name=task.spider_name,
url=task.url,
video_list=task.video_list,
enabled=task.enabled,
create_time=current_time,
update_time=current_time
)
try:
db.add(db_task)
db.commit()
db.refresh(db_task)
# 如果任务启用,添加到调度器
if task.enabled:
add_job_to_scheduler(db_task)
return db_task
except Exception as e:
db.rollback()
raise HTTPException(status_code=500, detail=str(e))
finally:
db.close()
@app.get("/api/scheduled-tasks", response_model=List[ScheduledTaskResponse])
async def list_scheduled_tasks():
"""获取所有定时任务"""
db = get_db()
try:
tasks = db.query(ScheduledTask).all()
return tasks
finally:
db.close()
@app.get("/api/scheduled-tasks/{task_id}", response_model=ScheduledTaskResponse)
async def get_scheduled_task(task_id: int):
"""获取指定定时任务"""
db = get_db()
try:
task = db.query(ScheduledTask).filter(ScheduledTask.id == task_id).first()
if task is None:
raise HTTPException(status_code=404, detail="Task not found")
return task
finally:
db.close()
@app.put("/api/scheduled-tasks/{task_id}", response_model=ScheduledTaskResponse)
async def update_scheduled_task(task_id: int, task_update: ScheduledTaskUpdate):
"""更新定时任务"""
db = get_db()
try:
db_task = db.query(ScheduledTask).filter(ScheduledTask.id == task_id).first()
if db_task is None:
raise HTTPException(status_code=404, detail="Task not found")
update_data = task_update.dict(exclude_unset=True)
for key, value in update_data.items():
setattr(db_task, key, value)
db_task.update_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") # type: ignore
db.commit()
db.refresh(db_task)
# 更新调度器中的任务
job_id = str(db_task.id)
if scheduler.get_job(job_id):
scheduler.remove_job(job_id)
if db_task.enabled: # type: ignore
add_job_to_scheduler(db_task)
return db_task
finally:
db.close()
@app.delete("/api/scheduled-tasks/{task_id}")
async def delete_scheduled_task(task_id: int):
"""删除定时任务"""
db = get_db()
try:
db_task = db.query(ScheduledTask).filter(ScheduledTask.id == task_id).first()
if db_task is None:
raise HTTPException(status_code=404, detail="Task not found")
# 从调度器中移除任务
job_id = str(db_task.id)
if scheduler.get_job(job_id):
scheduler.remove_job(job_id)
db.delete(db_task)
db.commit()
return {"message": "Task deleted successfully"}
finally:
db.close()
@app.post("/api/scheduled-tasks/{task_id}/toggle")
async def toggle_scheduled_task(task_id: int):
"""启用/禁用定时任务"""
db = get_db()
try:
db_task = db.query(ScheduledTask).filter(ScheduledTask.id == task_id).first()
if db_task is None:
raise HTTPException(status_code=404, detail="Task not found")
db_task.enabled = not db_task.enabled # type: ignore
db_task.update_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") # type: ignore
# 更新调度器中的任务
job_id = str(db_task.id)
if scheduler.get_job(job_id):
scheduler.remove_job(job_id)
if db_task.enabled: # type: ignore
add_job_to_scheduler(db_task)
db.commit()
db.refresh(db_task)
return {"message": f"Task {'enabled' if db_task.enabled else 'disabled'} successfully"} # type: ignore
finally:
db.close()
@app.post("/api/scheduled-tasks/{task_id}/run")
async def run_scheduled_task(task_id: int, background_tasks: BackgroundTasks):
"""手动执行定时任务"""
db = get_db()
try:
db_task = db.query(ScheduledTask).filter(ScheduledTask.id == task_id).first()
if db_task is None:
raise HTTPException(status_code=404, detail="Task not found")
# 生成唯一的爬虫任务ID
spider_task_id = str(uuid.uuid4())
# 当前时间戳
current_time = time.time()
current_time_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# 记录爬虫任务信息
spider_tasks[spider_task_id] = {
"status": "pending",
"spider_name": db_task.spider_name,
"task_name": db_task.name,
"message": "爬虫任务已创建,等待执行",
"started_at": current_time,
"finished_at": None
}
# 记录定时任务状态
task_status_store[str(task_id)] = {
"status": "running",
"message": "定时任务正在执行",
"start_time": current_time_str,
"end_time": None,
"spider_task_id": spider_task_id, # 关联爬虫任务ID
"task_name": db_task.name
}
# 在后台运行爬虫
background_tasks.add_task(
run_spider,
task_id=spider_task_id,
spider_name=str(db_task.spider_name),
task_name=str(db_task.name),
url=str(db_task.url),
video_list=db_task.video_list # type: ignore
)
return {
"message": "任务已开始执行",
"task_id": task_id,
"spider_task_id": spider_task_id,
"task_name": db_task.name # 添加任务名称到响应
}
finally:
db.close()
@app.get("/api/task-status/{task_id}", response_model=TaskStatus)
async def get_task_status(task_id: int):
"""获取定时任务的执行状态"""
task_id_str = str(task_id)
# 先清理已完成的爬虫进程,确保状态最新
cleanup_finished_processes()
if task_id_str not in task_status_store:
return TaskStatus(status="pending", message="任务未执行")
status_info = task_status_store[task_id_str]
# 如果任务正在运行,检查爬虫任务的状态
if "spider_task_id" in status_info:
spider_task_id = status_info["spider_task_id"]
if spider_task_id in spider_tasks:
spider_info = spider_tasks[spider_task_id]
spider_status = spider_info["status"]
spider_message = spider_info["message"]
# 同步状态
if spider_status != status_info["status"]:
status_info["status"] = spider_status
status_info["message"] = spider_message
# 如果爬虫任务完成或失败,更新结束时间
if spider_status in ["completed", "failed"]:
status_info["end_time"] = datetime.fromtimestamp(
spider_info.get("finished_at", time.time())
).strftime("%Y-%m-%d %H:%M:%S")
else:
# 如果爬虫任务不存在且状态是running可能是异常终止
if status_info["status"] == "running":
status_info["status"] = "failed"
status_info["message"] = "爬虫任务异常终止"
status_info["end_time"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
return TaskStatus(
status=status_info["status"],
message=status_info["message"],
start_time=status_info.get("start_time"),
end_time=status_info.get("end_time")
)
# 挂载静态文件目录
app.mount("/static", StaticFiles(directory="static"), name="static")
@app.get("/")
async def read_index():
return FileResponse("static/index.html")
if __name__ == "__main__":
host = os.getenv("HOST", "0.0.0.0")
port = int(os.getenv("PORT", "8000"))
log_level = os.getenv("LOG_LEVEL", "info")
# 启动服务器
uvicorn.run(
app if not os.getenv("RELOAD") else "main:app",
host=host,
port=port,
log_level=log_level,
reload=bool(os.getenv("RELOAD"))
)

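For illustration, a small client sketch for the API above using the requests library (already listed in requirements.txt); the base URL, spider URL and video_list value are placeholders:

```python
import time
import requests

BASE = "http://localhost:8000"

# Start a crawl; spider_name "zgjs" and the url/video_list values are examples
resp = requests.post(f"{BASE}/api/spiders/run", json={
    "spider_name": "zgjs",
    "url": "http://tv.81.cn/zgjs/jsjs/index.html",
    "video_list": 1,
})
task_id = resp.json()["task_id"]

# Poll the task until it completes or fails
while True:
    status = requests.get(f"{BASE}/api/spiders/status/{task_id}").json()
    print(status["status"], status["message"])
    if status["status"] in ("completed", "failed"):
        break
    time.sleep(5)
```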
15
requirements.txt Normal file

@@ -0,0 +1,15 @@
scrapy>=2.11.0
aliyun-python-sdk-core>=2.13.3
aliyun-python-sdk-vod>=2.16.16
requests>=2.31.0
fastapi>=0.104.1
uvicorn>=0.24.0
pydantic>=2.5.1
python-multipart>=0.0.6
SQLAlchemy>=2.0.23
alembic>=1.12.1
pymysql>=1.1.0
oss2>=2.19.1
apscheduler>=3.11.0
jinja2>=3.1.6
dotenv>=0.9.9

11
scrapy.cfg Normal file

@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = scrapy_proj.settings
[deploy]
#url = http://localhost:6800/
project = scrapy_proj

0
scrapy_proj/__init__.py Normal file

99
scrapy_proj/database.py Normal file

@@ -0,0 +1,99 @@
import os
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.pool import QueuePool
from contextlib import contextmanager
from .models import Base
class DatabaseManager:
"""数据库管理器"""
def __init__(self, settings):
"""初始化数据库管理器
Args:
settings: Scrapy设置对象
"""
self.sqlite_file = settings.get('SQLITE_FILE', 'videos.db')
self.mysql_config = {
'host': settings.get('MYSQL_HOST', 'localhost'),
'port': settings.get('MYSQL_PORT', 3306),
'user': settings.get('MYSQL_USER', 'root'),
'password': settings.get('MYSQL_PASSWORD', ''),
'database': settings.get('MYSQL_DATABASE', 'crawler'),
}
# 初始化数据库引擎
self._init_sqlite()
self._init_mysql()
# 创建会话工厂
self.sqlite_session_maker = sessionmaker(bind=self.sqlite_engine)
self.mysql_session_maker = sessionmaker(bind=self.mysql_engine)
def _init_sqlite(self):
"""初始化SQLite数据库"""
# 确保数据库目录存在
db_dir = os.path.dirname(self.sqlite_file)
if db_dir and not os.path.exists(db_dir):
os.makedirs(db_dir)
# 创建SQLite引擎
self.sqlite_engine = create_engine(
f'sqlite:///{self.sqlite_file}',
poolclass=QueuePool,
pool_size=5,
max_overflow=10,
pool_timeout=30
)
# 自动创建所有表
Base.metadata.create_all(self.sqlite_engine)
def _init_mysql(self):
"""初始化MySQL/MariaDB数据库"""
# 创建MySQL引擎
self.mysql_engine = create_engine(
'mysql+pymysql://{user}:{password}@{host}:{port}/{database}?charset=utf8mb4'.format(
**self.mysql_config
),
poolclass=QueuePool,
pool_size=5,
max_overflow=10,
pool_timeout=30,
pool_pre_ping=True # 自动检测断开的连接
)
@contextmanager
def sqlite_session(self):
"""SQLite会话上下文管理器
Yields:
Session: SQLite数据库会话
"""
session = self.sqlite_session_maker()
try:
yield session
session.commit()
except Exception:
session.rollback()
raise
finally:
session.close()
@contextmanager
def mysql_session(self):
"""MySQL会话上下文管理器
Yields:
Session: MySQL数据库会话
"""
session = self.mysql_session_maker()
try:
yield session
session.commit()
except Exception:
session.rollback()
raise
finally:
session.close()

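A minimal standalone sketch of using DatabaseManager outside Scrapy. A plain dict stands in for the Scrapy settings object, which works here because only settings.get(key, default) is called; the SQLITE_FILE path is an example, and no MySQL connection is made unless mysql_session() is actually used.

```python
from scrapy_proj.database import DatabaseManager
from scrapy_proj.models import VideoSQLite

# Only SQLITE_FILE matters for this sketch; the MySQL engine is configured
# from defaults but never contacted here.
db = DatabaseManager({"SQLITE_FILE": "data/videos.db"})

with db.sqlite_session() as session:
    count = session.query(VideoSQLite).count()
    print(f"{count} videos stored locally")
```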
31
scrapy_proj/items.py Normal file

@@ -0,0 +1,31 @@
from scrapy import Item, Field
from datetime import datetime

class VideoItem(Item):
    """Video information item"""
    # Basic information
    title = Field()          # Title
    description = Field()    # Description
    source_url = Field()     # Source URL
    publish_time = Field()   # Publish time
    create_time = Field(serializer=lambda x: datetime.now().strftime('%Y-%m-%d %H:%M:%S'))  # Creation time
    update_time = Field(serializer=lambda x: datetime.now().strftime('%Y-%m-%d %H:%M:%S'))  # Update time
    # Media information
    video_url = Field()             # Video URL
    source_thumbnail_url = Field()  # Original thumbnail URL
    thumbnail_url = Field()         # Thumbnail URL (after upload to OSS)
    duration = Field()              # Video duration
    # Extra information used when importing/migrating data
    video_list = Field()  # Video category ID
    # Aliyun VOD information
    aliyun_video_id = Field()  # Aliyun video ID
    aliyun_status = Field()    # Aliyun processing status
    # Other information
    status = Field()     # Status: 0 pending, 1 processing, 2 done, -1 failed
    sqlite_id = Field()  # ID of the corresponding SQLite row

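For illustration, how a spider might fill this item (all values are placeholders):

```python
from scrapy_proj.items import VideoItem

item = VideoItem()
item['title'] = 'Sample video'
item['source_url'] = 'http://tv.81.cn/example/detail.html'
item['video_url'] = 'http://example.com/video.mp4'
item['video_list'] = 1   # category ID used later when migrating to MariaDB
item['status'] = 0       # 0 = pending
```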
100
scrapy_proj/middlewares.py Normal file

@@ -0,0 +1,100 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class ScrapyProjSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, or item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Request or item objects.
pass
async def process_start(self, start):
# Called with an async iterator over the spider start() method or the
# matching method of an earlier spider middleware.
async for item_or_request in start:
yield item_or_request
def spider_opened(self, spider):
spider.logger.info("Spider opened: %s" % spider.name)
class ScrapyProjDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either;
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info("Spider opened: %s" % spider.name)

38
scrapy_proj/models.py Normal file

@@ -0,0 +1,38 @@
from sqlalchemy import Column, Integer, String, Text, DateTime, SmallInteger, Boolean
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class ScheduledTask(Base):
    """Scheduled task model"""
    __tablename__ = 'scheduled_tasks'
    id = Column(Integer, primary_key=True, autoincrement=True)
    name = Column(String(100), nullable=False)
    cron_expression = Column(String(100), nullable=False)  # cron expression
    spider_name = Column(String(50), nullable=False)       # spider name
    url = Column(Text, nullable=False)                     # URL to crawl
    video_list = Column(Integer, nullable=False)           # video category ID
    enabled = Column(Boolean, default=True)                # whether the task is enabled
    create_time = Column(Text)  # stored as TEXT in SQLite
    update_time = Column(Text)  # stored as TEXT in SQLite

class VideoSQLite(Base):
    """SQLite video model"""
    __tablename__ = 'videos'
    id = Column(Integer, primary_key=True, autoincrement=True)
    title = Column(Text)
    description = Column(Text)
    source_url = Column(Text, unique=True)
    publish_time = Column(Text)  # stored as TEXT in SQLite
    create_time = Column(Text)
    update_time = Column(Text)
    video_url = Column(Text)
    source_thumbnail_url = Column(Text)
    thumbnail_url = Column(Text)
    duration = Column(Text)       # duration is TEXT in SQLite
    aliyun_video_id = Column(Text)
    aliyun_status = Column(Text)
    status = Column(Integer)      # 0 default, 1 migrated
    video_list = Column(Integer)  # video category ID

479
scrapy_proj/pipelines.py Normal file

@@ -0,0 +1,479 @@
from datetime import datetime
import logging
import json
import requests
import time
import base64
import oss2
from aliyunsdkcore.client import AcsClient
from aliyunsdkvod.request.v20170321.CreateUploadVideoRequest import CreateUploadVideoRequest
from aliyunsdkvod.request.v20170321.GetVideoInfoRequest import GetVideoInfoRequest
from aliyunsdkvod.request.v20170321.UpdateVideoInfoRequest import UpdateVideoInfoRequest
from aliyunsdkvod.request.v20170321.CreateUploadImageRequest import CreateUploadImageRequest
from sqlalchemy import text
from .database import DatabaseManager
from .models import VideoSQLite
logger = logging.getLogger(__name__)
class AliyunVodPipeline:
"""阿里云视频点播处理中间件"""
def __init__(self, settings):
"""初始化阿里云视频点播中间件
初始化SQLite中间件
Args:
settings: Scrapy设置对象
"""
self.settings = settings
self.access_key_id = settings.get('ALIYUN_ACCESS_KEY_ID')
self.access_key_secret = settings.get('ALIYUN_ACCESS_KEY_SECRET')
self.template_group_id = settings.get('ALIYUN_TEMPLATE_GROUP_ID')
self.client = AcsClient(self.access_key_id, self.access_key_secret, 'cn-shanghai')
self.oss_client = oss2.Auth(self.access_key_id, self.access_key_secret)
self.db_manager = DatabaseManager(settings)
@classmethod
def from_crawler(cls, crawler):
return cls(crawler.settings)
def upload_media_by_url(self, video_url, title, cover_url=None):
"""通过URL上传视频到阿里云VOD
Args:
video_url: 视频URL
title: 视频标题
cover_url: 封面URL
Returns:
str: 上传任务ID (JobId)
"""
from aliyunsdkvod.request.v20170321.UploadMediaByURLRequest import UploadMediaByURLRequest
request = UploadMediaByURLRequest()
request.set_accept_format('JSON')
# 设置视频URL
logger.info(f"上传视频URL: {video_url}")
request.set_UploadURLs(video_url)
# 设置视频信息需要是JSON数组字符串
upload_metadata = [{
'Title': title,
'SourceURL': video_url,
'TemplateGroupId': self.template_group_id
}]
# 设置封面URL
# if cover_url:
# upload_metadata[0]['CoverURL'] = cover_url
request.set_UploadMetadatas(json.dumps(upload_metadata))
response = self.client.do_action_with_exception(request)
result = json.loads(response)
# 返回第一个上传任务的JobId
upload_jobs = result.get('UploadJobs', [])
if not upload_jobs:
raise Exception("No upload job created")
job = upload_jobs[0]
# if job.get('Code') != 'Success':
# raise Exception(f"Upload job failed: {job}")
return job.get('JobId') # 返回JobId而不是VideoId
def get_upload_job_status(self, job_id):
"""获取上传任务状态
Args:
job_id: 上传任务ID
Returns:
dict: 任务状态信息包含VideoId如果上传完成
"""
from aliyunsdkvod.request.v20170321.GetURLUploadInfosRequest import GetURLUploadInfosRequest
request = GetURLUploadInfosRequest()
request.set_accept_format('JSON')
request.set_JobIds(job_id)
response = self.client.do_action_with_exception(request)
result = json.loads(response)
upload_jobs = result.get('URLUploadInfoList', [])
if not upload_jobs:
raise Exception(f"No upload job found with ID: {job_id}")
job = upload_jobs[0]
return job
def wait_for_video_id(self, job_id, max_retries=5, retry_interval=2):
"""等待上传任务完成并获取VideoId
Args:
job_id: 上传任务ID
max_retries: 最大重试次数
retry_interval: 重试间隔
Returns:
str: 视频ID
"""
import time
for i in range(max_retries):
job_status = self.get_upload_job_status(job_id)
if job_status.get('MediaId'):
return job_status.get('MediaId')
# 等待一段时间后重试
time.sleep(retry_interval)
raise Exception(f"Max retries reached, upload job not completed: {job_id}")
def upload_image_to_oss(self, image_url, title):
"""直接上传图片到阿里云OSS
Args:
image_url: 图片URL
title: 图片标题
Returns:
str: OSS中的图片URL
"""
logger.info(f"开始上传图片到OSS: {image_url}")
try:
# 1. 下载远程图片
image_response = requests.get(image_url, timeout=30)
image_response.raise_for_status()
image_content = image_response.content
except Exception as e:
logger.error(f"下载图片失败: {str(e)}")
raise Exception(f"下载图片失败: {str(e)}")
try:
# 2. 生成OSS中的文件名使用时间戳和原始文件名的组合
timestamp = int(time.time())
file_ext = image_url.split('.')[-1] if '.' in image_url else 'jpg'
oss_filename = f"images/{timestamp}_{title[:30]}.{file_ext}" # 限制标题长度,避免文件名过长
# 3. 获取OSS bucket从settings中获取配置
bucket_name = self.settings.get('ALIYUN_OSS_BUCKET')
endpoint = self.settings.get('ALIYUN_OSS_ENDPOINT')
oss_bucket = oss2.Bucket(self.oss_client, endpoint, bucket_name)
# 4. 上传图片到OSS
upload_response = oss_bucket.put_object(oss_filename, image_content)
if upload_response.status == 200:
# 5. 返回可访问的URL
oss_url = f"https://{bucket_name}.{endpoint}/{oss_filename}"
logger.info(f"图片上传成功: {oss_url}")
return oss_url
else:
raise Exception(f"图片上传失败: {upload_response.status}")
except Exception as e:
logger.error(f"上传图片到OSS失败: {str(e)}")
raise Exception(f"上传图片到OSS失败: {str(e)}")
def process_item(self, item, spider):
"""处理数据项通过URL上传视频到阿里云VOD
Args:
item: 爬取的数据项
spider: 爬虫实例
Returns:
item: 处理后的数据项
"""
# 如果已经有阿里云视频ID跳过处理
with self.db_manager.sqlite_session() as session:
# 检查是否存在相同source_url的记录
existing_video = session.query(VideoSQLite).filter_by(source_url=item.get('source_url')).first()
if existing_video and existing_video.aliyun_video_id:
logger.info(f"阿里云视频ID已存在跳过该任务: {item.get('title')}")
return item
video_url = item.get('video_url')
if not video_url:
logger.warning(f"视频URL为空跳过处理: {item.get('source_url')}")
return item
try:
# 1. Upload the cover image to OSS (if one was scraped)
oss_url = None
cover_url = item.get('source_thumbnail_url')
if cover_url:
try:
oss_url = self.upload_image_to_oss(
image_url=cover_url,
title=item.get('title', '')
)
# 更新item中的封面URL为OSS URL
item['thumbnail_url'] = oss_url
logger.info(f"封面图片上传到OSS成功: {oss_url}")
except Exception as e:
logger.error(f"封面图片上传到OSS失败: {str(e)}")
# 如果封面上传失败,继续处理视频,不中断流程
# 2. 通过URL上传视频获取JobId
title = item.get('title', '')
job_id = self.upload_media_by_url(
video_url=video_url,
title=title,
cover_url=oss_url  # OSS cover URL if the upload succeeded, otherwise None
)
logger.info(f"成功创建阿里云视频URL上传任务: job_id={job_id}, title={title}")
# 2. 等待上传完成并获取VideoId
try:
video_id = self.wait_for_video_id(job_id)
logger.info(f"视频上传完成: video_id={video_id}, job_id={job_id}")
# 3. 更新item中的阿里云视频ID和状态
item['aliyun_video_id'] = video_id
item['aliyun_status'] = 'Success'
except Exception as e:
logger.error(f"等待视频上传完成失败: job_id={job_id}, error={str(e)}")
item['aliyun_video_id'] = ""
item['aliyun_status'] = 'Uploading'
raise # 重新抛出异常,让上层错误处理来处理
except Exception as e:
logger.error(f"阿里云视频URL上传失败: {str(e)}")
item['aliyun_status'] = 'Failed'
return item
class SQLitePipeline:
"""SQLite数据库处理中间件"""
def __init__(self, settings):
"""初始化SQLite中间件
Args:
settings: Scrapy设置对象
"""
self.db_manager = DatabaseManager(settings)
@classmethod
def from_crawler(cls, crawler):
return cls(crawler.settings)
def process_item(self, item, spider):
"""处理数据项保存到SQLite数据库
Args:
item: 爬取的数据项
spider: 爬虫实例
Returns:
item: 处理后的数据项
"""
now = datetime.now()
now_str = now.strftime('%Y-%m-%d %H:%M:%S')
with self.db_manager.sqlite_session() as session:
# 检查是否存在相同source_url的记录
existing_video = session.query(VideoSQLite).filter_by(source_url=item.get('source_url')).first()
if existing_video:
logger.info(f"发现重复视频: {item.get('source_url')}")
# 更新现有记录
existing_video.title = item.get('title', '')
existing_video.description = item.get('description', '')
existing_video.publish_time = item.get('publish_time', '')
existing_video.update_time = now_str
existing_video.video_url = item.get('video_url', '')
existing_video.source_thumbnail_url = item.get('source_thumbnail_url', '')
existing_video.duration = str(item.get('duration', ''))
existing_video.video_list = str(item.get('video_list', 0))
# 判断video_id、status、thumbnail_url防止被覆盖
if item.get('aliyun_video_id'):
existing_video.aliyun_video_id = item['aliyun_video_id']
if item.get('aliyun_status'):
existing_video.aliyun_status = item['aliyun_status']
if item.get('thumbnail_url'):
existing_video.thumbnail_url = item['thumbnail_url']
# existing_video.status = 0 # 重置状态为0
# 保存SQLite记录ID到item中供后续中间件使用
item['sqlite_id'] = existing_video.id
else:
# 创建新记录
sqlite_data = {
'title': item.get('title', ''),
'description': item.get('description', ''),
'source_url': item.get('source_url', ''),
'publish_time': item.get('publish_time', ''),
'create_time': now_str,
'update_time': now_str,
'video_url': item.get('video_url', ''),
'source_thumbnail_url': item.get('source_thumbnail_url', ''),
'thumbnail_url': item.get('thumbnail_url', ''),
'duration': str(item.get('duration', '')),
'video_list': item.get('video_list', ''),
'aliyun_video_id': item.get('aliyun_video_id', ''),
'aliyun_status': item.get('aliyun_status', ''),
'status': 0
}
new_video = VideoSQLite(**sqlite_data)
session.add(new_video)
session.flush() # 获取新插入记录的ID
# 保存SQLite记录ID到item中供后续中间件使用
item['sqlite_id'] = new_video.id
return item
class MariaDBPipeline:
"""将数据从SQLite迁移到MariaDB的管道"""
def __init__(self, settings):
"""初始化管道
Args:
settings: Scrapy设置对象
"""
self.db_manager = DatabaseManager(settings)
self.logger = logging.getLogger(__name__)
@classmethod
def from_crawler(cls, crawler):
"""从crawler创建管道实例
Args:
crawler: Scrapy crawler对象
Returns:
MariaDBPipeline: 管道实例
"""
return cls(crawler.settings)
def open_spider(self, spider):
"""当spider开启时调用"""
self.logger.info("MariaDB管道已开启")
def close_spider(self, spider):
"""当spider关闭时调用"""
self.logger.info("MariaDB管道已关闭")
self.migrate_data()
def process_item(self, item, spider):
"""处理item
Args:
item: Scrapy item对象
spider: Scrapy spider对象
Returns:
item: 处理后的item
"""
# 这里不需要处理item因为我们要从SQLite读取数据
return item
def migrate_data(self):
"""从SQLite迁移数据到MariaDB"""
try:
with self.db_manager.sqlite_session() as sqlite_session, \
self.db_manager.mysql_session() as mysql_session:
# 1. 从SQLite读取视频数据
sqlite_videos = sqlite_session.query(VideoSQLite).where((VideoSQLite.aliyun_video_id != None) & (VideoSQLite.aliyun_video_id != '')).all()
# sqlite_videos = sqlite_session.query(VideoSQLite).all()
# 2. 批量迁移到MariaDB
for video in sqlite_videos:
# 根据video_id查重
existing_video_id = mysql_session.execute(
text("SELECT id FROM wz_video WHERE video_remote_id = :video_remote_id LIMIT 1"), {
'video_remote_id': video.aliyun_video_id
}
)
if existing_video_id.first():
self.logger.info(f"远程数据库已存在该视频: {video.title}")
continue
# 映射到wz_video表
wz_video = {
'cid': 1,
'title': video.title or '',
'css': '',
'thumb': video.thumbnail_url or '',
'keywords': '',
'remark': video.description or '',
'block': 0,
'url': '',
'status': 9,
'route': 0,
'publisher': 'spider',
'addtime': int(time.time()),
'updatetime': int(time.time()),
'area': '1',
'category': '1',
'theme': 0,
'year': '2025',
'video_remote_id': video.aliyun_video_id or '',
'video_url': '',
'video_list': video.video_list or 0,
'month': '1'
}
# 映射到wz_video_data表
wz_video_data = {
'id': None, # 将在插入后设置
'content': '',
'coin': 0,
'groups': '',
'pagetype': 0,
'maxchars': 0,
'template': '',
'allowcomment': 1,
'relation': ''
}
# 插入wz_video并获取ID
result = mysql_session.execute(
text("""INSERT INTO wz_video (
cid, title, css, thumb, keywords, remark, block, url,
status, route, publisher, addtime, updatetime, area,
category, theme, year, video_remote_id, video_url,
video_list, month
) VALUES (
:cid, :title, :css, :thumb, :keywords, :remark, :block, :url,
:status, :route, :publisher, :addtime, :updatetime, :area,
:category, :theme, :year, :video_remote_id, :video_url,
:video_list, :month
)"""),
wz_video
)
video_id = result.lastrowid
# 设置wz_video_data的id并插入
wz_video_data['id'] = video_id
mysql_session.execute(
text("""INSERT INTO wz_video_data (
id, content, coin, groups, pagetype, maxchars,
template, allowcomment, relation
) VALUES (
:id, :content, :coin, :groups, :pagetype, :maxchars,
:template, :allowcomment, :relation
)"""),
wz_video_data
)
mysql_session.commit()
self.logger.info(f"成功迁移 {len(sqlite_videos)} 条视频数据到线上数据库")
except Exception as e:
self.logger.error(f"数据迁移失败: {str(e)}")
raise

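These pipelines are activated through ITEM_PIPELINES, which is currently commented out in scrapy_proj/settings.py; enabling them would look like the following (order values taken from that commented block):

```python
# scrapy_proj/settings.py
ITEM_PIPELINES = {
    "scrapy_proj.pipelines.AliyunVodPipeline": 300,  # upload video to Aliyun VOD
    "scrapy_proj.pipelines.SQLitePipeline": 400,     # persist to local SQLite
    "scrapy_proj.pipelines.MariaDBPipeline": 500,    # migrate to MariaDB on spider close
}
```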
121
scrapy_proj/settings.py Normal file

@@ -0,0 +1,121 @@
# Scrapy settings for scrapy_proj project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = "scrapy_proj"
SPIDER_MODULES = ["scrapy_proj.spiders"]
NEWSPIDER_MODULE = "scrapy_proj.spiders"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 8
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 8
CONCURRENT_REQUESTS_PER_IP = 8
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# "scrapy_proj.middlewares.ScrapyProjSpiderMiddleware": 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
"scrapy_proj.middlewares.ScrapyProjDownloaderMiddleware": 543,
}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# "scrapy.extensions.telnet.TelnetConsole": None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
# "scrapy_proj.pipelines.AliyunVodPipeline": 300, # 上传到阿里云视频点播
# "scrapy_proj.pipelines.SQLitePipeline": 400, # 保存到SQLite
# "scrapy_proj.pipelines.MariaDBPipeline": 500, # 最后保存到MariaDB
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
# Project root (used for the SQLite database file path)
import os
from dotenv import load_dotenv
load_dotenv()  # load environment variables from the .env file
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Database configuration
SQLITE_FILE = os.getenv("SQLITE_FILE", "data/videos.db")

# MariaDB configuration
MYSQL_HOST = os.getenv("MYSQL_HOST", "localhost")         # database host
MYSQL_PORT = int(os.getenv("MYSQL_PORT", "3306"))         # database port
MYSQL_USER = os.getenv("MYSQL_USER", "root")              # database user
MYSQL_PASSWORD = os.getenv("MYSQL_PASSWORD")              # database password
MYSQL_DATABASE = os.getenv("MYSQL_DATABASE", "dev_yszy")  # database name

# Aliyun configuration
ALIYUN_ACCESS_KEY_ID = os.getenv("ALIYUN_ACCESS_KEY_ID")          # Aliyun AccessKey ID
ALIYUN_ACCESS_KEY_SECRET = os.getenv("ALIYUN_ACCESS_KEY_SECRET")  # Aliyun AccessKey Secret
ALIYUN_TEMPLATE_GROUP_ID = os.getenv("ALIYUN_TEMPLATE_GROUP_ID")  # transcoding template group ID

# Aliyun OSS configuration
ALIYUN_OSS_BUCKET = os.getenv("ALIYUN_OSS_BUCKET")      # OSS bucket name
ALIYUN_OSS_ENDPOINT = os.getenv("ALIYUN_OSS_ENDPOINT")  # OSS endpoint

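The os.getenv calls above suggest a .env file along these lines (every value shown is a placeholder; .env itself is listed in .gitignore):

```ini
# .env (example values only)
SQLITE_FILE=data/videos.db
MYSQL_HOST=localhost
MYSQL_PORT=3306
MYSQL_USER=root
MYSQL_PASSWORD=change-me
MYSQL_DATABASE=dev_yszy
ALIYUN_ACCESS_KEY_ID=your-access-key-id
ALIYUN_ACCESS_KEY_SECRET=your-access-key-secret
ALIYUN_TEMPLATE_GROUP_ID=your-template-group-id
ALIYUN_OSS_BUCKET=your-bucket
ALIYUN_OSS_ENDPOINT=oss-cn-shanghai.aliyuncs.com
# Used by main.py / docker-compose.yml
HOST=0.0.0.0
PORT=8000
LOG_LEVEL=info
DATABASE_URL=sqlite:///data/videos.db
```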
4
scrapy_proj/spiders/__init__.py Normal file

@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

32
scrapy_proj/spiders/example.py Normal file

@@ -0,0 +1,32 @@
import scrapy
from typing import Optional

class ExampleSpider(scrapy.Spider):
    name = "example"

    def __init__(self, url: Optional[str] = None, *args, **kwargs):
        """Initialize the spider.

        Args:
            url: start URL (can be passed in through the API)
        """
        super(ExampleSpider, self).__init__(*args, **kwargs)
        self.start_urls = [url] if url else ["http://quotes.toscrape.com"]

    def parse(self, response):
        """Parse the page.

        This is a sample parser that scrapes quotes and authors from quotes.toscrape.com.
        """
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').get(),
                'author': quote.css('small.author::text').get(),
                'tags': quote.css('div.tags a.tag::text').getall(),
            }
        # Follow the next-page link
        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, self.parse)

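A hedged way to try this spider from the project root (the -a url argument is optional; quotes.json is an arbitrary output file):

```bash
scrapy crawl example -O quotes.json
# or with an explicit start URL
scrapy crawl example -a url="http://quotes.toscrape.com/page/1/" -O quotes.json
```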
110
scrapy_proj/spiders/zgjs.py Normal file

@@ -0,0 +1,110 @@
import re
import scrapy
from datetime import datetime
from urllib.parse import urljoin
from ..items import VideoItem
class ZGJSSpider(scrapy.Spider):
name = "zgjs"
allowed_domains = ["tv.81.cn"]
def __init__(self, url: str = None, video_list: int = 0 , *args, **kwargs):
"""初始化爬虫
Args:
url: 开始URL可通过API传入
"""
super(ZGJSSpider, self).__init__(*args, **kwargs)
self.video_list = video_list
self.start_urls = [url] if url else ["http://tv.81.cn/zgjs/jsjs/index.html"]
def parse(self, response):
"""解析列表页
Args:
response: 响应对象
"""
print("开始爬取")
# 限制请求次数
limit_status = False
limit_count = 3
# 解析视频列表
for video_item in response.xpath('//li[@class="content-box col-lg-2-10 col-sm-3-12 col-xs-6-12"]'):
if limit_status and limit_count <= 0:
return
limit_count -= 1
# 获取详情页URL
detail_url = video_item.xpath('.//a/@href').get()
if detail_url:
detail_url = urljoin(response.url, detail_url)
# 获取基本信息
item = VideoItem()
item['video_list'] = self.video_list
item['source_url'] = detail_url
item['source_thumbnail_url'] = str.format("http://tv.81.cn" + video_item.xpath('.//img/@src').get()) if video_item.xpath('.//img/@src').get() else ""
item['duration'] = video_item.xpath('.//div[@class="video-des"]//span/text()').get().strip()
item['publish_time'] = video_item.xpath('.//small[@class="time hidden"]/text()').get()
item['status'] = 0 # 初始状态:待处理
# 请求详情页
yield scrapy.Request(
url=detail_url,
callback=self.parse_detail,
meta={'item': item}
)
# 处理分页
# 使用正则匹配 createPageHTML 的参数
script_text = response.xpath('//script[contains(., "createPageHTML")]/text()').get()
if script_text:
page_match = re.findall(r"'([^']+)'", script_text)
if page_match:
max_page = int(page_match[0]) # 10
cur_page = int(page_match[1]) # 10
if max_page > cur_page:
next_page = urljoin(response.url, f"index_{cur_page + 1}.html")
if next_page and limit_status is False:
print(f"开始爬取下一页:{next_page}")
next_url = urljoin(response.url, next_page)
yield scrapy.Request(url=next_url, callback=self.parse)
def parse_detail(self, response):
"""解析详情页
Args:
response: 响应对象
"""
item = response.meta['item']
# 提取标题
item['title'] = response.xpath('//div[@class="video-header"]/h2/text()').get().strip()
# 提取视频简介
description = response.xpath('//div[@id="content-source"]/text()').get()
item['description'] = description.strip() if description else ""
# 提取视频URL
video_url = response.xpath('//div[@id="new_cmplayer"]/@data-media').get()
if video_url:
item['video_url'] = urljoin(response.url, video_url) if video_url else ""
# 处理时间格式
if item.get('publish_time'):
try:
# 假设时间格式为 "YYYY-MM-DD HH:MM:SS"
datetime.strptime(item['publish_time'], '%Y-%m-%d %H:%M:%S')
except ValueError:
# 如果解析失败,使用当前时间
item['publish_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
yield item
def closed(self, reason):
"""爬虫关闭时的回调函数
Args:
reason: 关闭原因
"""
self.logger.info(f'Spider closed: {reason}')

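Outside the API, the spider can also be run directly with the same arguments ZGJSSpider accepts; the URL and video_list shown are examples:

```bash
scrapy crawl zgjs \
  -a url="http://tv.81.cn/zgjs/jsjs/index.html" \
  -a video_list=1
```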
39
scripts/build-docker.sh Normal file

@@ -0,0 +1,39 @@
#!/bin/bash

# Version file
VERSION_FILE=".version"
# Default version
DEFAULT_VERSION="0.1"

# If no version is given, bump it automatically
if [ -z "$1" ]; then
    # Check for an existing, non-empty version file (-s)
    if [ -f "$VERSION_FILE" ] && [ -s "$VERSION_FILE" ]; then
        CURRENT_VERSION=$(cat "$VERSION_FILE")
        # Bump by 0.1 (only the simple "0.1" style is supported).
        # awk is used rather than bc because bc prints ".2" without a leading
        # zero, which would then fail the format check below.
        NEW_VERSION=$(awk -v v="$CURRENT_VERSION" 'BEGIN { printf "%.1f", v + 0.1 }')
    else
        NEW_VERSION="$DEFAULT_VERSION"
    fi
else
    # Use the manually specified version
    NEW_VERSION="$1"
fi

# Make sure the version looks valid (starts with a digit)
if [[ ! "$NEW_VERSION" =~ ^[0-9] ]]; then
    NEW_VERSION="$DEFAULT_VERSION"
fi

# Save the new version
echo "$NEW_VERSION" > "$VERSION_FILE"

# Build the Docker image
IMAGE_NAME="crawler_zgjs"
TAG="$IMAGE_NAME:$NEW_VERSION"

echo "🛠️ 构建Docker镜像: $TAG"
docker build -t "$TAG" .

echo "✅ 构建成功! 版本号: $NEW_VERSION"

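Typical invocations (a sketch; run from the repository root):

```bash
bash scripts/build-docker.sh        # bump the version in .version and build
bash scripts/build-docker.sh 0.3    # build with an explicit version tag
```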
95
scripts/export-docker.sh Executable file

@@ -0,0 +1,95 @@
#!/bin/bash
# 版本号文件
VERSION_FILE=".version"
# 默认镜像名称与build-docker.sh保持一致
DEFAULT_IMAGE_NAME="crawler_zgjs"
# 显示帮助信息
show_help() {
echo "使用方法: $0 [选项]"
echo
echo "选项:"
echo " -h, --help 显示帮助信息"
echo " -v, --version 指定版本号(可选,默认使用.version文件中的版本"
echo " -n, --name 指定镜像名称(可选,默认为 $DEFAULT_IMAGE_NAME"
echo " -o, --output 指定输出文件路径(可选,默认为 {镜像名称}-{版本号}.tar"
echo
echo "示例:"
echo " $0 # 使用默认设置导出镜像"
echo " $0 -v 1.0 # 导出指定版本的镜像"
echo " $0 -n custom_name -v 2.0 # 导出指定名称和版本的镜像"
echo " $0 -o /path/to/image.tar # 指定输出文件路径"
}
# 解析命令行参数
VERSION=""
IMAGE_NAME="$DEFAULT_IMAGE_NAME"
OUTPUT_FILE=""
while [[ $# -gt 0 ]]; do
case $1 in
-h|--help)
show_help
exit 0
;;
-v|--version)
VERSION="$2"
shift 2
;;
-n|--name)
IMAGE_NAME="$2"
shift 2
;;
-o|--output)
OUTPUT_FILE="$2"
shift 2
;;
*)
echo "❌ 错误: 未知参数 $1"
show_help
exit 1
;;
esac
done
# 如果未指定版本,从文件读取
if [ -z "$VERSION" ]; then
if [ -f "$VERSION_FILE" ] && [ -s "$VERSION_FILE" ]; then
VERSION=$(cat "$VERSION_FILE")
else
echo "❌ 错误: 未指定版本且无法从 $VERSION_FILE 读取版本号"
exit 1
fi
fi
# 构建完整的镜像标签
TAG="$IMAGE_NAME:$VERSION"
# 如果未指定输出文件,使用默认命名
if [ -z "$OUTPUT_FILE" ]; then
OUTPUT_FILE="${IMAGE_NAME}-${VERSION}.tar"
fi
# 检查镜像是否存在
if ! docker image inspect "$TAG" >/dev/null 2>&1; then
echo "❌ 错误: 镜像 $TAG 不存在"
echo "提示: 请先使用 build-docker.sh 构建镜像"
exit 1
fi
echo "🚀 开始导出Docker镜像..."
echo "📦 镜像: $TAG"
echo "📄 输出: $OUTPUT_FILE"
# 导出镜像
if docker save -o "$OUTPUT_FILE" "$TAG"; then
echo "✅ 导出成功!"
echo "📁 文件大小: $(du -h "$OUTPUT_FILE" | cut -f1)"
else
echo "❌ 导出失败!"
# 如果导出失败,清理可能部分写入的文件
[ -f "$OUTPUT_FILE" ] && rm "$OUTPUT_FILE"
exit 1
fi

2078
static/bootstrap/css/bootstrap-icons.css vendored Normal file

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

1006
static/index.html Normal file

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

2
static/layui/layui.js Normal file

File diff suppressed because one or more lines are too long