Initialize project
Commit d3834eb37e

.gitignore (vendored, new file)
@@ -0,0 +1,10 @@
.venv
venv

__pycache__

*.pyc
.env
data/videos.db

output

.vscode/settings.json (vendored, new file)
@@ -0,0 +1,5 @@
{
    "python.analysis.extraPaths": [
        "./scrapy_proj"
    ]
}

Dockerfile (new file)
@@ -0,0 +1,43 @@
# Use the official Python base image
FROM python:3.9-bookworm

# Set the working directory
WORKDIR /app

# Set the time zone (optional)
ENV TZ=Asia/Shanghai
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

# Use the Aliyun mirror for apt
RUN echo "deb https://mirrors.aliyun.com/debian/ bookworm main contrib non-free non-free-firmware" > /etc/apt/sources.list && \
    echo "deb https://mirrors.aliyun.com/debian/ bookworm-updates main contrib non-free non-free-firmware" >> /etc/apt/sources.list && \
    echo "deb https://mirrors.aliyun.com/debian-security bookworm-security main contrib non-free non-free-firmware" >> /etc/apt/sources.list

# Install system dependencies
RUN apt-get update && apt-get install -y \
    gcc \
    python3-dev \
    python3-pip \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements.txt and install Python dependencies (using the Aliyun pip mirror)
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt \
    -i https://mirrors.aliyun.com/pypi/simple/ \
    --trusted-host mirrors.aliyun.com

# Copy the project files
COPY . .

# Create the data directory
RUN mkdir -p /app/data

# Expose the service port
EXPOSE 8000

# Set environment variables
ENV PYTHONPATH=/app
ENV DATABASE_URL=sqlite:////app/data/videos.db

# Start command
CMD ["python", "main.py"]

README.md (new file)
@@ -0,0 +1,5 @@
Create a virtual environment: python -m venv .venv

Activate the virtual environment:
  Windows (PowerShell): .venv\Scripts\Activate.ps1
  Linux/macOS: source ./.venv/bin/activate
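The README stops after environment activation. The usual next steps for this project would be (an assumption inferred from requirements.txt, init_db.py and main.py, not text in this commit):
  Install dependencies: pip install -r requirements.txt
  Initialize the SQLite schema: python init_db.py
  Start the API server: python main.py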

docker-compose.yml (new file)
@@ -0,0 +1,18 @@
version: "3.8"

services:
  app:
    image: crawler_zgjs:0.1
    ports:
      - "${PORT:-8000}:8000"
    volumes:
      - ./data:/app/data
    env_file:
      - .env
    restart: unless-stopped
    networks:
      - app-network

networks:
  app-network:
    driver: bridge

init_db.py (new file)
@@ -0,0 +1,18 @@
import os
from sqlalchemy import create_engine
from scrapy_proj.models import Base

# Make sure the data directory exists
os.makedirs("data", exist_ok=True)

# Create the database engine
DATABASE_URL = "sqlite:///data/videos.db"
engine = create_engine(DATABASE_URL)

def init_db():
    """Initialize the database and create all tables."""
    Base.metadata.create_all(bind=engine)
    print("数据库表创建成功!")

if __name__ == "__main__":
    init_db()

main.py (new file)
@@ -0,0 +1,744 @@
|
||||
import os
|
||||
import sys
|
||||
import uuid
|
||||
import logging
|
||||
import time
|
||||
from typing import Dict, List, Optional, Literal
|
||||
from datetime import datetime
|
||||
from fastapi import FastAPI, HTTPException, BackgroundTasks
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from fastapi.responses import FileResponse
|
||||
from pydantic import BaseModel, HttpUrl
|
||||
import uvicorn
|
||||
from multiprocessing import Process
|
||||
from scrapy.crawler import CrawlerProcess
|
||||
from scrapy.utils.project import get_project_settings
|
||||
from apscheduler.schedulers.background import BackgroundScheduler
|
||||
from apscheduler.triggers.cron import CronTrigger
|
||||
from sqlalchemy import create_engine
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# 加载环境变量
|
||||
load_dotenv()
|
||||
|
||||
# 添加scrapy项目到Python路径
|
||||
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "scrapy_proj"))
|
||||
|
||||
# 导入爬虫
|
||||
from scrapy_proj.spiders.zgjs import ZGJSSpider
|
||||
from scrapy_proj.models import ScheduledTask
|
||||
|
||||
# 配置日志
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s [%(levelname)s] %(message)s",
|
||||
handlers=[
|
||||
logging.StreamHandler()
|
||||
]
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# 定时任务相关的Pydantic模型
|
||||
class ScheduledTaskCreate(BaseModel):
|
||||
"""创建定时任务请求模型"""
|
||||
name: str
|
||||
cron_expression: str
|
||||
spider_name: str
|
||||
url: str
|
||||
video_list: int
|
||||
enabled: bool = True
|
||||
|
||||
class ScheduledTaskUpdate(BaseModel):
|
||||
"""更新定时任务请求模型"""
|
||||
name: Optional[str] = None
|
||||
cron_expression: Optional[str] = None
|
||||
spider_name: Optional[str] = None
|
||||
url: Optional[str] = None
|
||||
video_list: Optional[int] = None
|
||||
enabled: Optional[bool] = None
|
||||
|
||||
class TaskStatus(BaseModel):
|
||||
"""任务状态响应模型"""
|
||||
status: Literal['pending', 'running', 'completed', 'failed']
|
||||
message: Optional[str] = None
|
||||
start_time: Optional[str] = None
|
||||
end_time: Optional[str] = None
|
||||
|
||||
class ScheduledTaskResponse(BaseModel):
|
||||
"""定时任务响应模型"""
|
||||
id: int
|
||||
name: str
|
||||
cron_expression: str
|
||||
spider_name: str
|
||||
url: str
|
||||
video_list: int
|
||||
enabled: bool
|
||||
create_time: str
|
||||
update_time: str
|
||||
|
||||
# 存储任务状态
|
||||
task_status_store: Dict[str, Dict] = {}
|
||||
|
||||
# 创建数据库引擎和会话
|
||||
DATABASE_URL = os.getenv("DATABASE_URL", "sqlite:///data/videos.db")
|
||||
engine = create_engine(DATABASE_URL)
|
||||
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
|
||||
|
||||
# 创建调度器
|
||||
scheduler = BackgroundScheduler()
|
||||
scheduler.start()
|
||||
|
||||
# 创建FastAPI应用
|
||||
app = FastAPI(
|
||||
title="Scrapy API",
|
||||
description="影视资源爬虫API",
|
||||
version="0.1.0",
|
||||
)
|
||||
|
||||
# 存储爬虫任务状态
|
||||
spider_tasks = {}
|
||||
|
||||
# Get a database session.
# Note: the original wrapped `return db` in try/finally with db.close(), which
# closes the session before the caller ever receives it; the endpoints below
# close their sessions themselves, so simply return a new session here.
def get_db():
    return SessionLocal()
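A common alternative (a sketch, not part of this commit) is to expose the session as a FastAPI dependency, so each request gets its own session and closing is handled automatically; the endpoints would then accept `db: Session = Depends(get_db_dependency)` instead of calling get_db() manually:

from fastapi import Depends
from sqlalchemy.orm import Session

def get_db_dependency():
    """Yield a session and always close it after the request (hypothetical helper)."""
    db = SessionLocal()
    try:
        yield db
    finally:
        db.close()

# usage sketch:
# @app.get("/api/scheduled-tasks")
# async def list_scheduled_tasks(db: Session = Depends(get_db_dependency)):
#     return db.query(ScheduledTask).all()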
|
||||
|
||||
class SpiderRequest(BaseModel):
|
||||
"""爬虫请求模型"""
|
||||
url: Optional[HttpUrl] = None
|
||||
spider_name: str = "example"
|
||||
settings: Optional[Dict] = None
|
||||
video_list: int
|
||||
|
||||
|
||||
class SpiderResponse(BaseModel):
|
||||
"""爬虫响应模型"""
|
||||
task_id: str
|
||||
status: str
|
||||
spider_name: str
|
||||
task_name: Optional[str] = None # 添加任务名称字段
|
||||
message: str
|
||||
start_time: Optional[str] = None
|
||||
end_time: Optional[str] = None
|
||||
|
||||
class PaginatedSpiderResponse(BaseModel):
|
||||
"""分页爬虫响应模型"""
|
||||
items: List[SpiderResponse]
|
||||
total: int
|
||||
page: int
|
||||
page_size: int
|
||||
total_pages: int
|
||||
|
||||
|
||||
def _run_spider_process(spider_name: str, url: Optional[str], video_list: int, settings: Dict):
|
||||
"""实际运行爬虫的进程函数"""
|
||||
try:
|
||||
# 获取项目设置
|
||||
crawler_settings = get_project_settings()
|
||||
|
||||
# 如果提供了自定义设置,则更新
|
||||
if settings:
|
||||
for key, value in settings.items():
|
||||
crawler_settings.set(key, value)
|
||||
|
||||
# 创建爬虫进程
|
||||
process = CrawlerProcess(settings=crawler_settings)
|
||||
|
||||
# 选择爬虫
|
||||
if spider_name == "zgjs":
|
||||
logger.info(f"启动爬虫 {spider_name},URL: {url}")
|
||||
process.crawl(ZGJSSpider, url=url, video_list=video_list)
|
||||
else:
|
||||
raise ValueError(f"未知的爬虫: {spider_name}")
|
||||
|
||||
# 启动爬虫
|
||||
process.start()
|
||||
|
||||
# 确保爬虫进程正确关闭
|
||||
try:
|
||||
if hasattr(process, '_active') and not process._active:
|
||||
logger.info("爬虫进程已完成")
|
||||
elif hasattr(process, 'bootstrap_stopped') and process.bootstrap_stopped: # type: ignore
|
||||
logger.info("爬虫进程正常停止")
|
||||
else:
|
||||
logger.warning("爬虫进程未正常停止,强制关闭")
|
||||
process.stop()
|
||||
|
||||
# 确保所有reactor线程都停止
|
||||
from twisted.internet import reactor
|
||||
if reactor.running: # type: ignore
|
||||
logger.info("停止reactor")
|
||||
reactor.stop() # type: ignore
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"关闭爬虫进程时出错: {str(e)}")
|
||||
sys.exit(1) # 非正常退出
|
||||
|
||||
# 确保进程退出
|
||||
logger.info("爬虫进程正常退出")
|
||||
sys.exit(0)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"爬虫进程运行错误: {str(e)}", exc_info=True)
|
||||
sys.exit(1) # 非正常退出
|
||||
|
||||
def run_spider(task_id: str, spider_name: str, task_name: Optional[str] = None, url: Optional[str] = None, video_list: int = 0, settings: Optional[Dict] = None):
|
||||
"""在后台运行爬虫
|
||||
|
||||
Args:
|
||||
task_id: 任务ID
|
||||
spider_name: 爬虫名称
|
||||
task_name: 任务名称
|
||||
url: 开始URL
|
||||
video_list: 视频列表数量
|
||||
settings: 爬虫设置
|
||||
"""
|
||||
try:
|
||||
# 确保settings是字典
|
||||
settings = settings or {}
|
||||
|
||||
# 初始化爬虫任务状态
|
||||
if task_id not in spider_tasks:
|
||||
spider_tasks[task_id] = {
|
||||
"status": "pending",
|
||||
"spider_name": spider_name,
|
||||
"task_name": task_name,
|
||||
"message": "爬虫任务初始化中",
|
||||
"started_at": time.time(),
|
||||
"finished_at": None
|
||||
}
|
||||
|
||||
# 创建并启动新进程
|
||||
p = Process(
|
||||
target=_run_spider_process,
|
||||
args=(spider_name, url, video_list, settings),
|
||||
daemon=True # 设置为守护进程,确保主进程退出时子进程也会退出
|
||||
)
|
||||
p.start()
|
||||
|
||||
# 更新任务状态为运行中
|
||||
spider_tasks[task_id].update({
|
||||
"status": "running",
|
||||
"message": "爬虫任务正在运行",
|
||||
"process": p,
|
||||
"started_at": time.time()
|
||||
})
|
||||
|
||||
# 启动一个线程来监控进程状态
|
||||
def monitor_process(process, task_id):
|
||||
process.join() # 等待进程结束
|
||||
if task_id in spider_tasks:
|
||||
finish_time = time.time()
|
||||
spider_tasks[task_id]["finished_at"] = finish_time
|
||||
|
||||
if process.exitcode == 0:
|
||||
spider_tasks[task_id].update({
|
||||
"status": "completed",
|
||||
"message": "爬虫任务成功完成"
|
||||
})
|
||||
else:
|
||||
spider_tasks[task_id].update({
|
||||
"status": "failed",
|
||||
"message": f"爬虫任务失败,退出码: {process.exitcode}"
|
||||
})
|
||||
|
||||
# 同步更新定时任务状态(如果存在)
|
||||
for scheduled_task_id, status_info in task_status_store.items():
|
||||
if status_info.get("spider_task_id") == task_id:
|
||||
status_info.update({
|
||||
"status": spider_tasks[task_id]["status"],
|
||||
"message": spider_tasks[task_id]["message"],
|
||||
"end_time": datetime.fromtimestamp(finish_time).strftime("%Y-%m-%d %H:%M:%S")
|
||||
})
|
||||
|
||||
from threading import Thread
|
||||
monitor_thread = Thread(target=monitor_process, args=(p, task_id))
|
||||
monitor_thread.daemon = True
|
||||
monitor_thread.start()
|
||||
|
||||
except Exception as e:
|
||||
error_time = time.time()
|
||||
# 更新任务状态为失败
|
||||
spider_tasks[task_id].update({
|
||||
"status": "failed",
|
||||
"message": f"启动爬虫进程错误: {str(e)}",
|
||||
"finished_at": error_time
|
||||
})
|
||||
|
||||
# 同步更新定时任务状态(如果存在)
|
||||
for scheduled_task_id, status_info in task_status_store.items():
|
||||
if status_info.get("spider_task_id") == task_id:
|
||||
status_info.update({
|
||||
"status": "failed",
|
||||
"message": f"启动爬虫进程错误: {str(e)}",
|
||||
"end_time": datetime.fromtimestamp(error_time).strftime("%Y-%m-%d %H:%M:%S")
|
||||
})
|
||||
|
||||
logger.error(f"启动爬虫进程错误: {str(e)}")
|
||||
|
||||
def cleanup_finished_processes():
|
||||
"""清理已完成的进程并更新状态"""
|
||||
for task_id, task_info in list(spider_tasks.items()):
|
||||
if "process" in task_info:
|
||||
process = task_info["process"]
|
||||
if not process.is_alive():
|
||||
# 获取进程退出码
|
||||
exitcode = process.exitcode
|
||||
|
||||
# 清理进程资源
|
||||
process.join()
|
||||
process.close()
|
||||
|
||||
# 当前时间
|
||||
current_time = time.time()
|
||||
|
||||
# 根据退出码更新状态
|
||||
if exitcode == 0:
|
||||
task_info["status"] = "completed"
|
||||
task_info["message"] = "爬虫任务成功完成"
|
||||
else:
|
||||
task_info["status"] = "failed"
|
||||
task_info["message"] = f"爬虫任务失败,退出码: {exitcode}"
|
||||
|
||||
# 记录完成时间(如果还没有设置的话)
|
||||
if "finished_at" not in task_info:
|
||||
task_info["finished_at"] = current_time
|
||||
|
||||
# 确保有开始时间
|
||||
if "started_at" not in task_info:
|
||||
task_info["started_at"] = task_info.get("finished_at", current_time)
|
||||
|
||||
del spider_tasks[task_id]["process"]
|
||||
|
||||
|
||||
@app.post("/api/spiders/run", response_model=SpiderResponse)
|
||||
async def start_spider(spider_request: SpiderRequest, background_tasks: BackgroundTasks):
|
||||
"""启动爬虫任务
|
||||
|
||||
Args:
|
||||
spider_request: 爬虫请求参数
|
||||
background_tasks: 后台任务
|
||||
|
||||
Returns:
|
||||
SpiderResponse: 爬虫响应
|
||||
"""
|
||||
# 生成任务ID
|
||||
task_id = str(uuid.uuid4())
|
||||
|
||||
if (spider_request.url is None):
|
||||
raise HTTPException(status_code=400, detail="缺少url参数")
|
||||
if (spider_request.video_list is None):
|
||||
raise HTTPException(status_code=400, detail="缺少video_list参数")
|
||||
|
||||
# 当前时间戳
|
||||
current_time = time.time()
|
||||
|
||||
# 记录任务信息
|
||||
spider_tasks[task_id] = {
|
||||
"status": "pending",
|
||||
"spider_name": spider_request.spider_name,
|
||||
"message": "爬虫任务已创建,等待执行",
|
||||
"started_at": current_time, # 添加开始时间
|
||||
"finished_at": None # 初始化结束时间为None
|
||||
}
|
||||
|
||||
# 在后台运行爬虫
|
||||
background_tasks.add_task(
|
||||
run_spider,
|
||||
task_id=task_id,
|
||||
spider_name=spider_request.spider_name,
|
||||
url=str(spider_request.url) if spider_request.url else None,
|
||||
video_list=spider_request.video_list,
|
||||
settings=spider_request.settings
|
||||
)
|
||||
|
||||
# 格式化时间为ISO格式
|
||||
start_time = datetime.fromtimestamp(current_time).isoformat()
|
||||
|
||||
return SpiderResponse(
|
||||
task_id=task_id,
|
||||
status="pending",
|
||||
spider_name=spider_request.spider_name,
|
||||
message="爬虫任务已创建,等待执行",
|
||||
start_time=start_time,
|
||||
end_time=None
|
||||
)
|
||||
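For reference, a minimal client for the run/status endpoints above (a sketch; it assumes the server is reachable at localhost:8000 with the defaults from the `__main__` block, and uses illustrative url/video_list values):

import time
import requests

BASE = "http://localhost:8000"

# Kick off the zgjs spider; url and video_list are required by SpiderRequest.
resp = requests.post(f"{BASE}/api/spiders/run", json={
    "spider_name": "zgjs",
    "url": "http://tv.81.cn/zgjs/jsjs/index.html",
    "video_list": 1,
})
task_id = resp.json()["task_id"]

# Poll the status endpoint until the background process finishes.
while True:
    status = requests.get(f"{BASE}/api/spiders/status/{task_id}").json()
    if status["status"] in ("completed", "failed"):
        print(status["message"])
        break
    time.sleep(5)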
|
||||
|
||||
@app.get("/api/spiders/status/{task_id}", response_model=SpiderResponse)
|
||||
async def get_spider_status(task_id: str):
|
||||
"""获取爬虫任务状态
|
||||
|
||||
Args:
|
||||
task_id: 任务ID
|
||||
|
||||
Returns:
|
||||
SpiderResponse: 爬虫响应
|
||||
"""
|
||||
# 先清理已完成进程
|
||||
cleanup_finished_processes()
|
||||
|
||||
if task_id not in spider_tasks:
|
||||
raise HTTPException(status_code=404, detail="任务不存在")
|
||||
|
||||
task_info = spider_tasks[task_id]
|
||||
|
||||
return SpiderResponse(
|
||||
task_id=task_id,
|
||||
status=task_info["status"],
|
||||
spider_name=task_info["spider_name"],
|
||||
task_name=task_info.get("task_name"),
|
||||
message=task_info["message"],
|
||||
start_time=datetime.fromtimestamp(task_info["started_at"]).strftime("%Y-%m-%d %H:%M:%S") if task_info.get("started_at") else None,
|
||||
end_time=datetime.fromtimestamp(task_info["finished_at"]).strftime("%Y-%m-%d %H:%M:%S") if task_info.get("finished_at") else None
|
||||
)
|
||||
|
||||
@app.post("/api/spiders/cleanup")
|
||||
async def cleanup_spiders():
|
||||
"""清理已完成的任务进程
|
||||
|
||||
Returns:
|
||||
Dict: 清理结果
|
||||
"""
|
||||
cleanup_finished_processes()
|
||||
return {"message": "已完成进程清理"}
|
||||
|
||||
|
||||
@app.get("/api/spiders/list", response_model=PaginatedSpiderResponse)
|
||||
async def list_spiders(page: int = 1, page_size: int = 10):
|
||||
"""列出爬虫任务(分页)
|
||||
|
||||
Args:
|
||||
page: 页码,从1开始
|
||||
page_size: 每页数量
|
||||
|
||||
Returns:
|
||||
PaginatedSpiderResponse: 分页的爬虫任务列表
|
||||
"""
|
||||
# 先清理已完成进程,确保状态最新
|
||||
cleanup_finished_processes()
|
||||
|
||||
# 计算分页参数
|
||||
all_tasks = list(spider_tasks.items())
|
||||
all_tasks.reverse() # 倒序
|
||||
total = len(all_tasks)
|
||||
total_pages = (total + page_size - 1) // page_size
|
||||
page = max(1, min(page, total_pages))
|
||||
|
||||
# 获取当前页的数据
|
||||
start = (page - 1) * page_size
|
||||
end = start + page_size
|
||||
paginated_tasks = all_tasks[start:end]
|
||||
|
||||
return PaginatedSpiderResponse(
|
||||
items=[
|
||||
SpiderResponse(
|
||||
task_id=str(task_id),
|
||||
status=task_info["status"],
|
||||
spider_name=task_info["spider_name"],
|
||||
task_name=task_info.get("task_name"),
|
||||
message=task_info["message"],
|
||||
start_time=datetime.fromtimestamp(task_info["started_at"]).strftime("%Y-%m-%d %H:%M:%S") if task_info.get("started_at") else None,
|
||||
end_time=datetime.fromtimestamp(task_info["finished_at"]).strftime("%Y-%m-%d %H:%M:%S") if task_info.get("finished_at") else None
|
||||
)
|
||||
for task_id, task_info in paginated_tasks
|
||||
],
|
||||
total=total,
|
||||
page=page,
|
||||
page_size=page_size,
|
||||
total_pages=total_pages
|
||||
)
|
||||
|
||||
|
||||
def add_job_to_scheduler(task: ScheduledTask):
|
||||
"""添加任务到调度器"""
|
||||
if task.enabled: # type: ignore
|
||||
# 创建一个包装函数来处理定时任务的状态更新
|
||||
def scheduled_spider_run():
|
||||
# 生成唯一的爬虫任务ID
|
||||
spider_task_id = str(uuid.uuid4())
|
||||
task_id_str = str(task.id)
|
||||
current_time_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
|
||||
# 记录定时任务状态
|
||||
task_status_store[task_id_str] = {
|
||||
"status": "running",
|
||||
"message": "定时任务自动执行中",
|
||||
"start_time": current_time_str,
|
||||
"end_time": None,
|
||||
"spider_task_id": spider_task_id
|
||||
}
|
||||
|
||||
# 运行爬虫
|
||||
run_spider(
|
||||
task_id=spider_task_id,
|
||||
spider_name=str(task.spider_name),
|
||||
task_name=str(task.name),
|
||||
url=str(task.url),
|
||||
video_list=task.video_list # type: ignore
|
||||
)
|
||||
|
||||
# 记录日志
|
||||
logger.info(f"定时任务 {task.name} (ID: {task.id}) 已自动执行,爬虫任务ID: {spider_task_id}")
|
||||
|
||||
# 添加到调度器
|
||||
scheduler.add_job(
|
||||
scheduled_spider_run,
|
||||
CronTrigger.from_crontab(task.cron_expression),
|
||||
id=str(task.id),
|
||||
replace_existing=True
|
||||
)
|
||||
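CronTrigger.from_crontab() expects a standard five-field crontab string, so the cron_expression stored in ScheduledTask must use that format; a couple of illustrative values (not from this commit):

from apscheduler.triggers.cron import CronTrigger

# Every day at 03:30
CronTrigger.from_crontab("30 3 * * *")
# Every Monday at 08:00
CronTrigger.from_crontab("0 8 * * 1")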
|
||||
@app.post("/api/scheduled-tasks", response_model=ScheduledTaskResponse)
|
||||
async def create_scheduled_task(task: ScheduledTaskCreate):
|
||||
"""创建定时任务"""
|
||||
db = get_db()
|
||||
current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
|
||||
db_task = ScheduledTask(
|
||||
name=task.name,
|
||||
cron_expression=task.cron_expression,
|
||||
spider_name=task.spider_name,
|
||||
url=task.url,
|
||||
video_list=task.video_list,
|
||||
enabled=task.enabled,
|
||||
create_time=current_time,
|
||||
update_time=current_time
|
||||
)
|
||||
|
||||
try:
|
||||
db.add(db_task)
|
||||
db.commit()
|
||||
db.refresh(db_task)
|
||||
|
||||
# 如果任务启用,添加到调度器
|
||||
if task.enabled:
|
||||
add_job_to_scheduler(db_task)
|
||||
|
||||
return db_task
|
||||
except Exception as e:
|
||||
db.rollback()
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
@app.get("/api/scheduled-tasks", response_model=List[ScheduledTaskResponse])
|
||||
async def list_scheduled_tasks():
|
||||
"""获取所有定时任务"""
|
||||
db = get_db()
|
||||
try:
|
||||
tasks = db.query(ScheduledTask).all()
|
||||
return tasks
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
@app.get("/api/scheduled-tasks/{task_id}", response_model=ScheduledTaskResponse)
|
||||
async def get_scheduled_task(task_id: int):
|
||||
"""获取指定定时任务"""
|
||||
db = get_db()
|
||||
try:
|
||||
task = db.query(ScheduledTask).filter(ScheduledTask.id == task_id).first()
|
||||
if task is None:
|
||||
raise HTTPException(status_code=404, detail="Task not found")
|
||||
return task
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
@app.put("/api/scheduled-tasks/{task_id}", response_model=ScheduledTaskResponse)
|
||||
async def update_scheduled_task(task_id: int, task_update: ScheduledTaskUpdate):
|
||||
"""更新定时任务"""
|
||||
db = get_db()
|
||||
try:
|
||||
db_task = db.query(ScheduledTask).filter(ScheduledTask.id == task_id).first()
|
||||
if db_task is None:
|
||||
raise HTTPException(status_code=404, detail="Task not found")
|
||||
|
||||
update_data = task_update.dict(exclude_unset=True)
|
||||
for key, value in update_data.items():
|
||||
setattr(db_task, key, value)
|
||||
|
||||
db_task.update_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") # type: ignore
|
||||
db.commit()
|
||||
db.refresh(db_task)
|
||||
|
||||
# 更新调度器中的任务
|
||||
job_id = str(db_task.id)
|
||||
if scheduler.get_job(job_id):
|
||||
scheduler.remove_job(job_id)
|
||||
if db_task.enabled: # type: ignore
|
||||
add_job_to_scheduler(db_task)
|
||||
|
||||
return db_task
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
@app.delete("/api/scheduled-tasks/{task_id}")
|
||||
async def delete_scheduled_task(task_id: int):
|
||||
"""删除定时任务"""
|
||||
db = get_db()
|
||||
try:
|
||||
db_task = db.query(ScheduledTask).filter(ScheduledTask.id == task_id).first()
|
||||
if db_task is None:
|
||||
raise HTTPException(status_code=404, detail="Task not found")
|
||||
|
||||
# 从调度器中移除任务
|
||||
job_id = str(db_task.id)
|
||||
if scheduler.get_job(job_id):
|
||||
scheduler.remove_job(job_id)
|
||||
|
||||
db.delete(db_task)
|
||||
db.commit()
|
||||
return {"message": "Task deleted successfully"}
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
@app.post("/api/scheduled-tasks/{task_id}/toggle")
|
||||
async def toggle_scheduled_task(task_id: int):
|
||||
"""启用/禁用定时任务"""
|
||||
db = get_db()
|
||||
try:
|
||||
db_task = db.query(ScheduledTask).filter(ScheduledTask.id == task_id).first()
|
||||
if db_task is None:
|
||||
raise HTTPException(status_code=404, detail="Task not found")
|
||||
|
||||
db_task.enabled = not db_task.enabled # type: ignore
|
||||
db_task.update_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") # type: ignore
|
||||
|
||||
# 更新调度器中的任务
|
||||
job_id = str(db_task.id)
|
||||
if scheduler.get_job(job_id):
|
||||
scheduler.remove_job(job_id)
|
||||
if db_task.enabled: # type: ignore
|
||||
add_job_to_scheduler(db_task)
|
||||
|
||||
db.commit()
|
||||
db.refresh(db_task)
|
||||
return {"message": f"Task {'enabled' if db_task.enabled else 'disabled'} successfully"} # type: ignore
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
@app.post("/api/scheduled-tasks/{task_id}/run")
|
||||
async def run_scheduled_task(task_id: int, background_tasks: BackgroundTasks):
|
||||
"""手动执行定时任务"""
|
||||
db = get_db()
|
||||
try:
|
||||
db_task = db.query(ScheduledTask).filter(ScheduledTask.id == task_id).first()
|
||||
if db_task is None:
|
||||
raise HTTPException(status_code=404, detail="Task not found")
|
||||
|
||||
# 生成唯一的爬虫任务ID
|
||||
spider_task_id = str(uuid.uuid4())
|
||||
|
||||
# 当前时间戳
|
||||
current_time = time.time()
|
||||
current_time_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
|
||||
# 记录爬虫任务信息
|
||||
spider_tasks[spider_task_id] = {
|
||||
"status": "pending",
|
||||
"spider_name": db_task.spider_name,
|
||||
"task_name": db_task.name,
|
||||
"message": "爬虫任务已创建,等待执行",
|
||||
"started_at": current_time,
|
||||
"finished_at": None
|
||||
}
|
||||
|
||||
# 记录定时任务状态
|
||||
task_status_store[str(task_id)] = {
|
||||
"status": "running",
|
||||
"message": "定时任务正在执行",
|
||||
"start_time": current_time_str,
|
||||
"end_time": None,
|
||||
"spider_task_id": spider_task_id, # 关联爬虫任务ID
|
||||
"task_name": db_task.name
|
||||
}
|
||||
|
||||
# 在后台运行爬虫
|
||||
background_tasks.add_task(
|
||||
run_spider,
|
||||
task_id=spider_task_id,
|
||||
spider_name=str(db_task.spider_name),
|
||||
task_name=str(db_task.name),
|
||||
url=str(db_task.url),
|
||||
video_list=db_task.video_list # type: ignore
|
||||
)
|
||||
|
||||
return {
|
||||
"message": "任务已开始执行",
|
||||
"task_id": task_id,
|
||||
"spider_task_id": spider_task_id,
|
||||
"task_name": db_task.name # 添加任务名称到响应
|
||||
}
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
@app.get("/api/task-status/{task_id}", response_model=TaskStatus)
|
||||
async def get_task_status(task_id: int):
|
||||
"""获取定时任务的执行状态"""
|
||||
task_id_str = str(task_id)
|
||||
|
||||
# 先清理已完成的爬虫进程,确保状态最新
|
||||
cleanup_finished_processes()
|
||||
|
||||
if task_id_str not in task_status_store:
|
||||
return TaskStatus(status="pending", message="任务未执行")
|
||||
|
||||
status_info = task_status_store[task_id_str]
|
||||
|
||||
# 如果任务正在运行,检查爬虫任务的状态
|
||||
if "spider_task_id" in status_info:
|
||||
spider_task_id = status_info["spider_task_id"]
|
||||
if spider_task_id in spider_tasks:
|
||||
spider_info = spider_tasks[spider_task_id]
|
||||
spider_status = spider_info["status"]
|
||||
spider_message = spider_info["message"]
|
||||
|
||||
# 同步状态
|
||||
if spider_status != status_info["status"]:
|
||||
status_info["status"] = spider_status
|
||||
status_info["message"] = spider_message
|
||||
|
||||
# 如果爬虫任务完成或失败,更新结束时间
|
||||
if spider_status in ["completed", "failed"]:
|
||||
status_info["end_time"] = datetime.fromtimestamp(
|
||||
spider_info.get("finished_at", time.time())
|
||||
).strftime("%Y-%m-%d %H:%M:%S")
|
||||
else:
|
||||
# 如果爬虫任务不存在且状态是running,可能是异常终止
|
||||
if status_info["status"] == "running":
|
||||
status_info["status"] = "failed"
|
||||
status_info["message"] = "爬虫任务异常终止"
|
||||
status_info["end_time"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
|
||||
return TaskStatus(
|
||||
status=status_info["status"],
|
||||
message=status_info["message"],
|
||||
start_time=status_info.get("start_time"),
|
||||
end_time=status_info.get("end_time")
|
||||
)
|
||||
|
||||
# 挂载静态文件目录
|
||||
app.mount("/static", StaticFiles(directory="static"), name="static")
|
||||
|
||||
@app.get("/")
|
||||
async def read_index():
|
||||
return FileResponse("static/index.html")
|
||||
|
||||
if __name__ == "__main__":
|
||||
host = os.getenv("HOST", "0.0.0.0")
|
||||
port = int(os.getenv("PORT", "8000"))
|
||||
log_level = os.getenv("LOG_LEVEL", "info")
|
||||
|
||||
# 启动服务器
|
||||
uvicorn.run(
|
||||
app if not os.getenv("RELOAD") else "main:app",
|
||||
host=host,
|
||||
port=port,
|
||||
log_level=log_level,
|
||||
reload=bool(os.getenv("RELOAD"))
|
||||
)
|

requirements.txt (new file)
@@ -0,0 +1,15 @@
scrapy>=2.11.0
aliyun-python-sdk-core>=2.13.3
aliyun-python-sdk-vod>=2.16.16
requests>=2.31.0
fastapi>=0.104.1
uvicorn>=0.24.0
pydantic>=2.5.1
python-multipart>=0.0.6
SQLAlchemy>=2.0.23
alembic>=1.12.1
pymysql>=1.1.0
oss2>=2.19.1
apscheduler>=3.11.0
jinja2>=3.1.6
dotenv>=0.9.9

scrapy.cfg (new file)
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = scrapy_proj.settings

[deploy]
#url = http://localhost:6800/
project = scrapy_proj

scrapy_proj/__init__.py (new, empty file)

scrapy_proj/database.py (new file)
@@ -0,0 +1,99 @@
import os
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.pool import QueuePool
from contextlib import contextmanager
from .models import Base

class DatabaseManager:
    """Database manager"""

    def __init__(self, settings):
        """Initialize the database manager

        Args:
            settings: Scrapy settings object
        """
        self.sqlite_file = settings.get('SQLITE_FILE', 'videos.db')
        self.mysql_config = {
            'host': settings.get('MYSQL_HOST', 'localhost'),
            'port': settings.get('MYSQL_PORT', 3306),
            'user': settings.get('MYSQL_USER', 'root'),
            'password': settings.get('MYSQL_PASSWORD', ''),
            'database': settings.get('MYSQL_DATABASE', 'crawler'),
        }

        # Initialize the database engines
        self._init_sqlite()
        self._init_mysql()

        # Create the session factories
        self.sqlite_session_maker = sessionmaker(bind=self.sqlite_engine)
        self.mysql_session_maker = sessionmaker(bind=self.mysql_engine)

    def _init_sqlite(self):
        """Initialize the SQLite database"""
        # Make sure the database directory exists
        db_dir = os.path.dirname(self.sqlite_file)
        if db_dir and not os.path.exists(db_dir):
            os.makedirs(db_dir)

        # Create the SQLite engine
        self.sqlite_engine = create_engine(
            f'sqlite:///{self.sqlite_file}',
            poolclass=QueuePool,
            pool_size=5,
            max_overflow=10,
            pool_timeout=30
        )

        # Automatically create all tables
        Base.metadata.create_all(self.sqlite_engine)

    def _init_mysql(self):
        """Initialize the MySQL/MariaDB database"""
        # Create the MySQL engine
        self.mysql_engine = create_engine(
            'mysql+pymysql://{user}:{password}@{host}:{port}/{database}?charset=utf8mb4'.format(
                **self.mysql_config
            ),
            poolclass=QueuePool,
            pool_size=5,
            max_overflow=10,
            pool_timeout=30,
            pool_pre_ping=True  # detect and replace stale connections automatically
        )

    @contextmanager
    def sqlite_session(self):
        """SQLite session context manager

        Yields:
            Session: SQLite database session
        """
        session = self.sqlite_session_maker()
        try:
            yield session
            session.commit()
        except Exception:
            session.rollback()
            raise
        finally:
            session.close()

    @contextmanager
    def mysql_session(self):
        """MySQL session context manager

        Yields:
            Session: MySQL database session
        """
        session = self.mysql_session_maker()
        try:
            yield session
            session.commit()
        except Exception:
            session.rollback()
            raise
        finally:
            session.close()
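A short usage sketch of DatabaseManager outside the pipelines (it assumes the script is run from the project root so get_project_settings() can find scrapy.cfg):

from scrapy.utils.project import get_project_settings
from scrapy_proj.database import DatabaseManager
from scrapy_proj.models import VideoSQLite

db_manager = DatabaseManager(get_project_settings())

# The context manager commits on success and rolls back on error.
with db_manager.sqlite_session() as session:
    pending = session.query(VideoSQLite).filter_by(status=0).count()
    print(f"pending videos: {pending}")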

scrapy_proj/items.py (new file)
@@ -0,0 +1,31 @@
from scrapy import Item, Field
from datetime import datetime

class VideoItem(Item):
    """Video information item"""
    # Basic information
    title = Field()         # title
    description = Field()   # description
    source_url = Field()    # source URL
    publish_time = Field()  # publish time
    create_time = Field(serializer=lambda x: datetime.now().strftime('%Y-%m-%d %H:%M:%S'))  # creation time
    update_time = Field(serializer=lambda x: datetime.now().strftime('%Y-%m-%d %H:%M:%S'))  # update time

    # Media information
    video_url = Field()             # video URL
    source_thumbnail_url = Field()  # original thumbnail URL
    thumbnail_url = Field()         # thumbnail URL (after upload)
    duration = Field()              # video duration

    # Extra information used to assist data import/migration
    video_list = Field()  # video category ID

    # Aliyun VOD information
    aliyun_video_id = Field()  # Aliyun video ID
    aliyun_status = Field()    # Aliyun processing status

    # Other information
    status = Field()  # status (0: pending, 1: processing, 2: done, -1: failed)

    sqlite_id = Field()  # SQLite record ID
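Note that Field(serializer=...) only applies when items are written by a feed exporter; the pipelines in this project set create_time/update_time themselves. A minimal sketch of filling the item (illustrative values):

from scrapy_proj.items import VideoItem

item = VideoItem()
item['title'] = '示例视频'
item['source_url'] = 'http://tv.81.cn/example.html'
item['video_list'] = 1
item['status'] = 0
print(dict(item))  # plain dict view of the populated fields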

scrapy_proj/middlewares.py (new file)
@@ -0,0 +1,100 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class ScrapyProjSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    async def process_start(self, start):
        # Called with an async iterator over the spider start() method or the
        # matching method of an earlier spider middleware.
        async for item_or_request in start:
            yield item_or_request

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class ScrapyProjDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)

scrapy_proj/models.py (new file)
@@ -0,0 +1,38 @@
from sqlalchemy import Column, Integer, String, Text, DateTime, SmallInteger, Boolean
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class ScheduledTask(Base):
    """Scheduled crawl task"""
    __tablename__ = 'scheduled_tasks'

    id = Column(Integer, primary_key=True, autoincrement=True)
    name = Column(String(100), nullable=False)
    cron_expression = Column(String(100), nullable=False)  # crontab expression
    spider_name = Column(String(50), nullable=False)       # spider name
    url = Column(Text, nullable=False)                     # URL to crawl
    video_list = Column(Integer, nullable=False)           # video category ID
    enabled = Column(Boolean, default=True)                # whether the task is enabled
    create_time = Column(Text)  # stored as TEXT in SQLite
    update_time = Column(Text)  # stored as TEXT in SQLite

class VideoSQLite(Base):
    """Video record stored in SQLite"""
    __tablename__ = 'videos'

    id = Column(Integer, primary_key=True, autoincrement=True)
    title = Column(Text)
    description = Column(Text)
    source_url = Column(Text, unique=True)
    publish_time = Column(Text)  # stored as TEXT in SQLite
    create_time = Column(Text)
    update_time = Column(Text)
    video_url = Column(Text)
    source_thumbnail_url = Column(Text)
    thumbnail_url = Column(Text)
    duration = Column(Text)  # duration is TEXT in SQLite
    aliyun_video_id = Column(Text)
    aliyun_status = Column(Text)
    status = Column(Integer)      # 0 = default, 1 = migrated
    video_list = Column(Integer)  # video category ID
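A quick sketch of working with these models directly, outside the API (it assumes init_db.py has already created data/videos.db; the field values are illustrative):

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from scrapy_proj.models import ScheduledTask

engine = create_engine("sqlite:///data/videos.db")
Session = sessionmaker(bind=engine)

with Session() as session:
    session.add(ScheduledTask(
        name="nightly zgjs crawl",
        cron_expression="0 2 * * *",
        spider_name="zgjs",
        url="http://tv.81.cn/zgjs/jsjs/index.html",
        video_list=1,
        enabled=True,
        create_time="2025-01-01 00:00:00",
        update_time="2025-01-01 00:00:00",
    ))
    session.commit()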

scrapy_proj/pipelines.py (new file)
@@ -0,0 +1,479 @@
|
||||
from datetime import datetime
|
||||
import logging
|
||||
import json
|
||||
import requests
|
||||
import time
|
||||
import base64
|
||||
import oss2
|
||||
from aliyunsdkcore.client import AcsClient
|
||||
from aliyunsdkvod.request.v20170321.CreateUploadVideoRequest import CreateUploadVideoRequest
|
||||
from aliyunsdkvod.request.v20170321.GetVideoInfoRequest import GetVideoInfoRequest
|
||||
from aliyunsdkvod.request.v20170321.UpdateVideoInfoRequest import UpdateVideoInfoRequest
|
||||
from aliyunsdkvod.request.v20170321.CreateUploadImageRequest import CreateUploadImageRequest
|
||||
from sqlalchemy import text
|
||||
from .database import DatabaseManager
|
||||
from .models import VideoSQLite
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AliyunVodPipeline:
|
||||
"""阿里云视频点播处理中间件"""
|
||||
|
||||
def __init__(self, settings):
|
||||
"""初始化阿里云视频点播中间件
|
||||
初始化SQLite中间件
|
||||
|
||||
Args:
|
||||
settings: Scrapy设置对象
|
||||
"""
|
||||
self.settings = settings
|
||||
self.access_key_id = settings.get('ALIYUN_ACCESS_KEY_ID')
|
||||
self.access_key_secret = settings.get('ALIYUN_ACCESS_KEY_SECRET')
|
||||
self.template_group_id = settings.get('ALIYUN_TEMPLATE_GROUP_ID')
|
||||
self.client = AcsClient(self.access_key_id, self.access_key_secret, 'cn-shanghai')
|
||||
self.oss_client = oss2.Auth(self.access_key_id, self.access_key_secret)
|
||||
|
||||
self.db_manager = DatabaseManager(settings)
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
return cls(crawler.settings)
|
||||
|
||||
def upload_media_by_url(self, video_url, title, cover_url=None):
|
||||
"""通过URL上传视频到阿里云VOD
|
||||
|
||||
Args:
|
||||
video_url: 视频URL
|
||||
title: 视频标题
|
||||
cover_url: 封面URL
|
||||
|
||||
Returns:
|
||||
str: 上传任务ID (JobId)
|
||||
"""
|
||||
from aliyunsdkvod.request.v20170321.UploadMediaByURLRequest import UploadMediaByURLRequest
|
||||
|
||||
request = UploadMediaByURLRequest()
|
||||
request.set_accept_format('JSON')
|
||||
|
||||
# 设置视频URL
|
||||
logger.info(f"上传视频URL: {video_url}")
|
||||
request.set_UploadURLs(video_url)
|
||||
|
||||
# 设置视频信息(需要是JSON数组字符串)
|
||||
upload_metadata = [{
|
||||
'Title': title,
|
||||
'SourceURL': video_url,
|
||||
'TemplateGroupId': self.template_group_id
|
||||
}]
|
||||
# 设置封面URL
|
||||
# if cover_url:
|
||||
# upload_metadata[0]['CoverURL'] = cover_url
|
||||
|
||||
request.set_UploadMetadatas(json.dumps(upload_metadata))
|
||||
|
||||
response = self.client.do_action_with_exception(request)
|
||||
result = json.loads(response)
|
||||
|
||||
# 返回第一个上传任务的JobId
|
||||
upload_jobs = result.get('UploadJobs', [])
|
||||
if not upload_jobs:
|
||||
raise Exception("No upload job created")
|
||||
|
||||
job = upload_jobs[0]
|
||||
# if job.get('Code') != 'Success':
|
||||
# raise Exception(f"Upload job failed: {job}")
|
||||
|
||||
return job.get('JobId') # 返回JobId而不是VideoId
|
||||
|
||||
def get_upload_job_status(self, job_id):
|
||||
"""获取上传任务状态
|
||||
|
||||
Args:
|
||||
job_id: 上传任务ID
|
||||
|
||||
Returns:
|
||||
dict: 任务状态信息,包含VideoId(如果上传完成)
|
||||
"""
|
||||
from aliyunsdkvod.request.v20170321.GetURLUploadInfosRequest import GetURLUploadInfosRequest
|
||||
|
||||
request = GetURLUploadInfosRequest()
|
||||
request.set_accept_format('JSON')
|
||||
request.set_JobIds(job_id)
|
||||
|
||||
response = self.client.do_action_with_exception(request)
|
||||
result = json.loads(response)
|
||||
|
||||
upload_jobs = result.get('URLUploadInfoList', [])
|
||||
if not upload_jobs:
|
||||
raise Exception(f"No upload job found with ID: {job_id}")
|
||||
|
||||
job = upload_jobs[0]
|
||||
return job
|
||||
|
||||
def wait_for_video_id(self, job_id, max_retries=5, retry_interval=2):
|
||||
"""等待上传任务完成并获取VideoId
|
||||
|
||||
Args:
|
||||
job_id: 上传任务ID
|
||||
max_retries: 最大重试次数
|
||||
retry_interval: 重试间隔(秒)
|
||||
|
||||
Returns:
|
||||
str: 视频ID
|
||||
"""
|
||||
import time
|
||||
|
||||
for i in range(max_retries):
|
||||
job_status = self.get_upload_job_status(job_id)
|
||||
|
||||
if job_status.get('MediaId'):
|
||||
return job_status.get('MediaId')
|
||||
|
||||
# 等待一段时间后重试
|
||||
time.sleep(retry_interval)
|
||||
|
||||
raise Exception(f"Max retries reached, upload job not completed: {job_id}")
|
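wait_for_video_id polls at a fixed interval (5 tries, 2 s apart by default, roughly 10 s total), which can be too short for large source videos. A hedged variant with exponential backoff that could be added to AliyunVodPipeline (a sketch, not part of this commit; it reuses the module-level time import and get_upload_job_status above):

    def wait_for_video_id_backoff(self, job_id, max_wait_seconds=300):
        """Poll GetURLUploadInfos with exponential backoff until MediaId appears (sketch)."""
        delay, waited = 2, 0
        while waited < max_wait_seconds:
            job_status = self.get_upload_job_status(job_id)
            if job_status.get('MediaId'):
                return job_status['MediaId']
            time.sleep(delay)
            waited += delay
            delay = min(delay * 2, 30)  # cap the polling interval at 30 seconds
        raise Exception(f"Upload job {job_id} not completed within {max_wait_seconds}s")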
||||
|
||||
def upload_image_to_oss(self, image_url, title):
|
||||
"""直接上传图片到阿里云OSS
|
||||
|
||||
Args:
|
||||
image_url: 图片URL
|
||||
title: 图片标题
|
||||
|
||||
Returns:
|
||||
str: OSS中的图片URL
|
||||
"""
|
||||
logger.info(f"开始上传图片到OSS: {image_url}")
|
||||
|
||||
try:
|
||||
# 1. 下载远程图片
|
||||
image_response = requests.get(image_url, timeout=30)
|
||||
image_response.raise_for_status()
|
||||
image_content = image_response.content
|
||||
except Exception as e:
|
||||
logger.error(f"下载图片失败: {str(e)}")
|
||||
raise Exception(f"下载图片失败: {str(e)}")
|
||||
|
||||
try:
|
||||
# 2. 生成OSS中的文件名(使用时间戳和原始文件名的组合)
|
||||
timestamp = int(time.time())
|
||||
file_ext = image_url.split('.')[-1] if '.' in image_url else 'jpg'
|
||||
oss_filename = f"images/{timestamp}_{title[:30]}.{file_ext}" # 限制标题长度,避免文件名过长
|
||||
|
||||
# 3. 获取OSS bucket(从settings中获取配置)
|
||||
bucket_name = self.settings.get('ALIYUN_OSS_BUCKET')
|
||||
endpoint = self.settings.get('ALIYUN_OSS_ENDPOINT')
|
||||
oss_bucket = oss2.Bucket(self.oss_client, endpoint, bucket_name)
|
||||
|
||||
# 4. 上传图片到OSS
|
||||
upload_response = oss_bucket.put_object(oss_filename, image_content)
|
||||
if upload_response.status == 200:
|
||||
# 5. 返回可访问的URL
|
||||
oss_url = f"https://{bucket_name}.{endpoint}/{oss_filename}"
|
||||
logger.info(f"图片上传成功: {oss_url}")
|
||||
return oss_url
|
||||
else:
|
||||
raise Exception(f"图片上传失败: {upload_response.status}")
|
||||
except Exception as e:
|
||||
logger.error(f"上传图片到OSS失败: {str(e)}")
|
||||
raise Exception(f"上传图片到OSS失败: {str(e)}")
|
||||
|
||||
|
||||
def process_item(self, item, spider):
|
||||
"""处理数据项,通过URL上传视频到阿里云VOD
|
||||
|
||||
Args:
|
||||
item: 爬取的数据项
|
||||
spider: 爬虫实例
|
||||
|
||||
Returns:
|
||||
item: 处理后的数据项
|
||||
"""
|
||||
# 如果已经有阿里云视频ID,跳过处理
|
||||
with self.db_manager.sqlite_session() as session:
|
||||
# 检查是否存在相同source_url的记录
|
||||
existing_video = session.query(VideoSQLite).filter_by(source_url=item.get('source_url')).first()
|
||||
if existing_video and existing_video.aliyun_video_id:
|
||||
logger.info(f"阿里云视频ID已存在,跳过该任务: {item.get('title')}")
|
||||
return item
|
||||
|
||||
video_url = item.get('video_url')
|
||||
if not video_url:
|
||||
logger.warning(f"视频URL为空,跳过处理: {item.get('source_url')}")
|
||||
return item
|
||||
|
||||
        try:
            # 1. Upload the cover image to OSS, if there is one. Initialize oss_url
            # first so the later upload_media_by_url(cover_url=oss_url) call cannot
            # raise NameError when no cover exists or the cover upload fails.
            oss_url = None
            cover_url = item.get('source_thumbnail_url')
|
||||
if cover_url:
|
||||
try:
|
||||
oss_url = self.upload_image_to_oss(
|
||||
image_url=cover_url,
|
||||
title=item.get('title', '')
|
||||
)
|
||||
# 更新item中的封面URL为OSS URL
|
||||
item['thumbnail_url'] = oss_url
|
||||
logger.info(f"封面图片上传到OSS成功: {oss_url}")
|
||||
except Exception as e:
|
||||
logger.error(f"封面图片上传到OSS失败: {str(e)}")
|
||||
# 如果封面上传失败,继续处理视频,不中断流程
|
||||
|
||||
# 2. 通过URL上传视频,获取JobId
|
||||
title = item.get('title', '')
|
||||
job_id = self.upload_media_by_url(
|
||||
video_url=video_url,
|
||||
title=title,
|
||||
cover_url=oss_url # 使用刚上传的OSS封面URL
|
||||
)
|
||||
|
||||
logger.info(f"成功创建阿里云视频URL上传任务: job_id={job_id}, title={title}")
|
||||
|
||||
# 2. 等待上传完成并获取VideoId
|
||||
try:
|
||||
video_id = self.wait_for_video_id(job_id)
|
||||
logger.info(f"视频上传完成: video_id={video_id}, job_id={job_id}")
|
||||
|
||||
# 3. 更新item中的阿里云视频ID和状态
|
||||
item['aliyun_video_id'] = video_id
|
||||
item['aliyun_status'] = 'Success'
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"等待视频上传完成失败: job_id={job_id}, error={str(e)}")
|
||||
item['aliyun_video_id'] = ""
|
||||
item['aliyun_status'] = 'Uploading'
|
||||
raise # 重新抛出异常,让上层错误处理来处理
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"阿里云视频URL上传失败: {str(e)}")
|
||||
item['aliyun_status'] = 'Failed'
|
||||
|
||||
return item
|
||||
|
||||
|
||||
class SQLitePipeline:
|
||||
"""SQLite数据库处理中间件"""
|
||||
|
||||
def __init__(self, settings):
|
||||
"""初始化SQLite中间件
|
||||
|
||||
Args:
|
||||
settings: Scrapy设置对象
|
||||
"""
|
||||
self.db_manager = DatabaseManager(settings)
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
return cls(crawler.settings)
|
||||
|
||||
def process_item(self, item, spider):
|
||||
"""处理数据项,保存到SQLite数据库
|
||||
|
||||
Args:
|
||||
item: 爬取的数据项
|
||||
spider: 爬虫实例
|
||||
|
||||
Returns:
|
||||
item: 处理后的数据项
|
||||
"""
|
||||
now = datetime.now()
|
||||
now_str = now.strftime('%Y-%m-%d %H:%M:%S')
|
||||
|
||||
with self.db_manager.sqlite_session() as session:
|
||||
# 检查是否存在相同source_url的记录
|
||||
existing_video = session.query(VideoSQLite).filter_by(source_url=item.get('source_url')).first()
|
||||
|
||||
if existing_video:
|
||||
logger.info(f"发现重复视频: {item.get('source_url')}")
|
||||
# 更新现有记录
|
||||
existing_video.title = item.get('title', '')
|
||||
existing_video.description = item.get('description', '')
|
||||
existing_video.publish_time = item.get('publish_time', '')
|
||||
existing_video.update_time = now_str
|
||||
existing_video.video_url = item.get('video_url', '')
|
||||
existing_video.source_thumbnail_url = item.get('source_thumbnail_url', '')
|
||||
existing_video.duration = str(item.get('duration', ''))
|
||||
existing_video.video_list = str(item.get('video_list', 0))
|
||||
# 判断video_id、status、thumbnail_url防止被覆盖
|
||||
if item.get('aliyun_video_id'):
|
||||
existing_video.aliyun_video_id = item['aliyun_video_id']
|
||||
if item.get('aliyun_status'):
|
||||
existing_video.aliyun_status = item['aliyun_status']
|
||||
if item.get('thumbnail_url'):
|
||||
existing_video.thumbnail_url = item['thumbnail_url']
|
||||
# existing_video.status = 0 # 重置状态为0
|
||||
|
||||
# 保存SQLite记录ID到item中,供后续中间件使用
|
||||
item['sqlite_id'] = existing_video.id
|
||||
|
||||
else:
|
||||
# 创建新记录
|
||||
sqlite_data = {
|
||||
'title': item.get('title', ''),
|
||||
'description': item.get('description', ''),
|
||||
'source_url': item.get('source_url', ''),
|
||||
'publish_time': item.get('publish_time', ''),
|
||||
'create_time': now_str,
|
||||
'update_time': now_str,
|
||||
'video_url': item.get('video_url', ''),
|
||||
'source_thumbnail_url': item.get('source_thumbnail_url', ''),
|
||||
'thumbnail_url': item.get('thumbnail_url', ''),
|
||||
'duration': str(item.get('duration', '')),
|
||||
'video_list': item.get('video_list', ''),
|
||||
'aliyun_video_id': item.get('aliyun_video_id', ''),
|
||||
'aliyun_status': item.get('aliyun_status', ''),
|
||||
'status': 0
|
||||
}
|
||||
|
||||
new_video = VideoSQLite(**sqlite_data)
|
||||
session.add(new_video)
|
||||
session.flush() # 获取新插入记录的ID
|
||||
|
||||
# 保存SQLite记录ID到item中,供后续中间件使用
|
||||
item['sqlite_id'] = new_video.id
|
||||
|
||||
return item
|
||||
|
||||
|
||||
class MariaDBPipeline:
|
||||
"""将数据从SQLite迁移到MariaDB的管道"""
|
||||
|
||||
def __init__(self, settings):
|
||||
"""初始化管道
|
||||
|
||||
Args:
|
||||
settings: Scrapy设置对象
|
||||
"""
|
||||
self.db_manager = DatabaseManager(settings)
|
||||
self.logger = logging.getLogger(__name__)
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
"""从crawler创建管道实例
|
||||
|
||||
Args:
|
||||
crawler: Scrapy crawler对象
|
||||
|
||||
Returns:
|
||||
MariaDBPipeline: 管道实例
|
||||
"""
|
||||
return cls(crawler.settings)
|
||||
|
||||
def open_spider(self, spider):
|
||||
"""当spider开启时调用"""
|
||||
self.logger.info("MariaDB管道已开启")
|
||||
|
||||
def close_spider(self, spider):
|
||||
"""当spider关闭时调用"""
|
||||
self.logger.info("MariaDB管道已关闭")
|
||||
self.migrate_data()
|
||||
|
||||
def process_item(self, item, spider):
|
||||
"""处理item
|
||||
|
||||
Args:
|
||||
item: Scrapy item对象
|
||||
spider: Scrapy spider对象
|
||||
|
||||
Returns:
|
||||
item: 处理后的item
|
||||
"""
|
||||
# 这里不需要处理item,因为我们要从SQLite读取数据
|
||||
return item
|
||||
|
||||
def migrate_data(self):
|
||||
"""从SQLite迁移数据到MariaDB"""
|
||||
try:
|
||||
with self.db_manager.sqlite_session() as sqlite_session, \
|
||||
self.db_manager.mysql_session() as mysql_session:
|
||||
|
||||
# 1. 从SQLite读取视频数据
|
||||
sqlite_videos = sqlite_session.query(VideoSQLite).where((VideoSQLite.aliyun_video_id != None) & (VideoSQLite.aliyun_video_id != '')).all()
|
||||
# sqlite_videos = sqlite_session.query(VideoSQLite).all()
|
||||
|
||||
# 2. 批量迁移到MariaDB
|
||||
for video in sqlite_videos:
|
||||
# 根据video_id查重
|
||||
existing_video_id = mysql_session.execute(
|
||||
text("SELECT id FROM wz_video WHERE video_remote_id = :video_remote_id LIMIT 1"), {
|
||||
'video_remote_id': video.aliyun_video_id
|
||||
}
|
||||
)
|
||||
if existing_video_id.first():
|
||||
self.logger.info(f"远程数据库已存在该视频: {video.title}")
|
||||
continue
|
||||
|
||||
# 映射到wz_video表
|
||||
wz_video = {
|
||||
'cid': 1,
|
||||
'title': video.title or '',
|
||||
'css': '',
|
||||
'thumb': video.thumbnail_url or '',
|
||||
'keywords': '',
|
||||
'remark': video.description or '',
|
||||
'block': 0,
|
||||
'url': '',
|
||||
'status': 9,
|
||||
'route': 0,
|
||||
'publisher': 'spider',
|
||||
'addtime': int(time.time()),
|
||||
'updatetime': int(time.time()),
|
||||
'area': '1',
|
||||
'category': '1',
|
||||
'theme': 0,
|
||||
'year': '2025',
|
||||
'video_remote_id': video.aliyun_video_id or '',
|
||||
'video_url': '',
|
||||
'video_list': video.video_list or 0,
|
||||
'month': '1'
|
||||
}
|
||||
|
||||
# 映射到wz_video_data表
|
||||
wz_video_data = {
|
||||
'id': None, # 将在插入后设置
|
||||
'content': '',
|
||||
'coin': 0,
|
||||
'groups': '',
|
||||
'pagetype': 0,
|
||||
'maxchars': 0,
|
||||
'template': '',
|
||||
'allowcomment': 1,
|
||||
'relation': ''
|
||||
}
|
||||
|
||||
# 插入wz_video并获取ID
|
||||
result = mysql_session.execute(
|
||||
text("""INSERT INTO wz_video (
|
||||
cid, title, css, thumb, keywords, remark, block, url,
|
||||
status, route, publisher, addtime, updatetime, area,
|
||||
category, theme, year, video_remote_id, video_url,
|
||||
video_list, month
|
||||
) VALUES (
|
||||
:cid, :title, :css, :thumb, :keywords, :remark, :block, :url,
|
||||
:status, :route, :publisher, :addtime, :updatetime, :area,
|
||||
:category, :theme, :year, :video_remote_id, :video_url,
|
||||
:video_list, :month
|
||||
)"""),
|
||||
wz_video
|
||||
)
|
||||
video_id = result.lastrowid
|
||||
|
||||
# 设置wz_video_data的id并插入
|
||||
wz_video_data['id'] = video_id
|
||||
mysql_session.execute(
|
||||
text("""INSERT INTO wz_video_data (
|
||||
id, content, coin, groups, pagetype, maxchars,
|
||||
template, allowcomment, relation
|
||||
) VALUES (
|
||||
:id, :content, :coin, :groups, :pagetype, :maxchars,
|
||||
:template, :allowcomment, :relation
|
||||
)"""),
|
||||
wz_video_data
|
||||
)
|
||||
|
||||
mysql_session.commit()
|
||||
self.logger.info(f"成功迁移 {len(sqlite_videos)} 条视频数据到线上数据库")
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"数据迁移失败: {str(e)}")
|
||||
raise
|

scrapy_proj/settings.py (new file)
@@ -0,0 +1,121 @@
# Scrapy settings for scrapy_proj project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "scrapy_proj"

SPIDER_MODULES = ["scrapy_proj.spiders"]
NEWSPIDER_MODULE = "scrapy_proj.spiders"


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 8

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 8
CONCURRENT_REQUESTS_PER_IP = 8

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    "scrapy_proj.middlewares.ScrapyProjSpiderMiddleware": 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    "scrapy_proj.middlewares.ScrapyProjDownloaderMiddleware": 543,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    "scrapy.extensions.telnet.TelnetConsole": None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # "scrapy_proj.pipelines.AliyunVodPipeline": 300,  # upload to Aliyun VOD
    # "scrapy_proj.pipelines.SQLitePipeline": 400,     # then save to SQLite
    # "scrapy_proj.pipelines.MariaDBPipeline": 500,    # finally migrate to MariaDB
}
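All three pipelines ship disabled, so nothing is uploaded or persisted until they are turned on. To run the full chain, the dict above would be uncommented while keeping the 300/400/500 ordering, so AliyunVodPipeline fills aliyun_video_id before SQLitePipeline stores it and MariaDBPipeline migrates it:

# ITEM_PIPELINES = {
#     "scrapy_proj.pipelines.AliyunVodPipeline": 300,
#     "scrapy_proj.pipelines.SQLitePipeline": 400,
#     "scrapy_proj.pipelines.MariaDBPipeline": 500,
# }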

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

# Project root (used for the SQLite database file path)
import os
from dotenv import load_dotenv

load_dotenv()  # load environment variables from the .env file
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Database configuration
SQLITE_FILE = os.getenv("SQLITE_FILE", "data/videos.db")

# MariaDB configuration
MYSQL_HOST = os.getenv("MYSQL_HOST", "localhost")          # database host
MYSQL_PORT = int(os.getenv("MYSQL_PORT", "3306"))          # database port
MYSQL_USER = os.getenv("MYSQL_USER", "root")               # database user
MYSQL_PASSWORD = os.getenv("MYSQL_PASSWORD")               # database password
MYSQL_DATABASE = os.getenv("MYSQL_DATABASE", "dev_yszy")   # database name

# Aliyun credentials
ALIYUN_ACCESS_KEY_ID = os.getenv("ALIYUN_ACCESS_KEY_ID")          # Aliyun AccessKey ID
ALIYUN_ACCESS_KEY_SECRET = os.getenv("ALIYUN_ACCESS_KEY_SECRET")  # Aliyun AccessKey Secret
ALIYUN_TEMPLATE_GROUP_ID = os.getenv("ALIYUN_TEMPLATE_GROUP_ID")  # transcoding template group ID

# Aliyun OSS configuration
ALIYUN_OSS_BUCKET = os.getenv("ALIYUN_OSS_BUCKET")      # OSS bucket name
ALIYUN_OSS_ENDPOINT = os.getenv("ALIYUN_OSS_ENDPOINT")  # OSS endpoint
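docker-compose.yml mounts an .env file and settings.py/main.py read their configuration from it. A template .env covering the variables referenced above (every value here is a placeholder, not something taken from this commit) would look like:

# .env (placeholder values)
PORT=8000
DATABASE_URL=sqlite:///data/videos.db
LOG_LEVEL=info
SQLITE_FILE=data/videos.db
MYSQL_HOST=localhost
MYSQL_PORT=3306
MYSQL_USER=root
MYSQL_PASSWORD=change-me
MYSQL_DATABASE=dev_yszy
ALIYUN_ACCESS_KEY_ID=your-access-key-id
ALIYUN_ACCESS_KEY_SECRET=your-access-key-secret
ALIYUN_TEMPLATE_GROUP_ID=your-template-group-id
ALIYUN_OSS_BUCKET=your-bucket
ALIYUN_OSS_ENDPOINT=oss-cn-shanghai.aliyuncs.com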

scrapy_proj/spiders/__init__.py (new file)
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

scrapy_proj/spiders/example.py (new file)
@@ -0,0 +1,32 @@
import scrapy
from typing import Optional


class ExampleSpider(scrapy.Spider):
    name = "example"

    def __init__(self, url: Optional[str] = None, *args, **kwargs):
        """Initialize the spider.

        Args:
            url: start URL, can be passed in through the API
        """
        super(ExampleSpider, self).__init__(*args, **kwargs)
        self.start_urls = [url] if url else ["http://quotes.toscrape.com"]

    def parse(self, response):
        """Parse page data.

        This is a sample parser that scrapes quotes and authors from quotes.toscrape.com.
        """
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').get(),
                'author': quote.css('small.author::text').get(),
                'tags': quote.css('div.tags a.tag::text').getall(),
            }

        # Follow the link to the next page
        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, self.parse)
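Because the spider takes url as a keyword argument, it can be driven from the command line (scrapy crawl example -a url=...) or programmatically. A minimal launch sketch, assuming it is run from the project root where the Scrapy settings are discoverable:

# Minimal sketch, assuming scrapy.cfg / scrapy_proj.settings are discoverable.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from scrapy_proj.spiders.example import ExampleSpider

if __name__ == "__main__":
    process = CrawlerProcess(get_project_settings())
    # The url kwarg is forwarded to ExampleSpider.__init__ and becomes start_urls[0].
    process.crawl(ExampleSpider, url="http://quotes.toscrape.com")
    process.start()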
110
scrapy_proj/spiders/zgjs.py
Normal file
@ -0,0 +1,110 @@
import re
import scrapy
from datetime import datetime
from typing import Optional
from urllib.parse import urljoin
from ..items import VideoItem


class ZGJSSpider(scrapy.Spider):
    name = "zgjs"
    allowed_domains = ["tv.81.cn"]

    def __init__(self, url: Optional[str] = None, video_list: int = 0, *args, **kwargs):
        """Initialize the spider.

        Args:
            url: start URL, can be passed in through the API
            video_list: list/category ID attached to every scraped item
        """
        super(ZGJSSpider, self).__init__(*args, **kwargs)
        self.video_list = video_list
        self.start_urls = [url] if url else ["http://tv.81.cn/zgjs/jsjs/index.html"]

    def parse(self, response):
        """Parse a list page.

        Args:
            response: the response object
        """
        print("Start crawling")
        # Optional cap on the number of requests (inactive while limit_status is False)
        limit_status = False
        limit_count = 3

        # Parse the video list
        for video_item in response.xpath('//li[@class="content-box col-lg-2-10 col-sm-3-12 col-xs-6-12"]'):
            if limit_status and limit_count <= 0:
                return
            limit_count -= 1

            # Detail page URL
            detail_url = video_item.xpath('.//a/@href').get()
            if detail_url:
                detail_url = urljoin(response.url, detail_url)
                # Basic information
                item = VideoItem()
                item['video_list'] = self.video_list
                item['source_url'] = detail_url
                thumbnail_src = video_item.xpath('.//img/@src').get()
                item['source_thumbnail_url'] = urljoin("http://tv.81.cn", thumbnail_src) if thumbnail_src else ""
                item['duration'] = (video_item.xpath('.//div[@class="video-des"]//span/text()').get() or "").strip()
                item['publish_time'] = video_item.xpath('.//small[@class="time hidden"]/text()').get()
                item['status'] = 0  # initial status: pending

                # Request the detail page
                yield scrapy.Request(
                    url=detail_url,
                    callback=self.parse_detail,
                    meta={'item': item}
                )

        # Pagination: extract the arguments of createPageHTML with a regex
        script_text = response.xpath('//script[contains(., "createPageHTML")]/text()').get()
        if script_text:
            page_match = re.findall(r"'([^']+)'", script_text)
            if page_match:
                max_page = int(page_match[0])  # total number of pages
                cur_page = int(page_match[1])  # current page index
                if max_page > cur_page:
                    next_page = urljoin(response.url, f"index_{cur_page + 1}.html")
                    if next_page and limit_status is False:
                        print(f"Crawling next page: {next_page}")
                        yield scrapy.Request(url=next_page, callback=self.parse)

    def parse_detail(self, response):
        """Parse a detail page.

        Args:
            response: the response object
        """
        item = response.meta['item']

        # Title
        item['title'] = (response.xpath('//div[@class="video-header"]/h2/text()').get() or "").strip()

        # Video description
        description = response.xpath('//div[@id="content-source"]/text()').get()
        item['description'] = description.strip() if description else ""

        # Video URL
        video_url = response.xpath('//div[@id="new_cmplayer"]/@data-media').get()
        if video_url:
            item['video_url'] = urljoin(response.url, video_url)

        # Normalize the publish time
        if item.get('publish_time'):
            try:
                # Expected format: "YYYY-MM-DD HH:MM:SS"
                datetime.strptime(item['publish_time'], '%Y-%m-%d %H:%M:%S')
            except ValueError:
                # Fall back to the current time if parsing fails
                item['publish_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        yield item

    def closed(self, reason):
        """Callback invoked when the spider is closed.

        Args:
            reason: the reason the spider was closed
        """
        self.logger.info(f'Spider closed: {reason}')
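The pagination branch assumes the inline createPageHTML(...) call carries the total page count and the current page index as its first two quoted arguments. A standalone sketch of that extraction (the sample script string below is an assumption for illustration, not captured from tv.81.cn):

import re

# Assumed shape of the inline script on a list page (illustrative only).
script_text = "createPageHTML('10', '0', 'index', 'html');"

page_args = re.findall(r"'([^']+)'", script_text)
max_page, cur_page = int(page_args[0]), int(page_args[1])

if max_page > cur_page:
    # Mirrors the spider: the next list page is index_{cur_page + 1}.html.
    print(f"index_{cur_page + 1}.html")  # -> index_1.html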
39
scripts/build-docker.sh
Normal file
@ -0,0 +1,39 @@
#!/bin/bash

# Version file
VERSION_FILE=".version"

# Default version
DEFAULT_VERSION="0.1"

# If no version is given, auto-increment the recorded one
if [ -z "$1" ]; then
    # Check whether a non-empty version file exists
    if [ -f "$VERSION_FILE" ] && [ -s "$VERSION_FILE" ]; then
        CURRENT_VERSION=$(cat "$VERSION_FILE")
        # Bump the version by 0.1 (only the simple X.Y format is supported);
        # printf restores the leading zero that bc drops (e.g. ".2" -> "0.2")
        NEW_VERSION=$(printf '%.1f' "$(echo "$CURRENT_VERSION + 0.1" | bc)")
    else
        NEW_VERSION="$DEFAULT_VERSION"
    fi
else
    # Use the manually supplied version
    NEW_VERSION="$1"
fi

# Make sure the version starts with a digit
if [[ ! "$NEW_VERSION" =~ ^[0-9] ]]; then
    NEW_VERSION="$DEFAULT_VERSION"
fi

# Save the new version
echo "$NEW_VERSION" > "$VERSION_FILE"

# Build the Docker image
IMAGE_NAME="crawler_zgjs"
TAG="$IMAGE_NAME:$NEW_VERSION"

echo "🛠️ Building Docker image: $TAG"
docker build -t "$TAG" .

echo "✅ Build succeeded! Version: $NEW_VERSION"
95
scripts/export-docker.sh
Executable file
@ -0,0 +1,95 @@
#!/bin/bash

# Version file
VERSION_FILE=".version"

# Default image name (kept in sync with build-docker.sh)
DEFAULT_IMAGE_NAME="crawler_zgjs"

# Print usage information
show_help() {
    echo "Usage: $0 [options]"
    echo
    echo "Options:"
    echo "  -h, --help     Show this help message"
    echo "  -v, --version  Version to export (optional, defaults to the value in .version)"
    echo "  -n, --name     Image name (optional, defaults to $DEFAULT_IMAGE_NAME)"
    echo "  -o, --output   Output file path (optional, defaults to {image name}-{version}.tar)"
    echo
    echo "Examples:"
    echo "  $0                          # export with the default settings"
    echo "  $0 -v 1.0                   # export a specific version"
    echo "  $0 -n custom_name -v 2.0    # export a specific name and version"
    echo "  $0 -o /path/to/image.tar    # export to a specific file"
}

# Parse command line arguments
VERSION=""
IMAGE_NAME="$DEFAULT_IMAGE_NAME"
OUTPUT_FILE=""

while [[ $# -gt 0 ]]; do
    case $1 in
        -h|--help)
            show_help
            exit 0
            ;;
        -v|--version)
            VERSION="$2"
            shift 2
            ;;
        -n|--name)
            IMAGE_NAME="$2"
            shift 2
            ;;
        -o|--output)
            OUTPUT_FILE="$2"
            shift 2
            ;;
        *)
            echo "❌ Error: unknown argument $1"
            show_help
            exit 1
            ;;
    esac
done

# If no version was given, read it from the version file
if [ -z "$VERSION" ]; then
    if [ -f "$VERSION_FILE" ] && [ -s "$VERSION_FILE" ]; then
        VERSION=$(cat "$VERSION_FILE")
    else
        echo "❌ Error: no version given and $VERSION_FILE could not be read"
        exit 1
    fi
fi

# Full image tag
TAG="$IMAGE_NAME:$VERSION"

# Default output file name
if [ -z "$OUTPUT_FILE" ]; then
    OUTPUT_FILE="${IMAGE_NAME}-${VERSION}.tar"
fi

# Make sure the image exists
if ! docker image inspect "$TAG" >/dev/null 2>&1; then
    echo "❌ Error: image $TAG does not exist"
    echo "Hint: build it first with build-docker.sh"
    exit 1
fi

echo "🚀 Exporting Docker image..."
echo "📦 Image: $TAG"
echo "📄 Output: $OUTPUT_FILE"

# Export the image
if docker save -o "$OUTPUT_FILE" "$TAG"; then
    echo "✅ Export succeeded!"
    echo "📁 File size: $(du -h "$OUTPUT_FILE" | cut -f1)"
else
    echo "❌ Export failed!"
    # Clean up a possibly partially written file
    [ -f "$OUTPUT_FILE" ] && rm "$OUTPUT_FILE"
    exit 1
fi
2078
static/bootstrap/css/bootstrap-icons.css
vendored
Normal file
File diff suppressed because it is too large
6
static/bootstrap/css/bootstrap.min.css
vendored
Normal file
File diff suppressed because one or more lines are too long
7
static/bootstrap/js/bootstrap.bundle.min.js
vendored
Normal file
File diff suppressed because one or more lines are too long
1006
static/index.html
Normal file
File diff suppressed because it is too large
2
static/layui/css/layui.css
Normal file
File diff suppressed because one or more lines are too long
2
static/layui/layui.js
Normal file
File diff suppressed because one or more lines are too long