Initialize project

Commit d3834eb37e by ifui, 2025-06-08 16:25:53 +08:00
28 changed files with 5119 additions and 0 deletions

10
.gitignore vendored Normal file

@@ -0,0 +1,10 @@
.venv
venv
__pycache__
*.pyc
.env
data/videos.db
output

1
.version Normal file

@@ -0,0 +1 @@
0.2

5
.vscode/settings.json vendored Normal file

@@ -0,0 +1,5 @@
{
    "python.analysis.extraPaths": [
        "./scrapy_proj"
    ]
}

43
Dockerfile Normal file

@@ -0,0 +1,43 @@
# Official Python base image
FROM python:3.9-bookworm

# Working directory
WORKDIR /app

# Time zone (optional)
ENV TZ=Asia/Shanghai
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

# Use the Aliyun Debian mirrors
RUN echo "deb https://mirrors.aliyun.com/debian/ bookworm main contrib non-free non-free-firmware" > /etc/apt/sources.list && \
    echo "deb https://mirrors.aliyun.com/debian/ bookworm-updates main contrib non-free non-free-firmware" >> /etc/apt/sources.list && \
    echo "deb https://mirrors.aliyun.com/debian-security bookworm-security main contrib non-free non-free-firmware" >> /etc/apt/sources.list

# System dependencies
RUN apt-get update && apt-get install -y \
    gcc \
    python3-dev \
    python3-pip \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements.txt and install Python dependencies (via the Aliyun PyPI mirror)
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt \
    -i https://mirrors.aliyun.com/pypi/simple/ \
    --trusted-host mirrors.aliyun.com

# Copy the project files
COPY . .

# Create the data directory
RUN mkdir -p /app/data

# Expose the API port
EXPOSE 8000

# Environment variables
ENV PYTHONPATH=/app
ENV DATABASE_URL=sqlite:////app/data/videos.db

# Start command
CMD ["python", "main.py"]

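As a usage sketch (not part of the commit): building and running the image by hand, reusing the crawler_zgjs:0.1 tag and the volume/env-file settings that docker-compose.yml below declares; the container name is an arbitrary example.

```bash
# Build the image (tag mirrors the one referenced in docker-compose.yml)
docker build -t crawler_zgjs:0.1 .

# Run it standalone, persisting the SQLite database outside the container
docker run -d --name crawler_zgjs \
  -p 8000:8000 \
  -v "$(pwd)/data:/app/data" \
  --env-file .env \
  crawler_zgjs:0.1
```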
5
README.md Normal file

@@ -0,0 +1,5 @@
Create a virtual environment: `python -m venv .venv`

Activate it on Windows (PowerShell): `.venv\Scripts\Activate.ps1`

Activate it on Linux/macOS: `source ./.venv/bin/activate`

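A possible quick-start after activating the environment, based on the files added in this commit (the mirror flag is optional):

```bash
pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/
python init_db.py   # creates data/videos.db and the tables
python main.py      # serves the FastAPI app on http://0.0.0.0:8000
```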
18
docker-compose.yml Normal file

@@ -0,0 +1,18 @@
version: "3.8"
services:
app:
image: crawler_zgjs:0.1
ports:
- "${PORT:-8000}:8000"
volumes:
- ./data:/app/data
env_file:
- .env
restart: unless-stopped
networks:
- app-network
networks:
app-network:
driver: bridge

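Usage sketch, assuming the image tag above has already been built (for example with scripts/build-docker.sh):

```bash
# PORT can be overridden via .env; it defaults to 8000
docker compose up -d
docker compose logs -f app
```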
18
init_db.py Normal file

@@ -0,0 +1,18 @@
import os
from sqlalchemy import create_engine
from scrapy_proj.models import Base

# Make sure the data directory exists
os.makedirs("data", exist_ok=True)

# Create the database engine
DATABASE_URL = "sqlite:///data/videos.db"
engine = create_engine(DATABASE_URL)

def init_db():
    """Initialize the database and create all tables."""
    Base.metadata.create_all(bind=engine)
    print("数据库表创建成功!")

if __name__ == "__main__":
    init_db()

744
main.py Normal file

@@ -0,0 +1,744 @@
import os
import sys
import uuid
import logging
import time
from typing import Dict, List, Optional, Literal
from datetime import datetime
from fastapi import FastAPI, HTTPException, BackgroundTasks
from fastapi.staticfiles import StaticFiles
from fastapi.responses import FileResponse
from pydantic import BaseModel, HttpUrl
import uvicorn
from multiprocessing import Process
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.cron import CronTrigger
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from dotenv import load_dotenv
# 加载环境变量
load_dotenv()
# 添加scrapy项目到Python路径
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "scrapy_proj"))
# 导入爬虫
from scrapy_proj.spiders.zgjs import ZGJSSpider
from scrapy_proj.models import ScheduledTask
# 配置日志
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(message)s",
handlers=[
logging.StreamHandler()
]
)
logger = logging.getLogger(__name__)
# 定时任务相关的Pydantic模型
class ScheduledTaskCreate(BaseModel):
"""创建定时任务请求模型"""
name: str
cron_expression: str
spider_name: str
url: str
video_list: int
enabled: bool = True
class ScheduledTaskUpdate(BaseModel):
"""更新定时任务请求模型"""
name: Optional[str] = None
cron_expression: Optional[str] = None
spider_name: Optional[str] = None
url: Optional[str] = None
video_list: Optional[int] = None
enabled: Optional[bool] = None
class TaskStatus(BaseModel):
"""任务状态响应模型"""
status: Literal['pending', 'running', 'completed', 'failed']
message: Optional[str] = None
start_time: Optional[str] = None
end_time: Optional[str] = None
class ScheduledTaskResponse(BaseModel):
"""定时任务响应模型"""
id: int
name: str
cron_expression: str
spider_name: str
url: str
video_list: int
enabled: bool
create_time: str
update_time: str
# 存储任务状态
task_status_store: Dict[str, Dict] = {}
# 创建数据库引擎和会话
DATABASE_URL = os.getenv("DATABASE_URL", "sqlite:///data/videos.db")
engine = create_engine(DATABASE_URL)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
# 创建调度器
scheduler = BackgroundScheduler()
scheduler.start()
# 创建FastAPI应用
app = FastAPI(
title="Scrapy API",
description="影视资源爬虫API",
version="0.1.0",
)
# 存储爬虫任务状态
spider_tasks = {}
# Get a database session (callers are responsible for closing it)
def get_db():
return SessionLocal()
class SpiderRequest(BaseModel):
"""爬虫请求模型"""
url: Optional[HttpUrl] = None
spider_name: str = "example"
settings: Optional[Dict] = None
video_list: int
class SpiderResponse(BaseModel):
"""爬虫响应模型"""
task_id: str
status: str
spider_name: str
task_name: Optional[str] = None # 添加任务名称字段
message: str
start_time: Optional[str] = None
end_time: Optional[str] = None
class PaginatedSpiderResponse(BaseModel):
"""分页爬虫响应模型"""
items: List[SpiderResponse]
total: int
page: int
page_size: int
total_pages: int
def _run_spider_process(spider_name: str, url: Optional[str], video_list: int, settings: Dict):
"""实际运行爬虫的进程函数"""
try:
# 获取项目设置
crawler_settings = get_project_settings()
# 如果提供了自定义设置,则更新
if settings:
for key, value in settings.items():
crawler_settings.set(key, value)
# 创建爬虫进程
process = CrawlerProcess(settings=crawler_settings)
# 选择爬虫
if spider_name == "zgjs":
logger.info(f"启动爬虫 {spider_name}URL: {url}")
process.crawl(ZGJSSpider, url=url, video_list=video_list)
else:
raise ValueError(f"未知的爬虫: {spider_name}")
# 启动爬虫
process.start()
# 确保爬虫进程正确关闭
try:
if hasattr(process, '_active') and not process._active:
logger.info("爬虫进程已完成")
elif hasattr(process, 'bootstrap_stopped') and process.bootstrap_stopped: # type: ignore
logger.info("爬虫进程正常停止")
else:
logger.warning("爬虫进程未正常停止,强制关闭")
process.stop()
# 确保所有reactor线程都停止
from twisted.internet import reactor
if reactor.running: # type: ignore
logger.info("停止reactor")
reactor.stop() # type: ignore
except Exception as e:
logger.error(f"关闭爬虫进程时出错: {str(e)}")
sys.exit(1) # 非正常退出
# 确保进程退出
logger.info("爬虫进程正常退出")
sys.exit(0)
except Exception as e:
logger.error(f"爬虫进程运行错误: {str(e)}", exc_info=True)
sys.exit(1) # 非正常退出
def run_spider(task_id: str, spider_name: str, task_name: Optional[str] = None, url: Optional[str] = None, video_list: int = 0, settings: Optional[Dict] = None):
"""在后台运行爬虫
Args:
task_id: 任务ID
spider_name: 爬虫名称
task_name: 任务名称
url: 开始URL
video_list: 视频列表数量
settings: 爬虫设置
"""
try:
# 确保settings是字典
settings = settings or {}
# 初始化爬虫任务状态
if task_id not in spider_tasks:
spider_tasks[task_id] = {
"status": "pending",
"spider_name": spider_name,
"task_name": task_name,
"message": "爬虫任务初始化中",
"started_at": time.time(),
"finished_at": None
}
# 创建并启动新进程
p = Process(
target=_run_spider_process,
args=(spider_name, url, video_list, settings),
daemon=True # 设置为守护进程,确保主进程退出时子进程也会退出
)
p.start()
# 更新任务状态为运行中
spider_tasks[task_id].update({
"status": "running",
"message": "爬虫任务正在运行",
"process": p,
"started_at": time.time()
})
# 启动一个线程来监控进程状态
def monitor_process(process, task_id):
process.join() # 等待进程结束
if task_id in spider_tasks:
finish_time = time.time()
spider_tasks[task_id]["finished_at"] = finish_time
if process.exitcode == 0:
spider_tasks[task_id].update({
"status": "completed",
"message": "爬虫任务成功完成"
})
else:
spider_tasks[task_id].update({
"status": "failed",
"message": f"爬虫任务失败,退出码: {process.exitcode}"
})
# 同步更新定时任务状态(如果存在)
for scheduled_task_id, status_info in task_status_store.items():
if status_info.get("spider_task_id") == task_id:
status_info.update({
"status": spider_tasks[task_id]["status"],
"message": spider_tasks[task_id]["message"],
"end_time": datetime.fromtimestamp(finish_time).strftime("%Y-%m-%d %H:%M:%S")
})
from threading import Thread
monitor_thread = Thread(target=monitor_process, args=(p, task_id))
monitor_thread.daemon = True
monitor_thread.start()
except Exception as e:
error_time = time.time()
# 更新任务状态为失败
spider_tasks[task_id].update({
"status": "failed",
"message": f"启动爬虫进程错误: {str(e)}",
"finished_at": error_time
})
# 同步更新定时任务状态(如果存在)
for scheduled_task_id, status_info in task_status_store.items():
if status_info.get("spider_task_id") == task_id:
status_info.update({
"status": "failed",
"message": f"启动爬虫进程错误: {str(e)}",
"end_time": datetime.fromtimestamp(error_time).strftime("%Y-%m-%d %H:%M:%S")
})
logger.error(f"启动爬虫进程错误: {str(e)}")
def cleanup_finished_processes():
"""清理已完成的进程并更新状态"""
for task_id, task_info in list(spider_tasks.items()):
if "process" in task_info:
process = task_info["process"]
if not process.is_alive():
# 获取进程退出码
exitcode = process.exitcode
# 清理进程资源
process.join()
process.close()
# 当前时间
current_time = time.time()
# 根据退出码更新状态
if exitcode == 0:
task_info["status"] = "completed"
task_info["message"] = "爬虫任务成功完成"
else:
task_info["status"] = "failed"
task_info["message"] = f"爬虫任务失败,退出码: {exitcode}"
# 记录完成时间(如果还没有设置的话)
if "finished_at" not in task_info:
task_info["finished_at"] = current_time
# 确保有开始时间
if "started_at" not in task_info:
task_info["started_at"] = task_info.get("finished_at", current_time)
del spider_tasks[task_id]["process"]
@app.post("/api/spiders/run", response_model=SpiderResponse)
async def start_spider(spider_request: SpiderRequest, background_tasks: BackgroundTasks):
"""启动爬虫任务
Args:
spider_request: 爬虫请求参数
background_tasks: 后台任务
Returns:
SpiderResponse: 爬虫响应
"""
# 生成任务ID
task_id = str(uuid.uuid4())
if (spider_request.url is None):
raise HTTPException(status_code=400, detail="缺少url参数")
if (spider_request.video_list is None):
raise HTTPException(status_code=400, detail="缺少video_list参数")
# 当前时间戳
current_time = time.time()
# 记录任务信息
spider_tasks[task_id] = {
"status": "pending",
"spider_name": spider_request.spider_name,
"message": "爬虫任务已创建,等待执行",
"started_at": current_time, # 添加开始时间
"finished_at": None # 初始化结束时间为None
}
# 在后台运行爬虫
background_tasks.add_task(
run_spider,
task_id=task_id,
spider_name=spider_request.spider_name,
url=str(spider_request.url) if spider_request.url else None,
video_list=spider_request.video_list,
settings=spider_request.settings
)
# 格式化时间为ISO格式
start_time = datetime.fromtimestamp(current_time).isoformat()
return SpiderResponse(
task_id=task_id,
status="pending",
spider_name=spider_request.spider_name,
message="爬虫任务已创建,等待执行",
start_time=start_time,
end_time=None
)
@app.get("/api/spiders/status/{task_id}", response_model=SpiderResponse)
async def get_spider_status(task_id: str):
"""获取爬虫任务状态
Args:
task_id: 任务ID
Returns:
SpiderResponse: 爬虫响应
"""
# 先清理已完成进程
cleanup_finished_processes()
if task_id not in spider_tasks:
raise HTTPException(status_code=404, detail="任务不存在")
task_info = spider_tasks[task_id]
return SpiderResponse(
task_id=task_id,
status=task_info["status"],
spider_name=task_info["spider_name"],
task_name=task_info.get("task_name"),
message=task_info["message"],
start_time=datetime.fromtimestamp(task_info["started_at"]).strftime("%Y-%m-%d %H:%M:%S") if task_info.get("started_at") else None,
end_time=datetime.fromtimestamp(task_info["finished_at"]).strftime("%Y-%m-%d %H:%M:%S") if task_info.get("finished_at") else None
)
@app.post("/api/spiders/cleanup")
async def cleanup_spiders():
"""清理已完成的任务进程
Returns:
Dict: 清理结果
"""
cleanup_finished_processes()
return {"message": "已完成进程清理"}
@app.get("/api/spiders/list", response_model=PaginatedSpiderResponse)
async def list_spiders(page: int = 1, page_size: int = 10):
"""列出爬虫任务(分页)
Args:
page: 页码从1开始
page_size: 每页数量
Returns:
PaginatedSpiderResponse: 分页的爬虫任务列表
"""
# 先清理已完成进程,确保状态最新
cleanup_finished_processes()
# 计算分页参数
all_tasks = list(spider_tasks.items())
all_tasks.reverse() # 倒序
total = len(all_tasks)
total_pages = (total + page_size - 1) // page_size
page = max(1, min(page, total_pages))
# 获取当前页的数据
start = (page - 1) * page_size
end = start + page_size
paginated_tasks = all_tasks[start:end]
return PaginatedSpiderResponse(
items=[
SpiderResponse(
task_id=str(task_id),
status=task_info["status"],
spider_name=task_info["spider_name"],
task_name=task_info.get("task_name"),
message=task_info["message"],
start_time=datetime.fromtimestamp(task_info["started_at"]).strftime("%Y-%m-%d %H:%M:%S") if task_info.get("started_at") else None,
end_time=datetime.fromtimestamp(task_info["finished_at"]).strftime("%Y-%m-%d %H:%M:%S") if task_info.get("finished_at") else None
)
for task_id, task_info in paginated_tasks
],
total=total,
page=page,
page_size=page_size,
total_pages=total_pages
)
def add_job_to_scheduler(task: ScheduledTask):
"""添加任务到调度器"""
if task.enabled: # type: ignore
# 创建一个包装函数来处理定时任务的状态更新
def scheduled_spider_run():
# 生成唯一的爬虫任务ID
spider_task_id = str(uuid.uuid4())
task_id_str = str(task.id)
current_time_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# 记录定时任务状态
task_status_store[task_id_str] = {
"status": "running",
"message": "定时任务自动执行中",
"start_time": current_time_str,
"end_time": None,
"spider_task_id": spider_task_id
}
# 运行爬虫
run_spider(
task_id=spider_task_id,
spider_name=str(task.spider_name),
task_name=str(task.name),
url=str(task.url),
video_list=task.video_list # type: ignore
)
# 记录日志
logger.info(f"定时任务 {task.name} (ID: {task.id}) 已自动执行爬虫任务ID: {spider_task_id}")
# 添加到调度器
scheduler.add_job(
scheduled_spider_run,
CronTrigger.from_crontab(task.cron_expression),
id=str(task.id),
replace_existing=True
)
@app.post("/api/scheduled-tasks", response_model=ScheduledTaskResponse)
async def create_scheduled_task(task: ScheduledTaskCreate):
"""创建定时任务"""
db = get_db()
current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
db_task = ScheduledTask(
name=task.name,
cron_expression=task.cron_expression,
spider_name=task.spider_name,
url=task.url,
video_list=task.video_list,
enabled=task.enabled,
create_time=current_time,
update_time=current_time
)
try:
db.add(db_task)
db.commit()
db.refresh(db_task)
# 如果任务启用,添加到调度器
if task.enabled:
add_job_to_scheduler(db_task)
return db_task
except Exception as e:
db.rollback()
raise HTTPException(status_code=500, detail=str(e))
finally:
db.close()
@app.get("/api/scheduled-tasks", response_model=List[ScheduledTaskResponse])
async def list_scheduled_tasks():
"""获取所有定时任务"""
db = get_db()
try:
tasks = db.query(ScheduledTask).all()
return tasks
finally:
db.close()
@app.get("/api/scheduled-tasks/{task_id}", response_model=ScheduledTaskResponse)
async def get_scheduled_task(task_id: int):
"""获取指定定时任务"""
db = get_db()
try:
task = db.query(ScheduledTask).filter(ScheduledTask.id == task_id).first()
if task is None:
raise HTTPException(status_code=404, detail="Task not found")
return task
finally:
db.close()
@app.put("/api/scheduled-tasks/{task_id}", response_model=ScheduledTaskResponse)
async def update_scheduled_task(task_id: int, task_update: ScheduledTaskUpdate):
"""更新定时任务"""
db = get_db()
try:
db_task = db.query(ScheduledTask).filter(ScheduledTask.id == task_id).first()
if db_task is None:
raise HTTPException(status_code=404, detail="Task not found")
update_data = task_update.dict(exclude_unset=True)
for key, value in update_data.items():
setattr(db_task, key, value)
db_task.update_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") # type: ignore
db.commit()
db.refresh(db_task)
# 更新调度器中的任务
job_id = str(db_task.id)
if scheduler.get_job(job_id):
scheduler.remove_job(job_id)
if db_task.enabled: # type: ignore
add_job_to_scheduler(db_task)
return db_task
finally:
db.close()
@app.delete("/api/scheduled-tasks/{task_id}")
async def delete_scheduled_task(task_id: int):
"""删除定时任务"""
db = get_db()
try:
db_task = db.query(ScheduledTask).filter(ScheduledTask.id == task_id).first()
if db_task is None:
raise HTTPException(status_code=404, detail="Task not found")
# 从调度器中移除任务
job_id = str(db_task.id)
if scheduler.get_job(job_id):
scheduler.remove_job(job_id)
db.delete(db_task)
db.commit()
return {"message": "Task deleted successfully"}
finally:
db.close()
@app.post("/api/scheduled-tasks/{task_id}/toggle")
async def toggle_scheduled_task(task_id: int):
"""启用/禁用定时任务"""
db = get_db()
try:
db_task = db.query(ScheduledTask).filter(ScheduledTask.id == task_id).first()
if db_task is None:
raise HTTPException(status_code=404, detail="Task not found")
db_task.enabled = not db_task.enabled # type: ignore
db_task.update_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") # type: ignore
# 更新调度器中的任务
job_id = str(db_task.id)
if scheduler.get_job(job_id):
scheduler.remove_job(job_id)
if db_task.enabled: # type: ignore
add_job_to_scheduler(db_task)
db.commit()
db.refresh(db_task)
return {"message": f"Task {'enabled' if db_task.enabled else 'disabled'} successfully"} # type: ignore
finally:
db.close()
@app.post("/api/scheduled-tasks/{task_id}/run")
async def run_scheduled_task(task_id: int, background_tasks: BackgroundTasks):
"""手动执行定时任务"""
db = get_db()
try:
db_task = db.query(ScheduledTask).filter(ScheduledTask.id == task_id).first()
if db_task is None:
raise HTTPException(status_code=404, detail="Task not found")
# 生成唯一的爬虫任务ID
spider_task_id = str(uuid.uuid4())
# 当前时间戳
current_time = time.time()
current_time_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# 记录爬虫任务信息
spider_tasks[spider_task_id] = {
"status": "pending",
"spider_name": db_task.spider_name,
"task_name": db_task.name,
"message": "爬虫任务已创建,等待执行",
"started_at": current_time,
"finished_at": None
}
# 记录定时任务状态
task_status_store[str(task_id)] = {
"status": "running",
"message": "定时任务正在执行",
"start_time": current_time_str,
"end_time": None,
"spider_task_id": spider_task_id, # 关联爬虫任务ID
"task_name": db_task.name
}
# 在后台运行爬虫
background_tasks.add_task(
run_spider,
task_id=spider_task_id,
spider_name=str(db_task.spider_name),
task_name=str(db_task.name),
url=str(db_task.url),
video_list=db_task.video_list # type: ignore
)
return {
"message": "任务已开始执行",
"task_id": task_id,
"spider_task_id": spider_task_id,
"task_name": db_task.name # 添加任务名称到响应
}
finally:
db.close()
@app.get("/api/task-status/{task_id}", response_model=TaskStatus)
async def get_task_status(task_id: int):
"""获取定时任务的执行状态"""
task_id_str = str(task_id)
# 先清理已完成的爬虫进程,确保状态最新
cleanup_finished_processes()
if task_id_str not in task_status_store:
return TaskStatus(status="pending", message="任务未执行")
status_info = task_status_store[task_id_str]
# 如果任务正在运行,检查爬虫任务的状态
if "spider_task_id" in status_info:
spider_task_id = status_info["spider_task_id"]
if spider_task_id in spider_tasks:
spider_info = spider_tasks[spider_task_id]
spider_status = spider_info["status"]
spider_message = spider_info["message"]
# 同步状态
if spider_status != status_info["status"]:
status_info["status"] = spider_status
status_info["message"] = spider_message
# 如果爬虫任务完成或失败,更新结束时间
if spider_status in ["completed", "failed"]:
status_info["end_time"] = datetime.fromtimestamp(
spider_info.get("finished_at", time.time())
).strftime("%Y-%m-%d %H:%M:%S")
else:
# 如果爬虫任务不存在且状态是running可能是异常终止
if status_info["status"] == "running":
status_info["status"] = "failed"
status_info["message"] = "爬虫任务异常终止"
status_info["end_time"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
return TaskStatus(
status=status_info["status"],
message=status_info["message"],
start_time=status_info.get("start_time"),
end_time=status_info.get("end_time")
)
# 挂载静态文件目录
app.mount("/static", StaticFiles(directory="static"), name="static")
@app.get("/")
async def read_index():
return FileResponse("static/index.html")
if __name__ == "__main__":
host = os.getenv("HOST", "0.0.0.0")
port = int(os.getenv("PORT", "8000"))
log_level = os.getenv("LOG_LEVEL", "info")
# 启动服务器
uvicorn.run(
app if not os.getenv("RELOAD") else "main:app",
host=host,
port=port,
log_level=log_level,
reload=bool(os.getenv("RELOAD"))
)

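For illustration, a small client sketch for the API above using the requests library (already listed in requirements.txt); the base URL, spider URL and video_list value are placeholders:

```python
import time
import requests

BASE = "http://localhost:8000"

# Start a crawl; spider_name "zgjs" and the url/video_list values are examples
resp = requests.post(f"{BASE}/api/spiders/run", json={
    "spider_name": "zgjs",
    "url": "http://tv.81.cn/zgjs/jsjs/index.html",
    "video_list": 1,
})
task_id = resp.json()["task_id"]

# Poll the task until it completes or fails
while True:
    status = requests.get(f"{BASE}/api/spiders/status/{task_id}").json()
    print(status["status"], status["message"])
    if status["status"] in ("completed", "failed"):
        break
    time.sleep(5)
```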
15
requirements.txt Normal file

@@ -0,0 +1,15 @@
scrapy>=2.11.0
aliyun-python-sdk-core>=2.13.3
aliyun-python-sdk-vod>=2.16.16
requests>=2.31.0
fastapi>=0.104.1
uvicorn>=0.24.0
pydantic>=2.5.1
python-multipart>=0.0.6
SQLAlchemy>=2.0.23
alembic>=1.12.1
pymysql>=1.1.0
oss2>=2.19.1
apscheduler>=3.11.0
jinja2>=3.1.6
dotenv>=0.9.9

11
scrapy.cfg Normal file

@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = scrapy_proj.settings
[deploy]
#url = http://localhost:6800/
project = scrapy_proj

0
scrapy_proj/__init__.py Normal file

99
scrapy_proj/database.py Normal file

@@ -0,0 +1,99 @@
import os
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.pool import QueuePool
from contextlib import contextmanager
from .models import Base
class DatabaseManager:
"""数据库管理器"""
def __init__(self, settings):
"""初始化数据库管理器
Args:
settings: Scrapy设置对象
"""
self.sqlite_file = settings.get('SQLITE_FILE', 'videos.db')
self.mysql_config = {
'host': settings.get('MYSQL_HOST', 'localhost'),
'port': settings.get('MYSQL_PORT', 3306),
'user': settings.get('MYSQL_USER', 'root'),
'password': settings.get('MYSQL_PASSWORD', ''),
'database': settings.get('MYSQL_DATABASE', 'crawler'),
}
# 初始化数据库引擎
self._init_sqlite()
self._init_mysql()
# 创建会话工厂
self.sqlite_session_maker = sessionmaker(bind=self.sqlite_engine)
self.mysql_session_maker = sessionmaker(bind=self.mysql_engine)
def _init_sqlite(self):
"""初始化SQLite数据库"""
# 确保数据库目录存在
db_dir = os.path.dirname(self.sqlite_file)
if db_dir and not os.path.exists(db_dir):
os.makedirs(db_dir)
# 创建SQLite引擎
self.sqlite_engine = create_engine(
f'sqlite:///{self.sqlite_file}',
poolclass=QueuePool,
pool_size=5,
max_overflow=10,
pool_timeout=30
)
# 自动创建所有表
Base.metadata.create_all(self.sqlite_engine)
def _init_mysql(self):
"""初始化MySQL/MariaDB数据库"""
# 创建MySQL引擎
self.mysql_engine = create_engine(
'mysql+pymysql://{user}:{password}@{host}:{port}/{database}?charset=utf8mb4'.format(
**self.mysql_config
),
poolclass=QueuePool,
pool_size=5,
max_overflow=10,
pool_timeout=30,
pool_pre_ping=True # 自动检测断开的连接
)
@contextmanager
def sqlite_session(self):
"""SQLite会话上下文管理器
Yields:
Session: SQLite数据库会话
"""
session = self.sqlite_session_maker()
try:
yield session
session.commit()
except Exception:
session.rollback()
raise
finally:
session.close()
@contextmanager
def mysql_session(self):
"""MySQL会话上下文管理器
Yields:
Session: MySQL数据库会话
"""
session = self.mysql_session_maker()
try:
yield session
session.commit()
except Exception:
session.rollback()
raise
finally:
session.close()

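A minimal standalone sketch of using DatabaseManager outside Scrapy. A plain dict stands in for the Scrapy settings object, which works here because only settings.get(key, default) is called; the SQLITE_FILE path is an example, and no MySQL connection is made unless mysql_session() is actually used.

```python
from scrapy_proj.database import DatabaseManager
from scrapy_proj.models import VideoSQLite

# Only SQLITE_FILE matters for this sketch; the MySQL engine is configured
# from defaults but never contacted here.
db = DatabaseManager({"SQLITE_FILE": "data/videos.db"})

with db.sqlite_session() as session:
    count = session.query(VideoSQLite).count()
    print(f"{count} videos stored locally")
```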
31
scrapy_proj/items.py Normal file

@@ -0,0 +1,31 @@
from scrapy import Item, Field
from datetime import datetime

class VideoItem(Item):
    """Video information item"""
    # Basic information
    title = Field()          # Title
    description = Field()    # Description
    source_url = Field()     # Source URL
    publish_time = Field()   # Publish time
    create_time = Field(serializer=lambda x: datetime.now().strftime('%Y-%m-%d %H:%M:%S'))  # Creation time
    update_time = Field(serializer=lambda x: datetime.now().strftime('%Y-%m-%d %H:%M:%S'))  # Update time
    # Media information
    video_url = Field()             # Video URL
    source_thumbnail_url = Field()  # Original thumbnail URL
    thumbnail_url = Field()         # Thumbnail URL (after upload to OSS)
    duration = Field()              # Video duration
    # Extra information used when importing/migrating data
    video_list = Field()  # Video category ID
    # Aliyun VOD information
    aliyun_video_id = Field()  # Aliyun video ID
    aliyun_status = Field()    # Aliyun processing status
    # Other information
    status = Field()     # Status: 0 pending, 1 processing, 2 done, -1 failed
    sqlite_id = Field()  # ID of the corresponding SQLite row

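For illustration, how a spider might fill this item (all values are placeholders):

```python
from scrapy_proj.items import VideoItem

item = VideoItem()
item['title'] = 'Sample video'
item['source_url'] = 'http://tv.81.cn/example/detail.html'
item['video_url'] = 'http://example.com/video.mp4'
item['video_list'] = 1   # category ID used later when migrating to MariaDB
item['status'] = 0       # 0 = pending
```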
100
scrapy_proj/middlewares.py Normal file

@@ -0,0 +1,100 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class ScrapyProjSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, or item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Request or item objects.
pass
async def process_start(self, start):
# Called with an async iterator over the spider start() method or the
# matching method of an earlier spider middleware.
async for item_or_request in start:
yield item_or_request
def spider_opened(self, spider):
spider.logger.info("Spider opened: %s" % spider.name)
class ScrapyProjDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either;
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info("Spider opened: %s" % spider.name)

38
scrapy_proj/models.py Normal file

@@ -0,0 +1,38 @@
from sqlalchemy import Column, Integer, String, Text, DateTime, SmallInteger, Boolean
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class ScheduledTask(Base):
    """Scheduled task model"""
    __tablename__ = 'scheduled_tasks'
    id = Column(Integer, primary_key=True, autoincrement=True)
    name = Column(String(100), nullable=False)
    cron_expression = Column(String(100), nullable=False)  # cron expression
    spider_name = Column(String(50), nullable=False)       # spider name
    url = Column(Text, nullable=False)                     # URL to crawl
    video_list = Column(Integer, nullable=False)           # video category ID
    enabled = Column(Boolean, default=True)                # whether the task is enabled
    create_time = Column(Text)  # stored as TEXT in SQLite
    update_time = Column(Text)  # stored as TEXT in SQLite

class VideoSQLite(Base):
    """SQLite video model"""
    __tablename__ = 'videos'
    id = Column(Integer, primary_key=True, autoincrement=True)
    title = Column(Text)
    description = Column(Text)
    source_url = Column(Text, unique=True)
    publish_time = Column(Text)  # stored as TEXT in SQLite
    create_time = Column(Text)
    update_time = Column(Text)
    video_url = Column(Text)
    source_thumbnail_url = Column(Text)
    thumbnail_url = Column(Text)
    duration = Column(Text)       # duration is TEXT in SQLite
    aliyun_video_id = Column(Text)
    aliyun_status = Column(Text)
    status = Column(Integer)      # 0 default, 1 migrated
    video_list = Column(Integer)  # video category ID

479
scrapy_proj/pipelines.py Normal file

@@ -0,0 +1,479 @@
from datetime import datetime
import logging
import json
import requests
import time
import base64
import oss2
from aliyunsdkcore.client import AcsClient
from aliyunsdkvod.request.v20170321.CreateUploadVideoRequest import CreateUploadVideoRequest
from aliyunsdkvod.request.v20170321.GetVideoInfoRequest import GetVideoInfoRequest
from aliyunsdkvod.request.v20170321.UpdateVideoInfoRequest import UpdateVideoInfoRequest
from aliyunsdkvod.request.v20170321.CreateUploadImageRequest import CreateUploadImageRequest
from sqlalchemy import text
from .database import DatabaseManager
from .models import VideoSQLite
logger = logging.getLogger(__name__)
class AliyunVodPipeline:
"""阿里云视频点播处理中间件"""
def __init__(self, settings):
"""初始化阿里云视频点播中间件
初始化SQLite中间件
Args:
settings: Scrapy设置对象
"""
self.settings = settings
self.access_key_id = settings.get('ALIYUN_ACCESS_KEY_ID')
self.access_key_secret = settings.get('ALIYUN_ACCESS_KEY_SECRET')
self.template_group_id = settings.get('ALIYUN_TEMPLATE_GROUP_ID')
self.client = AcsClient(self.access_key_id, self.access_key_secret, 'cn-shanghai')
self.oss_client = oss2.Auth(self.access_key_id, self.access_key_secret)
self.db_manager = DatabaseManager(settings)
@classmethod
def from_crawler(cls, crawler):
return cls(crawler.settings)
def upload_media_by_url(self, video_url, title, cover_url=None):
"""通过URL上传视频到阿里云VOD
Args:
video_url: 视频URL
title: 视频标题
cover_url: 封面URL
Returns:
str: 上传任务ID (JobId)
"""
from aliyunsdkvod.request.v20170321.UploadMediaByURLRequest import UploadMediaByURLRequest
request = UploadMediaByURLRequest()
request.set_accept_format('JSON')
# 设置视频URL
logger.info(f"上传视频URL: {video_url}")
request.set_UploadURLs(video_url)
# 设置视频信息需要是JSON数组字符串
upload_metadata = [{
'Title': title,
'SourceURL': video_url,
'TemplateGroupId': self.template_group_id
}]
# 设置封面URL
# if cover_url:
# upload_metadata[0]['CoverURL'] = cover_url
request.set_UploadMetadatas(json.dumps(upload_metadata))
response = self.client.do_action_with_exception(request)
result = json.loads(response)
# 返回第一个上传任务的JobId
upload_jobs = result.get('UploadJobs', [])
if not upload_jobs:
raise Exception("No upload job created")
job = upload_jobs[0]
# if job.get('Code') != 'Success':
# raise Exception(f"Upload job failed: {job}")
return job.get('JobId') # 返回JobId而不是VideoId
def get_upload_job_status(self, job_id):
"""获取上传任务状态
Args:
job_id: 上传任务ID
Returns:
dict: 任务状态信息包含VideoId如果上传完成
"""
from aliyunsdkvod.request.v20170321.GetURLUploadInfosRequest import GetURLUploadInfosRequest
request = GetURLUploadInfosRequest()
request.set_accept_format('JSON')
request.set_JobIds(job_id)
response = self.client.do_action_with_exception(request)
result = json.loads(response)
upload_jobs = result.get('URLUploadInfoList', [])
if not upload_jobs:
raise Exception(f"No upload job found with ID: {job_id}")
job = upload_jobs[0]
return job
def wait_for_video_id(self, job_id, max_retries=5, retry_interval=2):
"""等待上传任务完成并获取VideoId
Args:
job_id: 上传任务ID
max_retries: 最大重试次数
retry_interval: 重试间隔
Returns:
str: 视频ID
"""
import time
for i in range(max_retries):
job_status = self.get_upload_job_status(job_id)
if job_status.get('MediaId'):
return job_status.get('MediaId')
# 等待一段时间后重试
time.sleep(retry_interval)
raise Exception(f"Max retries reached, upload job not completed: {job_id}")
def upload_image_to_oss(self, image_url, title):
"""直接上传图片到阿里云OSS
Args:
image_url: 图片URL
title: 图片标题
Returns:
str: OSS中的图片URL
"""
logger.info(f"开始上传图片到OSS: {image_url}")
try:
# 1. 下载远程图片
image_response = requests.get(image_url, timeout=30)
image_response.raise_for_status()
image_content = image_response.content
except Exception as e:
logger.error(f"下载图片失败: {str(e)}")
raise Exception(f"下载图片失败: {str(e)}")
try:
# 2. 生成OSS中的文件名使用时间戳和原始文件名的组合
timestamp = int(time.time())
file_ext = image_url.split('.')[-1] if '.' in image_url else 'jpg'
oss_filename = f"images/{timestamp}_{title[:30]}.{file_ext}" # 限制标题长度,避免文件名过长
# 3. 获取OSS bucket从settings中获取配置
bucket_name = self.settings.get('ALIYUN_OSS_BUCKET')
endpoint = self.settings.get('ALIYUN_OSS_ENDPOINT')
oss_bucket = oss2.Bucket(self.oss_client, endpoint, bucket_name)
# 4. 上传图片到OSS
upload_response = oss_bucket.put_object(oss_filename, image_content)
if upload_response.status == 200:
# 5. 返回可访问的URL
oss_url = f"https://{bucket_name}.{endpoint}/{oss_filename}"
logger.info(f"图片上传成功: {oss_url}")
return oss_url
else:
raise Exception(f"图片上传失败: {upload_response.status}")
except Exception as e:
logger.error(f"上传图片到OSS失败: {str(e)}")
raise Exception(f"上传图片到OSS失败: {str(e)}")
def process_item(self, item, spider):
"""处理数据项通过URL上传视频到阿里云VOD
Args:
item: 爬取的数据项
spider: 爬虫实例
Returns:
item: 处理后的数据项
"""
# 如果已经有阿里云视频ID跳过处理
with self.db_manager.sqlite_session() as session:
# 检查是否存在相同source_url的记录
existing_video = session.query(VideoSQLite).filter_by(source_url=item.get('source_url')).first()
if existing_video and existing_video.aliyun_video_id:
logger.info(f"阿里云视频ID已存在跳过该任务: {item.get('title')}")
return item
video_url = item.get('video_url')
if not video_url:
logger.warning(f"视频URL为空跳过处理: {item.get('source_url')}")
return item
try:
# 1. Upload the cover image to OSS (if one was scraped)
oss_url = None
cover_url = item.get('source_thumbnail_url')
if cover_url:
try:
oss_url = self.upload_image_to_oss(
image_url=cover_url,
title=item.get('title', '')
)
# 更新item中的封面URL为OSS URL
item['thumbnail_url'] = oss_url
logger.info(f"封面图片上传到OSS成功: {oss_url}")
except Exception as e:
logger.error(f"封面图片上传到OSS失败: {str(e)}")
# 如果封面上传失败,继续处理视频,不中断流程
# 2. 通过URL上传视频获取JobId
title = item.get('title', '')
job_id = self.upload_media_by_url(
video_url=video_url,
title=title,
cover_url=oss_url  # OSS cover URL if the upload succeeded, otherwise None
)
logger.info(f"成功创建阿里云视频URL上传任务: job_id={job_id}, title={title}")
# 2. 等待上传完成并获取VideoId
try:
video_id = self.wait_for_video_id(job_id)
logger.info(f"视频上传完成: video_id={video_id}, job_id={job_id}")
# 3. 更新item中的阿里云视频ID和状态
item['aliyun_video_id'] = video_id
item['aliyun_status'] = 'Success'
except Exception as e:
logger.error(f"等待视频上传完成失败: job_id={job_id}, error={str(e)}")
item['aliyun_video_id'] = ""
item['aliyun_status'] = 'Uploading'
raise # 重新抛出异常,让上层错误处理来处理
except Exception as e:
logger.error(f"阿里云视频URL上传失败: {str(e)}")
item['aliyun_status'] = 'Failed'
return item
class SQLitePipeline:
"""SQLite数据库处理中间件"""
def __init__(self, settings):
"""初始化SQLite中间件
Args:
settings: Scrapy设置对象
"""
self.db_manager = DatabaseManager(settings)
@classmethod
def from_crawler(cls, crawler):
return cls(crawler.settings)
def process_item(self, item, spider):
"""处理数据项保存到SQLite数据库
Args:
item: 爬取的数据项
spider: 爬虫实例
Returns:
item: 处理后的数据项
"""
now = datetime.now()
now_str = now.strftime('%Y-%m-%d %H:%M:%S')
with self.db_manager.sqlite_session() as session:
# 检查是否存在相同source_url的记录
existing_video = session.query(VideoSQLite).filter_by(source_url=item.get('source_url')).first()
if existing_video:
logger.info(f"发现重复视频: {item.get('source_url')}")
# 更新现有记录
existing_video.title = item.get('title', '')
existing_video.description = item.get('description', '')
existing_video.publish_time = item.get('publish_time', '')
existing_video.update_time = now_str
existing_video.video_url = item.get('video_url', '')
existing_video.source_thumbnail_url = item.get('source_thumbnail_url', '')
existing_video.duration = str(item.get('duration', ''))
existing_video.video_list = str(item.get('video_list', 0))
# 判断video_id、status、thumbnail_url防止被覆盖
if item.get('aliyun_video_id'):
existing_video.aliyun_video_id = item['aliyun_video_id']
if item.get('aliyun_status'):
existing_video.aliyun_status = item['aliyun_status']
if item.get('thumbnail_url'):
existing_video.thumbnail_url = item['thumbnail_url']
# existing_video.status = 0 # 重置状态为0
# 保存SQLite记录ID到item中供后续中间件使用
item['sqlite_id'] = existing_video.id
else:
# 创建新记录
sqlite_data = {
'title': item.get('title', ''),
'description': item.get('description', ''),
'source_url': item.get('source_url', ''),
'publish_time': item.get('publish_time', ''),
'create_time': now_str,
'update_time': now_str,
'video_url': item.get('video_url', ''),
'source_thumbnail_url': item.get('source_thumbnail_url', ''),
'thumbnail_url': item.get('thumbnail_url', ''),
'duration': str(item.get('duration', '')),
'video_list': item.get('video_list', ''),
'aliyun_video_id': item.get('aliyun_video_id', ''),
'aliyun_status': item.get('aliyun_status', ''),
'status': 0
}
new_video = VideoSQLite(**sqlite_data)
session.add(new_video)
session.flush() # 获取新插入记录的ID
# 保存SQLite记录ID到item中供后续中间件使用
item['sqlite_id'] = new_video.id
return item
class MariaDBPipeline:
"""将数据从SQLite迁移到MariaDB的管道"""
def __init__(self, settings):
"""初始化管道
Args:
settings: Scrapy设置对象
"""
self.db_manager = DatabaseManager(settings)
self.logger = logging.getLogger(__name__)
@classmethod
def from_crawler(cls, crawler):
"""从crawler创建管道实例
Args:
crawler: Scrapy crawler对象
Returns:
MariaDBPipeline: 管道实例
"""
return cls(crawler.settings)
def open_spider(self, spider):
"""当spider开启时调用"""
self.logger.info("MariaDB管道已开启")
def close_spider(self, spider):
"""当spider关闭时调用"""
self.logger.info("MariaDB管道已关闭")
self.migrate_data()
def process_item(self, item, spider):
"""处理item
Args:
item: Scrapy item对象
spider: Scrapy spider对象
Returns:
item: 处理后的item
"""
# 这里不需要处理item因为我们要从SQLite读取数据
return item
def migrate_data(self):
"""从SQLite迁移数据到MariaDB"""
try:
with self.db_manager.sqlite_session() as sqlite_session, \
self.db_manager.mysql_session() as mysql_session:
# 1. 从SQLite读取视频数据
sqlite_videos = sqlite_session.query(VideoSQLite).where((VideoSQLite.aliyun_video_id != None) & (VideoSQLite.aliyun_video_id != '')).all()
# sqlite_videos = sqlite_session.query(VideoSQLite).all()
# 2. 批量迁移到MariaDB
for video in sqlite_videos:
# 根据video_id查重
existing_video_id = mysql_session.execute(
text("SELECT id FROM wz_video WHERE video_remote_id = :video_remote_id LIMIT 1"), {
'video_remote_id': video.aliyun_video_id
}
)
if existing_video_id.first():
self.logger.info(f"远程数据库已存在该视频: {video.title}")
continue
# 映射到wz_video表
wz_video = {
'cid': 1,
'title': video.title or '',
'css': '',
'thumb': video.thumbnail_url or '',
'keywords': '',
'remark': video.description or '',
'block': 0,
'url': '',
'status': 9,
'route': 0,
'publisher': 'spider',
'addtime': int(time.time()),
'updatetime': int(time.time()),
'area': '1',
'category': '1',
'theme': 0,
'year': '2025',
'video_remote_id': video.aliyun_video_id or '',
'video_url': '',
'video_list': video.video_list or 0,
'month': '1'
}
# 映射到wz_video_data表
wz_video_data = {
'id': None, # 将在插入后设置
'content': '',
'coin': 0,
'groups': '',
'pagetype': 0,
'maxchars': 0,
'template': '',
'allowcomment': 1,
'relation': ''
}
# 插入wz_video并获取ID
result = mysql_session.execute(
text("""INSERT INTO wz_video (
cid, title, css, thumb, keywords, remark, block, url,
status, route, publisher, addtime, updatetime, area,
category, theme, year, video_remote_id, video_url,
video_list, month
) VALUES (
:cid, :title, :css, :thumb, :keywords, :remark, :block, :url,
:status, :route, :publisher, :addtime, :updatetime, :area,
:category, :theme, :year, :video_remote_id, :video_url,
:video_list, :month
)"""),
wz_video
)
video_id = result.lastrowid
# 设置wz_video_data的id并插入
wz_video_data['id'] = video_id
mysql_session.execute(
text("""INSERT INTO wz_video_data (
id, content, coin, groups, pagetype, maxchars,
template, allowcomment, relation
) VALUES (
:id, :content, :coin, :groups, :pagetype, :maxchars,
:template, :allowcomment, :relation
)"""),
wz_video_data
)
mysql_session.commit()
self.logger.info(f"成功迁移 {len(sqlite_videos)} 条视频数据到线上数据库")
except Exception as e:
self.logger.error(f"数据迁移失败: {str(e)}")
raise

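These pipelines are activated through ITEM_PIPELINES, which is currently commented out in scrapy_proj/settings.py; enabling them would look like the following (order values taken from that commented block):

```python
# scrapy_proj/settings.py
ITEM_PIPELINES = {
    "scrapy_proj.pipelines.AliyunVodPipeline": 300,  # upload video to Aliyun VOD
    "scrapy_proj.pipelines.SQLitePipeline": 400,     # persist to local SQLite
    "scrapy_proj.pipelines.MariaDBPipeline": 500,    # migrate to MariaDB on spider close
}
```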
121
scrapy_proj/settings.py Normal file

@@ -0,0 +1,121 @@
# Scrapy settings for scrapy_proj project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = "scrapy_proj"
SPIDER_MODULES = ["scrapy_proj.spiders"]
NEWSPIDER_MODULE = "scrapy_proj.spiders"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 8
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 8
CONCURRENT_REQUESTS_PER_IP = 8
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# "scrapy_proj.middlewares.ScrapyProjSpiderMiddleware": 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
"scrapy_proj.middlewares.ScrapyProjDownloaderMiddleware": 543,
}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# "scrapy.extensions.telnet.TelnetConsole": None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
# "scrapy_proj.pipelines.AliyunVodPipeline": 300, # 上传到阿里云视频点播
# "scrapy_proj.pipelines.SQLitePipeline": 400, # 保存到SQLite
# "scrapy_proj.pipelines.MariaDBPipeline": 500, # 最后保存到MariaDB
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
# Project root (used for the SQLite database file path)
import os
from dotenv import load_dotenv
load_dotenv()  # load environment variables from the .env file
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Database configuration
SQLITE_FILE = os.getenv("SQLITE_FILE", "data/videos.db")

# MariaDB configuration
MYSQL_HOST = os.getenv("MYSQL_HOST", "localhost")         # database host
MYSQL_PORT = int(os.getenv("MYSQL_PORT", "3306"))         # database port
MYSQL_USER = os.getenv("MYSQL_USER", "root")              # database user
MYSQL_PASSWORD = os.getenv("MYSQL_PASSWORD")              # database password
MYSQL_DATABASE = os.getenv("MYSQL_DATABASE", "dev_yszy")  # database name

# Aliyun configuration
ALIYUN_ACCESS_KEY_ID = os.getenv("ALIYUN_ACCESS_KEY_ID")          # Aliyun AccessKey ID
ALIYUN_ACCESS_KEY_SECRET = os.getenv("ALIYUN_ACCESS_KEY_SECRET")  # Aliyun AccessKey Secret
ALIYUN_TEMPLATE_GROUP_ID = os.getenv("ALIYUN_TEMPLATE_GROUP_ID")  # transcoding template group ID

# Aliyun OSS configuration
ALIYUN_OSS_BUCKET = os.getenv("ALIYUN_OSS_BUCKET")      # OSS bucket name
ALIYUN_OSS_ENDPOINT = os.getenv("ALIYUN_OSS_ENDPOINT")  # OSS endpoint

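The os.getenv calls above suggest a .env file along these lines (every value shown is a placeholder; .env itself is listed in .gitignore):

```ini
# .env (example values only)
SQLITE_FILE=data/videos.db
MYSQL_HOST=localhost
MYSQL_PORT=3306
MYSQL_USER=root
MYSQL_PASSWORD=change-me
MYSQL_DATABASE=dev_yszy
ALIYUN_ACCESS_KEY_ID=your-access-key-id
ALIYUN_ACCESS_KEY_SECRET=your-access-key-secret
ALIYUN_TEMPLATE_GROUP_ID=your-template-group-id
ALIYUN_OSS_BUCKET=your-bucket
ALIYUN_OSS_ENDPOINT=oss-cn-shanghai.aliyuncs.com
# Used by main.py / docker-compose.yml
HOST=0.0.0.0
PORT=8000
LOG_LEVEL=info
DATABASE_URL=sqlite:///data/videos.db
```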
4
scrapy_proj/spiders/__init__.py Normal file

@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

32
scrapy_proj/spiders/example.py Normal file

@@ -0,0 +1,32 @@
import scrapy
from typing import Optional

class ExampleSpider(scrapy.Spider):
    name = "example"

    def __init__(self, url: Optional[str] = None, *args, **kwargs):
        """Initialize the spider.

        Args:
            url: start URL (can be passed in through the API)
        """
        super(ExampleSpider, self).__init__(*args, **kwargs)
        self.start_urls = [url] if url else ["http://quotes.toscrape.com"]

    def parse(self, response):
        """Parse the page.

        This is a sample parser that scrapes quotes and authors from quotes.toscrape.com.
        """
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').get(),
                'author': quote.css('small.author::text').get(),
                'tags': quote.css('div.tags a.tag::text').getall(),
            }
        # Follow the next-page link
        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, self.parse)

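A hedged way to try this spider from the project root (the -a url argument is optional; quotes.json is an arbitrary output file):

```bash
scrapy crawl example -O quotes.json
# or with an explicit start URL
scrapy crawl example -a url="http://quotes.toscrape.com/page/1/" -O quotes.json
```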
110
scrapy_proj/spiders/zgjs.py Normal file

@@ -0,0 +1,110 @@
import re
import scrapy
from datetime import datetime
from urllib.parse import urljoin
from ..items import VideoItem
class ZGJSSpider(scrapy.Spider):
name = "zgjs"
allowed_domains = ["tv.81.cn"]
def __init__(self, url: str = None, video_list: int = 0 , *args, **kwargs):
"""初始化爬虫
Args:
url: 开始URL可通过API传入
"""
super(ZGJSSpider, self).__init__(*args, **kwargs)
self.video_list = video_list
self.start_urls = [url] if url else ["http://tv.81.cn/zgjs/jsjs/index.html"]
def parse(self, response):
"""解析列表页
Args:
response: 响应对象
"""
print("开始爬取")
# 限制请求次数
limit_status = False
limit_count = 3
# 解析视频列表
for video_item in response.xpath('//li[@class="content-box col-lg-2-10 col-sm-3-12 col-xs-6-12"]'):
if limit_status and limit_count <= 0:
return
limit_count -= 1
# 获取详情页URL
detail_url = video_item.xpath('.//a/@href').get()
if detail_url:
detail_url = urljoin(response.url, detail_url)
# 获取基本信息
item = VideoItem()
item['video_list'] = self.video_list
item['source_url'] = detail_url
item['source_thumbnail_url'] = str.format("http://tv.81.cn" + video_item.xpath('.//img/@src').get()) if video_item.xpath('.//img/@src').get() else ""
item['duration'] = video_item.xpath('.//div[@class="video-des"]//span/text()').get().strip()
item['publish_time'] = video_item.xpath('.//small[@class="time hidden"]/text()').get()
item['status'] = 0 # 初始状态:待处理
# 请求详情页
yield scrapy.Request(
url=detail_url,
callback=self.parse_detail,
meta={'item': item}
)
# 处理分页
# 使用正则匹配 createPageHTML 的参数
script_text = response.xpath('//script[contains(., "createPageHTML")]/text()').get()
if script_text:
page_match = re.findall(r"'([^']+)'", script_text)
if page_match:
max_page = int(page_match[0]) # 10
cur_page = int(page_match[1]) # 10
if max_page > cur_page:
next_page = urljoin(response.url, f"index_{cur_page + 1}.html")
if next_page and limit_status is False:
print(f"开始爬取下一页:{next_page}")
next_url = urljoin(response.url, next_page)
yield scrapy.Request(url=next_url, callback=self.parse)
def parse_detail(self, response):
"""解析详情页
Args:
response: 响应对象
"""
item = response.meta['item']
# 提取标题
item['title'] = response.xpath('//div[@class="video-header"]/h2/text()').get().strip()
# 提取视频简介
description = response.xpath('//div[@id="content-source"]/text()').get()
item['description'] = description.strip() if description else ""
# 提取视频URL
video_url = response.xpath('//div[@id="new_cmplayer"]/@data-media').get()
if video_url:
item['video_url'] = urljoin(response.url, video_url) if video_url else ""
# 处理时间格式
if item.get('publish_time'):
try:
# 假设时间格式为 "YYYY-MM-DD HH:MM:SS"
datetime.strptime(item['publish_time'], '%Y-%m-%d %H:%M:%S')
except ValueError:
# 如果解析失败,使用当前时间
item['publish_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
yield item
def closed(self, reason):
"""爬虫关闭时的回调函数
Args:
reason: 关闭原因
"""
self.logger.info(f'Spider closed: {reason}')

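Outside the API, the spider can also be run directly with the same arguments ZGJSSpider accepts; the URL and video_list shown are examples:

```bash
scrapy crawl zgjs \
  -a url="http://tv.81.cn/zgjs/jsjs/index.html" \
  -a video_list=1
```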
39
scripts/build-docker.sh Normal file

@@ -0,0 +1,39 @@
#!/bin/bash

# Version file
VERSION_FILE=".version"
# Default version
DEFAULT_VERSION="0.1"

# If no version is given, bump it automatically
if [ -z "$1" ]; then
    # Check for an existing, non-empty version file (-s)
    if [ -f "$VERSION_FILE" ] && [ -s "$VERSION_FILE" ]; then
        CURRENT_VERSION=$(cat "$VERSION_FILE")
        # Bump by 0.1 (only the simple "0.1" style is supported).
        # awk is used rather than bc because bc prints ".2" without a leading
        # zero, which would then fail the format check below.
        NEW_VERSION=$(awk -v v="$CURRENT_VERSION" 'BEGIN { printf "%.1f", v + 0.1 }')
    else
        NEW_VERSION="$DEFAULT_VERSION"
    fi
else
    # Use the manually specified version
    NEW_VERSION="$1"
fi

# Make sure the version looks valid (starts with a digit)
if [[ ! "$NEW_VERSION" =~ ^[0-9] ]]; then
    NEW_VERSION="$DEFAULT_VERSION"
fi

# Save the new version
echo "$NEW_VERSION" > "$VERSION_FILE"

# Build the Docker image
IMAGE_NAME="crawler_zgjs"
TAG="$IMAGE_NAME:$NEW_VERSION"

echo "🛠️ 构建Docker镜像: $TAG"
docker build -t "$TAG" .

echo "✅ 构建成功! 版本号: $NEW_VERSION"

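Typical invocations (a sketch; run from the repository root):

```bash
bash scripts/build-docker.sh        # bump the version in .version and build
bash scripts/build-docker.sh 0.3    # build with an explicit version tag
```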
95
scripts/export-docker.sh Executable file

@@ -0,0 +1,95 @@
#!/bin/bash
# 版本号文件
VERSION_FILE=".version"
# 默认镜像名称与build-docker.sh保持一致
DEFAULT_IMAGE_NAME="crawler_zgjs"
# 显示帮助信息
show_help() {
echo "使用方法: $0 [选项]"
echo
echo "选项:"
echo " -h, --help 显示帮助信息"
echo " -v, --version 指定版本号(可选,默认使用.version文件中的版本"
echo " -n, --name 指定镜像名称(可选,默认为 $DEFAULT_IMAGE_NAME"
echo " -o, --output 指定输出文件路径(可选,默认为 {镜像名称}-{版本号}.tar"
echo
echo "示例:"
echo " $0 # 使用默认设置导出镜像"
echo " $0 -v 1.0 # 导出指定版本的镜像"
echo " $0 -n custom_name -v 2.0 # 导出指定名称和版本的镜像"
echo " $0 -o /path/to/image.tar # 指定输出文件路径"
}
# 解析命令行参数
VERSION=""
IMAGE_NAME="$DEFAULT_IMAGE_NAME"
OUTPUT_FILE=""
while [[ $# -gt 0 ]]; do
case $1 in
-h|--help)
show_help
exit 0
;;
-v|--version)
VERSION="$2"
shift 2
;;
-n|--name)
IMAGE_NAME="$2"
shift 2
;;
-o|--output)
OUTPUT_FILE="$2"
shift 2
;;
*)
echo "❌ 错误: 未知参数 $1"
show_help
exit 1
;;
esac
done
# 如果未指定版本,从文件读取
if [ -z "$VERSION" ]; then
if [ -f "$VERSION_FILE" ] && [ -s "$VERSION_FILE" ]; then
VERSION=$(cat "$VERSION_FILE")
else
echo "❌ 错误: 未指定版本且无法从 $VERSION_FILE 读取版本号"
exit 1
fi
fi
# 构建完整的镜像标签
TAG="$IMAGE_NAME:$VERSION"
# 如果未指定输出文件,使用默认命名
if [ -z "$OUTPUT_FILE" ]; then
OUTPUT_FILE="${IMAGE_NAME}-${VERSION}.tar"
fi
# 检查镜像是否存在
if ! docker image inspect "$TAG" >/dev/null 2>&1; then
echo "❌ 错误: 镜像 $TAG 不存在"
echo "提示: 请先使用 build-docker.sh 构建镜像"
exit 1
fi
echo "🚀 开始导出Docker镜像..."
echo "📦 镜像: $TAG"
echo "📄 输出: $OUTPUT_FILE"
# 导出镜像
if docker save -o "$OUTPUT_FILE" "$TAG"; then
echo "✅ 导出成功!"
echo "📁 文件大小: $(du -h "$OUTPUT_FILE" | cut -f1)"
else
echo "❌ 导出失败!"
# 如果导出失败,清理可能部分写入的文件
[ -f "$OUTPUT_FILE" ] && rm "$OUTPUT_FILE"
exit 1
fi

2078
static/bootstrap/css/bootstrap-icons.css vendored Normal file

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

1006
static/index.html Normal file

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

2
static/layui/layui.js Normal file

File diff suppressed because one or more lines are too long