import re
from datetime import datetime
from urllib.parse import urljoin

import scrapy

from ..items import VideoItem


class ZGJSSpider(scrapy.Spider):
    name = "zgjs"
    allowed_domains = ["tv.81.cn"]

    def __init__(self, url: str = None, video_list: int = 0, *args, **kwargs):
        """Initialize the spider.

        Args:
            url: Start URL; can be passed in via the API.
            video_list: Video list identifier stored on each scraped item.
        """
        super().__init__(*args, **kwargs)
        self.video_list = video_list
        self.start_urls = [url] if url else ["http://tv.81.cn/zgjs/jsjs/index.html"]
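
    # Usage sketch (not from the source): `url` and `video_list` map to Scrapy's
    # standard -a spider arguments, e.g.
    #
    #   scrapy crawl zgjs -a url="http://tv.81.cn/zgjs/jsjs/index.html" -a video_list=1
    #
    # Note that -a values arrive as strings, so despite the int annotation,
    # video_list would be the string "1" here unless converted explicitly.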

    def parse(self, response):
        """Parse a list page.

        Args:
            response: Response object.
        """
        self.logger.info("Starting crawl")
        # Debug switch to limit the number of requests
        limit_status = False
        limit_count = 3

        # Parse the video list entries
        for video_item in response.xpath('//li[@class="content-box col-lg-2-10 col-sm-3-12 col-xs-6-12"]'):
            if limit_status and limit_count <= 0:
                return
            limit_count -= 1

            # Extract the detail-page URL
            detail_url = video_item.xpath('.//a/@href').get()
            if detail_url:
                detail_url = urljoin(response.url, detail_url)
                # Collect the basic metadata available on the list page
                item = VideoItem()
                item['video_list'] = self.video_list
                item['source_url'] = detail_url
                thumbnail_url = video_item.xpath('.//img/@src').get()
                item['source_thumbnail_url'] = urljoin(response.url, thumbnail_url) if thumbnail_url else ""
                duration = video_item.xpath('.//div[@class="video-des"]//span/text()').get()
                item['duration'] = duration.strip() if duration else ""
                item['publish_time'] = video_item.xpath('.//small[@class="time hidden"]/text()').get()
                item['status'] = 0  # Initial status: pending

                # Request the detail page, handing the partial item along in meta
                yield scrapy.Request(
                    url=detail_url,
                    callback=self.parse_detail,
                    meta={'item': item},
                )
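
                # Note: on Scrapy >= 1.7 the same hand-off is often written with
                # cb_kwargs instead of meta, e.g. (equivalent sketch):
                #   yield scrapy.Request(url=detail_url, callback=self.parse_detail,
                #                        cb_kwargs={'item': item})
                # parse_detail would then accept (self, response, item).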

        # Handle pagination: the page numbers are embedded as arguments of a
        # createPageHTML() call, so extract its quoted arguments with a regex
        script_text = response.xpath('//script[contains(., "createPageHTML")]/text()').get()
        if script_text:
            page_args = re.findall(r"'([^']+)'", script_text)
            if len(page_args) >= 2:
                max_page = int(page_args[0])  # total number of pages
                cur_page = int(page_args[1])  # current page number
                if max_page > cur_page and not limit_status:
                    next_url = urljoin(response.url, f"index_{cur_page + 1}.html")
                    self.logger.info(f"Crawling next page: {next_url}")
                    yield scrapy.Request(url=next_url, callback=self.parse)
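
    # Illustrative sketch (assumed markup, not verified against the live site):
    # the list page is expected to embed a pager call such as
    #
    #   <script>createPageHTML('12', '1', 'index', 'html');</script>
    #
    # from which re.findall(r"'([^']+)'", script_text) returns
    # ['12', '1', 'index', 'html']: total page count first, then current page.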

    def parse_detail(self, response):
        """Parse a detail page.

        Args:
            response: Response object.
        """
        item = response.meta['item']

        # Extract the title
        title = response.xpath('//div[@class="video-header"]/h2/text()').get()
        item['title'] = title.strip() if title else ""

        # Extract the video description
        description = response.xpath('//div[@id="content-source"]/text()').get()
        item['description'] = description.strip() if description else ""

        # Extract the video URL from the player's data-media attribute
        video_url = response.xpath('//div[@id="new_cmplayer"]/@data-media').get()
        if video_url:
            item['video_url'] = urljoin(response.url, video_url)

        # Validate the publish time format
        if item.get('publish_time'):
            try:
                # Expected format: "YYYY-MM-DD HH:MM:SS"
                datetime.strptime(item['publish_time'], '%Y-%m-%d %H:%M:%S')
            except ValueError:
                # Fall back to the current time if parsing fails
                item['publish_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
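
        # Note: strptime is used here only as validation; its return value is
        # discarded and only the ValueError matters. For example (illustrative):
        #   datetime.strptime('2025-01-02 03:04:05', '%Y-%m-%d %H:%M:%S')  # passes
        #   datetime.strptime('2025-01-02', '%Y-%m-%d %H:%M:%S')           # ValueError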

        yield item

    def closed(self, reason):
        """Callback invoked when the spider closes.

        Args:
            reason: Reason for closing.
        """
        self.logger.info(f'Spider closed: {reason}')
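

# Sketch of the VideoItem this spider assumes. The actual definition lives in
# ../items.py and is not shown in the source; the field names below are
# inferred from the assignments in this file:
#
#   import scrapy
#
#   class VideoItem(scrapy.Item):
#       video_list = scrapy.Field()
#       source_url = scrapy.Field()
#       source_thumbnail_url = scrapy.Field()
#       duration = scrapy.Field()
#       publish_time = scrapy.Field()
#       status = scrapy.Field()
#       title = scrapy.Field()
#       description = scrapy.Field()
#       video_url = scrapy.Field()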