import re
from datetime import datetime
from urllib.parse import urljoin

import scrapy

from ..items import VideoItem


class ZGJSSpider(scrapy.Spider):
    """Spider that crawls video listings and detail pages from tv.81.cn."""

    name = "zgjs"
    allowed_domains = ["tv.81.cn"]

    def __init__(self, url: str = None, video_list: int = 0, *args, **kwargs):
        """Initialize the spider.

        Args:
            url: Optional start URL (may be injected via an API call);
                falls back to the default listing page when omitted.
            video_list: Identifier of the target video list; copied onto
                every scraped item.
        """
        super(ZGJSSpider, self).__init__(*args, **kwargs)
        self.video_list = video_list
        self.start_urls = [url] if url else ["http://tv.81.cn/zgjs/jsjs/index.html"]

    def parse(self, response):
        """Parse a listing page: yield detail-page requests, then follow pagination.

        Args:
            response: Listing-page response.
        """
        self.logger.info("开始爬取")  # was print(); use the spider logger

        # Debug throttle: when limit_status is True, only the first
        # `limit_count` items are processed and pagination is skipped.
        limit_status = False
        limit_count = 3

        for video_item in response.xpath(
            '//li[@class="content-box col-lg-2-10 col-sm-3-12 col-xs-6-12"]'
        ):
            if limit_status and limit_count <= 0:
                return
            limit_count -= 1

            # Detail-page URL; skip entries without a link.
            detail_url = video_item.xpath('.//a/@href').get()
            if not detail_url:
                continue
            detail_url = urljoin(response.url, detail_url)

            item = VideoItem()
            item['video_list'] = self.video_list
            item['source_url'] = detail_url

            # BUG FIX: original evaluated the img XPath twice and wrapped the
            # result in a no-op str.format(); evaluate once and guard None.
            thumbnail = video_item.xpath('.//img/@src').get()
            item['source_thumbnail_url'] = "http://tv.81.cn" + thumbnail if thumbnail else ""

            # BUG FIX: .get() may return None; original crashed with
            # AttributeError on items lacking a duration span.
            duration = video_item.xpath('.//div[@class="video-des"]//span/text()').get()
            item['duration'] = duration.strip() if duration else ""

            item['publish_time'] = video_item.xpath('.//small[@class="time hidden"]/text()').get()
            item['status'] = 0  # initial state: pending

            yield scrapy.Request(
                url=detail_url,
                callback=self.parse_detail,
                meta={'item': item}
            )

        # Pagination: max/current page numbers are embedded as quoted
        # arguments of a createPageHTML(...) call in an inline script.
        script_text = response.xpath('//script[contains(., "createPageHTML")]/text()').get()
        if script_text:
            page_args = re.findall(r"'([^']+)'", script_text)
            # BUG FIX: original indexed [0] and [1] after only a truthiness
            # check, risking IndexError when fewer than two tokens match.
            if len(page_args) >= 2:
                max_page = int(page_args[0])
                cur_page = int(page_args[1])
                if max_page > cur_page and not limit_status:
                    next_url = urljoin(response.url, f"index_{cur_page + 1}.html")
                    self.logger.info(f"开始爬取下一页:{next_url}")
                    yield scrapy.Request(url=next_url, callback=self.parse)

    def parse_detail(self, response):
        """Parse a video detail page and yield the completed item.

        Args:
            response: Detail-page response carrying the partial item in meta.
        """
        item = response.meta['item']

        # BUG FIX: title .get() may return None; guard before .strip().
        title = response.xpath('//div[@class="video-header"]/h2/text()').get()
        item['title'] = title.strip() if title else ""

        description = response.xpath('//div[@id="content-source"]/text()').get()
        item['description'] = description.strip() if description else ""

        # BUG FIX: the original `else ""` was unreachable (nested inside
        # `if video_url:`); default the field to "" as evidently intended.
        video_url = response.xpath('//div[@id="new_cmplayer"]/@data-media').get()
        item['video_url'] = urljoin(response.url, video_url) if video_url else ""

        # Validate the publish time; fall back to "now" when the site's
        # value does not match the expected "YYYY-MM-DD HH:MM:SS" format.
        if item.get('publish_time'):
            try:
                datetime.strptime(item['publish_time'], '%Y-%m-%d %H:%M:%S')
            except ValueError:
                item['publish_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        yield item

    def closed(self, reason):
        """Log the shutdown reason when the spider closes.

        Args:
            reason: Scrapy-provided close reason.
        """
        self.logger.info(f'Spider closed: {reason}')