# crawler_81tv/scrapy_proj/spiders/zgjs.py

import re
from datetime import datetime
from urllib.parse import urljoin

import scrapy

from ..items import VideoItem
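# For reference, `VideoItem` (defined in ../items.py) is assumed to declare
# the fields populated below. A minimal sketch, assuming plain Scrapy fields:
#
#     import scrapy
#
#     class VideoItem(scrapy.Item):
#         video_list = scrapy.Field()            # ID of the owning video list
#         source_url = scrapy.Field()            # detail-page URL
#         source_thumbnail_url = scrapy.Field()  # thumbnail URL
#         duration = scrapy.Field()              # e.g. "03:15"
#         publish_time = scrapy.Field()          # "YYYY-MM-DD HH:MM:SS"
#         status = scrapy.Field()                # 0 = pending
#         title = scrapy.Field()
#         description = scrapy.Field()
#         video_url = scrapy.Field()             # resolved media URL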
class ZGJSSpider(scrapy.Spider):
    name = "zgjs"
    allowed_domains = ["tv.81.cn"]

    def __init__(self, url: str = None, video_list: int = 0, *args, **kwargs):
        """Initialize the spider.

        Args:
            url: Start URL; can be passed in via the API (e.g. `-a url=...`).
            video_list: ID of the video list this crawl belongs to.
        """
        super().__init__(*args, **kwargs)
        # Arguments passed with `-a` arrive as strings, so coerce explicitly.
        self.video_list = int(video_list)
        self.start_urls = [url] if url else ["http://tv.81.cn/zgjs/jsjs/index.html"]
    def parse(self, response):
        """Parse a list page.

        Args:
            response: Response object.
        """
        self.logger.info("Crawl started")
        # Optional request limiter (disabled by default); useful for debugging.
        limit_status = False
        limit_count = 3
        # Parse the video list.
        for video_item in response.xpath('//li[@class="content-box col-lg-2-10 col-sm-3-12 col-xs-6-12"]'):
            if limit_status and limit_count <= 0:
                return
            limit_count -= 1
            # Get the detail-page URL.
            detail_url = video_item.xpath('.//a/@href').get()
            if detail_url:
                detail_url = urljoin(response.url, detail_url)
                # Collect the basic fields.
                item = VideoItem()
                item['video_list'] = self.video_list
                item['source_url'] = detail_url
                thumbnail = video_item.xpath('.//img/@src').get()
                item['source_thumbnail_url'] = urljoin("http://tv.81.cn", thumbnail) if thumbnail else ""
                duration = video_item.xpath('.//div[@class="video-des"]//span/text()').get()
                item['duration'] = duration.strip() if duration else ""
                item['publish_time'] = video_item.xpath('.//small[@class="time hidden"]/text()').get()
                item['status'] = 0  # Initial status: pending
                # Request the detail page.
                yield scrapy.Request(
                    url=detail_url,
                    callback=self.parse_detail,
                    meta={'item': item},
                )
        # Handle pagination: extract the arguments of createPageHTML()
        # from the inline script with a regex.
        script_text = response.xpath('//script[contains(., "createPageHTML")]/text()').get()
        if script_text:
            page_match = re.findall(r"'([^']+)'", script_text)
            if len(page_match) >= 2:
                max_page = int(page_match[0])  # total number of pages
                cur_page = int(page_match[1])  # current page index
                if max_page > cur_page and limit_status is False:
                    next_url = urljoin(response.url, f"index_{cur_page + 1}.html")
                    self.logger.info(f"Crawling next page: {next_url}")
                    yield scrapy.Request(url=next_url, callback=self.parse)
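    # The regex above assumes the list page embeds a pagination call of the
    # form (argument values are illustrative, not taken from the site):
    #
    #     <script>createPageHTML('12', '0', 'index', 'html');</script>
    #
    # i.e. the first quoted argument is the total page count and the second
    # is the current page index, with pages named index.html, index_1.html, ...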
    def parse_detail(self, response):
        """Parse a detail page.

        Args:
            response: Response object.
        """
        item = response.meta['item']
        # Extract the title.
        title = response.xpath('//div[@class="video-header"]/h2/text()').get()
        item['title'] = title.strip() if title else ""
        # Extract the video description.
        description = response.xpath('//div[@id="content-source"]/text()').get()
        item['description'] = description.strip() if description else ""
        # Extract the video URL.
        video_url = response.xpath('//div[@id="new_cmplayer"]/@data-media').get()
        item['video_url'] = urljoin(response.url, video_url) if video_url else ""
        # Normalize the publish time.
        if item.get('publish_time'):
            try:
                # Expect the format "YYYY-MM-DD HH:MM:SS".
                datetime.strptime(item['publish_time'], '%Y-%m-%d %H:%M:%S')
            except ValueError:
                # Fall back to the current time if parsing fails.
                item['publish_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        yield item
    def closed(self, reason):
        """Callback invoked when the spider closes.

        Args:
            reason: Reason the spider was closed.
        """
        self.logger.info(f'Spider closed: {reason}')
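
# Usage sketch (not part of the spider). Assuming the standard Scrapy project
# layout, the spider can be started from the command line:
#
#     scrapy crawl zgjs
#     scrapy crawl zgjs -a url="http://tv.81.cn/zgjs/jsjs/index.html" -a video_list=1
#
# or programmatically via CrawlerProcess:
#
#     from scrapy.crawler import CrawlerProcess
#     from scrapy.utils.project import get_project_settings
#
#     process = CrawlerProcess(get_project_settings())
#     process.crawl(ZGJSSpider, url="http://tv.81.cn/zgjs/jsjs/index.html")
#     process.start()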