import re
from datetime import datetime
from urllib.parse import urljoin

import scrapy

from ..items import VideoItem


class ZGJSSpider(scrapy.Spider):
    name = "zgjs"
    allowed_domains = ["tv.81.cn"]

    def __init__(self, url: str = None, video_list: int = 0, *args, **kwargs):
        """Initialize the spider.

        Args:
            url: Start URL; can be passed in via the API.
            video_list: Video list identifier stored on each scraped item.
        """
        super().__init__(*args, **kwargs)
        self.video_list = video_list
        self.start_urls = [url] if url else ["http://tv.81.cn/zgjs/jsjs/index.html"]
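
    # Usage sketch (not from the source): `url` and `video_list` map to Scrapy's
    # standard -a spider arguments, e.g.
    #
    #   scrapy crawl zgjs -a url="http://tv.81.cn/zgjs/jsjs/index.html" -a video_list=1
    #
    # Note that -a values arrive as strings, so despite the int annotation,
    # video_list would be the string "1" here unless converted explicitly.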

    def parse(self, response):
        """Parse a list page.

        Args:
            response: Response object.
        """
        self.logger.info("Starting crawl")
        # Debug switch to limit the number of requests
        limit_status = False
        limit_count = 3

        # Parse the video list entries
        for video_item in response.xpath('//li[@class="content-box col-lg-2-10 col-sm-3-12 col-xs-6-12"]'):
            if limit_status and limit_count <= 0:
                return
            limit_count -= 1

            # Extract the detail-page URL
            detail_url = video_item.xpath('.//a/@href').get()
            if detail_url:
                detail_url = urljoin(response.url, detail_url)
                # Collect the basic metadata available on the list page
                item = VideoItem()
                item['video_list'] = self.video_list
                item['source_url'] = detail_url
                thumbnail_url = video_item.xpath('.//img/@src').get()
                item['source_thumbnail_url'] = urljoin(response.url, thumbnail_url) if thumbnail_url else ""
                duration = video_item.xpath('.//div[@class="video-des"]//span/text()').get()
                item['duration'] = duration.strip() if duration else ""
                item['publish_time'] = video_item.xpath('.//small[@class="time hidden"]/text()').get()
                item['status'] = 0  # Initial status: pending

                # Request the detail page, handing the partial item along in meta
                yield scrapy.Request(
                    url=detail_url,
                    callback=self.parse_detail,
                    meta={'item': item},
                )
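
                # Note: on Scrapy >= 1.7 the same hand-off is often written with
                # cb_kwargs instead of meta, e.g. (equivalent sketch):
                #   yield scrapy.Request(url=detail_url, callback=self.parse_detail,
                #                        cb_kwargs={'item': item})
                # parse_detail would then accept (self, response, item).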

        # Handle pagination: the page numbers are embedded as arguments of a
        # createPageHTML() call, so extract its quoted arguments with a regex
        script_text = response.xpath('//script[contains(., "createPageHTML")]/text()').get()
        if script_text:
            page_args = re.findall(r"'([^']+)'", script_text)
            if len(page_args) >= 2:
                max_page = int(page_args[0])  # total number of pages
                cur_page = int(page_args[1])  # current page number
                if max_page > cur_page and not limit_status:
                    next_url = urljoin(response.url, f"index_{cur_page + 1}.html")
                    self.logger.info(f"Crawling next page: {next_url}")
                    yield scrapy.Request(url=next_url, callback=self.parse)
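
    # Illustrative sketch (assumed markup, not verified against the live site):
    # the list page is expected to embed a pager call such as
    #
    #   <script>createPageHTML('12', '1', 'index', 'html');</script>
    #
    # from which re.findall(r"'([^']+)'", script_text) returns
    # ['12', '1', 'index', 'html']: total page count first, then current page.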

    def parse_detail(self, response):
        """Parse a detail page.

        Args:
            response: Response object.
        """
        item = response.meta['item']

        # Extract the title
        title = response.xpath('//div[@class="video-header"]/h2/text()').get()
        item['title'] = title.strip() if title else ""

        # Extract the video description
        description = response.xpath('//div[@id="content-source"]/text()').get()
        item['description'] = description.strip() if description else ""

        # Extract the video URL from the player's data-media attribute
        video_url = response.xpath('//div[@id="new_cmplayer"]/@data-media').get()
        if video_url:
            item['video_url'] = urljoin(response.url, video_url)

        # Validate the publish time format
        if item.get('publish_time'):
            try:
                # Expected format: "YYYY-MM-DD HH:MM:SS"
                datetime.strptime(item['publish_time'], '%Y-%m-%d %H:%M:%S')
            except ValueError:
                # Fall back to the current time if parsing fails
                item['publish_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
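
        # Note: strptime is used here only as validation; its return value is
        # discarded and only the ValueError matters. For example (illustrative):
        #   datetime.strptime('2025-01-02 03:04:05', '%Y-%m-%d %H:%M:%S')  # passes
        #   datetime.strptime('2025-01-02', '%Y-%m-%d %H:%M:%S')           # ValueError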

        yield item

    def closed(self, reason):
        """Callback invoked when the spider closes.

        Args:
            reason: Reason for closing.
        """
        self.logger.info(f'Spider closed: {reason}')
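

# Sketch of the VideoItem this spider assumes. The actual definition lives in
# ../items.py and is not shown in the source; the field names below are
# inferred from the assignments in this file:
#
#   import scrapy
#
#   class VideoItem(scrapy.Item):
#       video_list = scrapy.Field()
#       source_url = scrapy.Field()
#       source_thumbnail_url = scrapy.Field()
#       duration = scrapy.Field()
#       publish_time = scrapy.Field()
#       status = scrapy.Field()
#       title = scrapy.Field()
#       description = scrapy.Field()
#       video_url = scrapy.Field()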