from datetime import datetime
import logging
import json
import requests
import time
import base64
import oss2
from aliyunsdkcore.client import AcsClient
from aliyunsdkvod.request.v20170321.CreateUploadVideoRequest import CreateUploadVideoRequest
from aliyunsdkvod.request.v20170321.GetVideoInfoRequest import GetVideoInfoRequest
from aliyunsdkvod.request.v20170321.UpdateVideoInfoRequest import UpdateVideoInfoRequest
from aliyunsdkvod.request.v20170321.CreateUploadImageRequest import CreateUploadImageRequest
from sqlalchemy import text

from .database import DatabaseManager
from .models import VideoSQLite

logger = logging.getLogger(__name__)

class AliyunVodPipeline:
    """Aliyun VOD (Video on Demand) upload pipeline."""

    def __init__(self, settings):
        """Initialize the Aliyun VOD pipeline.

        Args:
            settings: Scrapy settings object
        """
        self.settings = settings
        self.access_key_id = settings.get('ALIYUN_ACCESS_KEY_ID')
        self.access_key_secret = settings.get('ALIYUN_ACCESS_KEY_SECRET')
        self.template_group_id = settings.get('ALIYUN_TEMPLATE_GROUP_ID')
        self.client = AcsClient(self.access_key_id, self.access_key_secret, 'cn-shanghai')
        self.oss_client = oss2.Auth(self.access_key_id, self.access_key_secret)
        self.db_manager = DatabaseManager(settings)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def upload_media_by_url(self, video_url, title, cover_url=None):
        """Upload a video to Aliyun VOD by URL.

        Args:
            video_url: video URL
            title: video title
            cover_url: cover image URL

        Returns:
            str: upload job ID (JobId)
        """
        from aliyunsdkvod.request.v20170321.UploadMediaByURLRequest import UploadMediaByURLRequest

        request = UploadMediaByURLRequest()
        request.set_accept_format('JSON')

        # Set the video URL
        logger.info(f"Uploading video URL: {video_url}")
        request.set_UploadURLs(video_url)

        # Set the video metadata (must be a JSON array string)
        upload_metadata = [{
            'Title': title,
            'SourceURL': video_url,
            'TemplateGroupId': self.template_group_id
        }]
        # Set the cover URL
        # if cover_url:
        #     upload_metadata[0]['CoverURL'] = cover_url

        request.set_UploadMetadatas(json.dumps(upload_metadata))

        response = self.client.do_action_with_exception(request)
        result = json.loads(response)

        # Return the JobId of the first upload job
        upload_jobs = result.get('UploadJobs', [])
        if not upload_jobs:
            raise Exception("No upload job created")

        job = upload_jobs[0]
        # if job.get('Code') != 'Success':
        #     raise Exception(f"Upload job failed: {job}")

        return job.get('JobId')  # Return the JobId rather than the VideoId
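
    # Illustrative shape of the UploadMediaByURL response parsed above — a
    # sketch inferred from the fields this method reads; the real payload may
    # carry additional fields:
    #
    # {
    #     "RequestId": "...",
    #     "UploadJobs": [
    #         {"JobId": "...", "SourceURL": "..."}
    #     ]
    # }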

    def get_upload_job_status(self, job_id):
        """Get the status of a URL upload job.

        Args:
            job_id: upload job ID

        Returns:
            dict: job status info, including the MediaId (the video ID) once
                the upload has finished
        """
        from aliyunsdkvod.request.v20170321.GetURLUploadInfosRequest import GetURLUploadInfosRequest

        request = GetURLUploadInfosRequest()
        request.set_accept_format('JSON')
        request.set_JobIds(job_id)

        response = self.client.do_action_with_exception(request)
        result = json.loads(response)

        upload_jobs = result.get('URLUploadInfoList', [])
        if not upload_jobs:
            raise Exception(f"No upload job found with ID: {job_id}")

        job = upload_jobs[0]
        return job
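
    # Sketch of a URLUploadInfoList entry as consumed here: only MediaId is
    # actually read by wait_for_video_id below; the other fields shown are
    # assumptions about the payload, not guarantees:
    #
    # {"JobId": "...", "UploadURL": "...", "MediaId": "..."}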

    def wait_for_video_id(self, job_id, max_retries=5, retry_interval=2):
        """Wait for an upload job to finish and return its VideoId.

        Args:
            job_id: upload job ID
            max_retries: maximum number of polls
            retry_interval: seconds between polls

        Returns:
            str: video ID
        """
        for _ in range(max_retries):
            job_status = self.get_upload_job_status(job_id)

            if job_status.get('MediaId'):
                return job_status.get('MediaId')

            # Wait a little before polling again
            time.sleep(retry_interval)

        raise Exception(f"Max retries reached, upload job not completed: {job_id}")
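
    # The defaults above poll for only max_retries * retry_interval ~ 10 s.
    # For slow URL pulls, an exponential-backoff variant is one option — a
    # sketch, not wired into process_item:
    def wait_for_video_id_backoff(self, job_id, max_retries=8, base_interval=1):
        """Like wait_for_video_id, but doubles the sleep after each attempt."""
        interval = base_interval
        for _ in range(max_retries):
            job_status = self.get_upload_job_status(job_id)
            if job_status.get('MediaId'):
                return job_status.get('MediaId')
            time.sleep(interval)
            interval *= 2  # back off: 1, 2, 4, 8, ... seconds
        raise Exception(f"Max retries reached, upload job not completed: {job_id}")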

    def upload_image_to_oss(self, image_url, title):
        """Upload an image directly to Aliyun OSS.

        Args:
            image_url: image URL
            title: image title

        Returns:
            str: URL of the image in OSS
        """
        logger.info(f"Uploading image to OSS: {image_url}")

        try:
            # 1. Download the remote image
            image_response = requests.get(image_url, timeout=30)
            image_response.raise_for_status()
            image_content = image_response.content
        except Exception as e:
            logger.error(f"Failed to download image: {str(e)}")
            raise Exception(f"Failed to download image: {str(e)}")

        try:
            # 2. Build the OSS object key (timestamp plus a truncated title)
            timestamp = int(time.time())
            # Strip any query string before reading the file extension
            path = image_url.split('?')[0]
            file_ext = path.split('.')[-1] if '.' in path else 'jpg'
            oss_filename = f"images/{timestamp}_{title[:30]}.{file_ext}"  # cap the title to keep the key short

            # 3. Open the OSS bucket (configured in settings)
            bucket_name = self.settings.get('ALIYUN_OSS_BUCKET')
            endpoint = self.settings.get('ALIYUN_OSS_ENDPOINT')
            oss_bucket = oss2.Bucket(self.oss_client, endpoint, bucket_name)

            # 4. Upload the image to OSS
            upload_response = oss_bucket.put_object(oss_filename, image_content)
            if upload_response.status == 200:
                # 5. Return the publicly accessible URL
                oss_url = f"https://{bucket_name}.{endpoint}/{oss_filename}"
                logger.info(f"Image uploaded: {oss_url}")
                return oss_url
            else:
                raise Exception(f"Image upload failed: {upload_response.status}")
        except Exception as e:
            logger.error(f"Failed to upload image to OSS: {str(e)}")
            raise Exception(f"Failed to upload image to OSS: {str(e)}")

    def process_item(self, item, spider):
        """Process an item: upload its video to Aliyun VOD by URL.

        Args:
            item: scraped item
            spider: spider instance

        Returns:
            item: the processed item
        """
        # Skip items that already have an Aliyun video ID
        with self.db_manager.sqlite_session() as session:
            # Look for an existing record with the same source_url
            existing_video = session.query(VideoSQLite).filter_by(source_url=item.get('source_url')).first()
            if existing_video and existing_video.aliyun_video_id:
                logger.info(f"Aliyun video ID already exists, skipping: {item.get('title')}")
                return item

        video_url = item.get('video_url')
        if not video_url:
            logger.warning(f"Empty video URL, skipping: {item.get('source_url')}")
            return item

        try:
            # 1. Upload the cover image to OSS (if there is one)
            oss_url = None  # stays None when there is no cover or the upload fails
            cover_url = item.get('source_thumbnail_url')
            if cover_url:
                try:
                    oss_url = self.upload_image_to_oss(
                        image_url=cover_url,
                        title=item.get('title', '')
                    )
                    # Point the item's cover URL at the OSS copy
                    item['thumbnail_url'] = oss_url
                    logger.info(f"Cover image uploaded to OSS: {oss_url}")
                except Exception as e:
                    logger.error(f"Failed to upload cover image to OSS: {str(e)}")
                    # A failed cover upload should not abort the video upload

            # 2. Upload the video by URL and get a JobId
            title = item.get('title', '')
            job_id = self.upload_media_by_url(
                video_url=video_url,
                title=title,
                cover_url=oss_url  # the OSS cover URL, or None if none was uploaded
            )

            logger.info(f"Created Aliyun URL upload job: job_id={job_id}, title={title}")

            # 3. Wait for the upload to finish and fetch the VideoId
            try:
                video_id = self.wait_for_video_id(job_id)
                logger.info(f"Video upload finished: video_id={video_id}, job_id={job_id}")

                # 4. Record the Aliyun video ID and status on the item
                item['aliyun_video_id'] = video_id
                item['aliyun_status'] = 'Success'

            except Exception as e:
                logger.error(f"Timed out waiting for upload: job_id={job_id}, error={str(e)}")
                item['aliyun_video_id'] = ""
                # The job may still complete on Aliyun's side, so keep the
                # status as 'Uploading' and swallow the timeout here; re-raising
                # would hit the handler below and overwrite it with 'Failed'
                item['aliyun_status'] = 'Uploading'

        except Exception as e:
            logger.error(f"Aliyun URL upload failed: {str(e)}")
            item['aliyun_status'] = 'Failed'

        return item
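

# A minimal wiring sketch for settings.py (the module path and priority
# numbers are assumptions; the ALIYUN_* keys are the ones read in this file,
# and DatabaseManager presumably reads its own connection settings):
#
# ITEM_PIPELINES = {
#     'myproject.pipelines.AliyunVodPipeline': 300,  # upload first
#     'myproject.pipelines.SQLitePipeline': 400,     # then persist locally
#     'myproject.pipelines.MariaDBPipeline': 500,    # then migrate on close
# }
# ALIYUN_ACCESS_KEY_ID = '...'
# ALIYUN_ACCESS_KEY_SECRET = '...'
# ALIYUN_TEMPLATE_GROUP_ID = '...'
# ALIYUN_OSS_BUCKET = '...'
# ALIYUN_OSS_ENDPOINT = '...'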


class SQLitePipeline:
    """SQLite persistence pipeline."""

    def __init__(self, settings):
        """Initialize the SQLite pipeline.

        Args:
            settings: Scrapy settings object
        """
        self.db_manager = DatabaseManager(settings)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def process_item(self, item, spider):
        """Process an item: save it to the SQLite database.

        Args:
            item: scraped item
            spider: spider instance

        Returns:
            item: the processed item
        """
        now = datetime.now()
        now_str = now.strftime('%Y-%m-%d %H:%M:%S')

        with self.db_manager.sqlite_session() as session:
            # Look for an existing record with the same source_url
            existing_video = session.query(VideoSQLite).filter_by(source_url=item.get('source_url')).first()

            if existing_video:
                logger.info(f"Duplicate video found: {item.get('source_url')}")
                # Update the existing record
                existing_video.title = item.get('title', '')
                existing_video.description = item.get('description', '')
                existing_video.publish_time = item.get('publish_time', '')
                existing_video.update_time = now_str
                existing_video.video_url = item.get('video_url', '')
                existing_video.source_thumbnail_url = item.get('source_thumbnail_url', '')
                existing_video.duration = str(item.get('duration', ''))
                existing_video.video_list = str(item.get('video_list', 0))
                # Only overwrite aliyun_video_id, aliyun_status and
                # thumbnail_url when the item actually carries new values
                if item.get('aliyun_video_id'):
                    existing_video.aliyun_video_id = item['aliyun_video_id']
                if item.get('aliyun_status'):
                    existing_video.aliyun_status = item['aliyun_status']
                if item.get('thumbnail_url'):
                    existing_video.thumbnail_url = item['thumbnail_url']
                # existing_video.status = 0  # reset status to 0

                # Stash the SQLite row ID on the item for downstream pipelines
                item['sqlite_id'] = existing_video.id

            else:
                # Create a new record
                sqlite_data = {
                    'title': item.get('title', ''),
                    'description': item.get('description', ''),
                    'source_url': item.get('source_url', ''),
                    'publish_time': item.get('publish_time', ''),
                    'create_time': now_str,
                    'update_time': now_str,
                    'video_url': item.get('video_url', ''),
                    'source_thumbnail_url': item.get('source_thumbnail_url', ''),
                    'thumbnail_url': item.get('thumbnail_url', ''),
                    'duration': str(item.get('duration', '')),
                    'video_list': item.get('video_list', ''),
                    'aliyun_video_id': item.get('aliyun_video_id', ''),
                    'aliyun_status': item.get('aliyun_status', ''),
                    'status': 0
                }

                new_video = VideoSQLite(**sqlite_data)
                session.add(new_video)
                session.flush()  # populate the new row's autoincrement ID

                # Stash the SQLite row ID on the item for downstream pipelines
                item['sqlite_id'] = new_video.id

        return item
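

# Sketch of the VideoSQLite model implied by the columns used above. The real
# model lives in .models; the table name and column types here are assumptions:
#
# class VideoSQLite(Base):
#     __tablename__ = 'videos'
#     id = Column(Integer, primary_key=True, autoincrement=True)
#     title = Column(String)
#     description = Column(Text)
#     source_url = Column(String, unique=True)
#     publish_time = Column(String)
#     create_time = Column(String)
#     update_time = Column(String)
#     video_url = Column(String)
#     source_thumbnail_url = Column(String)
#     thumbnail_url = Column(String)
#     duration = Column(String)
#     video_list = Column(String)
#     aliyun_video_id = Column(String)
#     aliyun_status = Column(String)
#     status = Column(Integer)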


class MariaDBPipeline:
    """Pipeline that migrates data from SQLite to MariaDB."""

    def __init__(self, settings):
        """Initialize the pipeline.

        Args:
            settings: Scrapy settings object
        """
        self.db_manager = DatabaseManager(settings)
        self.logger = logging.getLogger(__name__)

    @classmethod
    def from_crawler(cls, crawler):
        """Create a pipeline instance from a crawler.

        Args:
            crawler: Scrapy crawler object

        Returns:
            MariaDBPipeline: pipeline instance
        """
        return cls(crawler.settings)

    def open_spider(self, spider):
        """Called when the spider opens."""
        self.logger.info("MariaDB pipeline opened")

    def close_spider(self, spider):
        """Called when the spider closes."""
        self.logger.info("MariaDB pipeline closed")
        self.migrate_data()

    def process_item(self, item, spider):
        """Process an item.

        Args:
            item: Scrapy item object
            spider: Scrapy spider object

        Returns:
            item: the processed item
        """
        # Nothing to do per item; migration reads everything from SQLite on close
        return item

    def migrate_data(self):
        """Migrate data from SQLite to MariaDB."""
        try:
            with self.db_manager.sqlite_session() as sqlite_session, \
                 self.db_manager.mysql_session() as mysql_session:

                # 1. Read videos from SQLite that already have an Aliyun video ID
                sqlite_videos = sqlite_session.query(VideoSQLite).where(
                    (VideoSQLite.aliyun_video_id != None) & (VideoSQLite.aliyun_video_id != '')
                ).all()
                # sqlite_videos = sqlite_session.query(VideoSQLite).all()

                # 2. Migrate them to MariaDB in one batch
                migrated = 0
                for video in sqlite_videos:
                    # De-duplicate on the remote video ID
                    existing_video_id = mysql_session.execute(
                        text("SELECT id FROM wz_video WHERE video_remote_id = :video_remote_id LIMIT 1"), {
                            'video_remote_id': video.aliyun_video_id
                        }
                    )
                    if existing_video_id.first():
                        self.logger.info(f"Video already exists in the remote database: {video.title}")
                        continue

                    # Map onto the wz_video table
                    wz_video = {
                        'cid': 1,
                        'title': video.title or '',
                        'css': '',
                        'thumb': video.thumbnail_url or '',
                        'keywords': '',
                        'remark': video.description or '',
                        'block': 0,
                        'url': '',
                        'status': 9,
                        'route': 0,
                        'publisher': 'spider',
                        'addtime': int(time.time()),
                        'updatetime': int(time.time()),
                        'area': '1',
                        'category': '1',
                        'theme': 0,
                        'year': '2025',
                        'video_remote_id': video.aliyun_video_id or '',
                        'video_url': '',
                        'video_list': video.video_list or 0,
                        'month': '1'
                    }

                    # Map onto the wz_video_data table
                    wz_video_data = {
                        'id': None,  # set after the wz_video insert
                        'content': '',
                        'coin': 0,
                        'groups': '',
                        'pagetype': 0,
                        'maxchars': 0,
                        'template': '',
                        'allowcomment': 1,
                        'relation': ''
                    }

                    # Insert into wz_video and grab the new row ID
                    result = mysql_session.execute(
                        text("""INSERT INTO wz_video (
                            cid, title, css, thumb, keywords, remark, block, url,
                            status, route, publisher, addtime, updatetime, area,
                            category, theme, year, video_remote_id, video_url,
                            video_list, month
                        ) VALUES (
                            :cid, :title, :css, :thumb, :keywords, :remark, :block, :url,
                            :status, :route, :publisher, :addtime, :updatetime, :area,
                            :category, :theme, :year, :video_remote_id, :video_url,
                            :video_list, :month
                        )"""),
                        wz_video
                    )
                    video_id = result.lastrowid

                    # Set the shared ID on wz_video_data and insert it
                    wz_video_data['id'] = video_id
                    mysql_session.execute(
                        text("""INSERT INTO wz_video_data (
                            id, content, coin, groups, pagetype, maxchars,
                            template, allowcomment, relation
                        ) VALUES (
                            :id, :content, :coin, :groups, :pagetype, :maxchars,
                            :template, :allowcomment, :relation
                        )"""),
                        wz_video_data
                    )
                    migrated += 1

                mysql_session.commit()
                self.logger.info(f"Migrated {migrated} of {len(sqlite_videos)} videos to the remote database")

        except Exception as e:
            self.logger.error(f"Data migration failed: {str(e)}")
            raise
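
# Note on transactionality: the single commit() above makes each migration run
# all-or-nothing — assuming the mysql_session context manager rolls back on
# error, a failed insert discards every row migrated in that pass. Committing
# inside the loop would trade that atomicity for keeping partial progress.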