from datetime import datetime
import logging
import json
import requests
import time
import base64
import oss2
from aliyunsdkcore.client import AcsClient
from aliyunsdkvod.request.v20170321.CreateUploadVideoRequest import CreateUploadVideoRequest
from aliyunsdkvod.request.v20170321.GetVideoInfoRequest import GetVideoInfoRequest
from aliyunsdkvod.request.v20170321.UpdateVideoInfoRequest import UpdateVideoInfoRequest
from aliyunsdkvod.request.v20170321.CreateUploadImageRequest import CreateUploadImageRequest
from sqlalchemy import text

from .database import DatabaseManager
from .models import VideoSQLite

logger = logging.getLogger(__name__)


class AliyunVodPipeline:
    """Aliyun VOD (Video on Demand) pipeline."""

    def __init__(self, settings):
        """Initialize the Aliyun VOD pipeline and its SQLite database manager.

        Args:
            settings: Scrapy settings object
        """
        self.settings = settings
        self.access_key_id = settings.get('ALIYUN_ACCESS_KEY_ID')
        self.access_key_secret = settings.get('ALIYUN_ACCESS_KEY_SECRET')
        self.template_group_id = settings.get('ALIYUN_TEMPLATE_GROUP_ID')
        self.client = AcsClient(self.access_key_id, self.access_key_secret, 'cn-shanghai')
        self.oss_client = oss2.Auth(self.access_key_id, self.access_key_secret)
        self.db_manager = DatabaseManager(settings)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def upload_media_by_url(self, video_url, title, cover_url=None):
        """Upload a video to Aliyun VOD by URL (pull upload).

        Args:
            video_url: video URL
            title: video title
            cover_url: cover image URL

        Returns:
            str: upload job ID (JobId)
        """
        from aliyunsdkvod.request.v20170321.UploadMediaByURLRequest import UploadMediaByURLRequest

        request = UploadMediaByURLRequest()
        request.set_accept_format('JSON')

        # Set the video URL
        logger.info(f"Uploading video URL: {video_url}")
        request.set_UploadURLs(video_url)

        # Set the video metadata (must be a JSON array string)
        upload_metadata = [{
            'Title': title,
            'SourceURL': video_url,
            'TemplateGroupId': self.template_group_id
        }]
        # Set the cover URL
        # if cover_url:
        #     upload_metadata[0]['CoverURL'] = cover_url
        request.set_UploadMetadatas(json.dumps(upload_metadata))

        response = self.client.do_action_with_exception(request)
        result = json.loads(response)

        # Return the JobId of the first upload job
        upload_jobs = result.get('UploadJobs', [])
        if not upload_jobs:
            raise Exception("No upload job created")
        job = upload_jobs[0]
        # if job.get('Code') != 'Success':
        #     raise Exception(f"Upload job failed: {job}")
        return job.get('JobId')  # Return the JobId, not the VideoId

    def get_upload_job_status(self, job_id):
        """Get the status of an upload job.

        Args:
            job_id: upload job ID

        Returns:
            dict: job status info, including the VideoId once the upload has finished
        """
        from aliyunsdkvod.request.v20170321.GetURLUploadInfosRequest import GetURLUploadInfosRequest

        request = GetURLUploadInfosRequest()
        request.set_accept_format('JSON')
        request.set_JobIds(job_id)

        response = self.client.do_action_with_exception(request)
        result = json.loads(response)

        upload_jobs = result.get('URLUploadInfoList', [])
        if not upload_jobs:
            raise Exception(f"No upload job found with ID: {job_id}")
        job = upload_jobs[0]
        return job

    def wait_for_video_id(self, job_id, max_retries=5, retry_interval=2):
        """Wait for the upload job to finish and return the VideoId.

        Args:
            job_id: upload job ID
            max_retries: maximum number of polling attempts
            retry_interval: interval between attempts in seconds

        Returns:
            str: video ID
        """
        for _ in range(max_retries):
            job_status = self.get_upload_job_status(job_id)
            if job_status.get('MediaId'):
                return job_status.get('MediaId')
            # Wait a while before polling again
            time.sleep(retry_interval)
        raise Exception(f"Max retries reached, upload job not completed: {job_id}")
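
    # For illustration only (inferred from the .get() calls above, not from any SDK
    # docs bundled with this repo): the two helpers above assume responses roughly
    # shaped like
    #   UploadMediaByURL  -> {"UploadJobs": [{"JobId": "..."}]}
    #   GetURLUploadInfos -> {"URLUploadInfoList": [{"MediaId": "...", ...}]}
    # where MediaId only appears once the pull upload has finished.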

    def upload_image_to_oss(self, image_url, title):
        """Upload an image directly to Aliyun OSS.

        Args:
            image_url: image URL
            title: image title

        Returns:
            str: publicly accessible URL of the image in OSS
        """
        logger.info(f"Uploading image to OSS: {image_url}")
        try:
            # 1. Download the remote image
            image_response = requests.get(image_url, timeout=30)
            image_response.raise_for_status()
            image_content = image_response.content
        except Exception as e:
            logger.error(f"Failed to download image: {str(e)}")
            raise Exception(f"Failed to download image: {str(e)}")

        try:
            # 2. Build the OSS object name (timestamp plus truncated title and original extension)
            timestamp = int(time.time())
            file_ext = image_url.split('.')[-1] if '.' in image_url else 'jpg'
            oss_filename = f"images/{timestamp}_{title[:30]}.{file_ext}"  # Truncate the title to keep the object name short

            # 3. Get the OSS bucket (configuration comes from settings)
            bucket_name = self.settings.get('ALIYUN_OSS_BUCKET')
            endpoint = self.settings.get('ALIYUN_OSS_ENDPOINT')
            oss_bucket = oss2.Bucket(self.oss_client, endpoint, bucket_name)

            # 4. Upload the image to OSS
            upload_response = oss_bucket.put_object(oss_filename, image_content)
            if upload_response.status == 200:
                # 5. Return the publicly accessible URL
                oss_url = f"https://{bucket_name}.{endpoint}/{oss_filename}"
                logger.info(f"Image uploaded successfully: {oss_url}")
                return oss_url
            else:
                raise Exception(f"Image upload failed: {upload_response.status}")
        except Exception as e:
            logger.error(f"Failed to upload image to OSS: {str(e)}")
            raise Exception(f"Failed to upload image to OSS: {str(e)}")

    def process_item(self, item, spider):
        """Process an item: upload its video to Aliyun VOD by URL.

        Args:
            item: scraped item
            spider: spider instance

        Returns:
            item: the processed item
        """
        # Skip processing if an Aliyun video ID already exists
        with self.db_manager.sqlite_session() as session:
            # Check whether a record with the same source_url already exists
            existing_video = session.query(VideoSQLite).filter_by(source_url=item.get('source_url')).first()
            if existing_video and existing_video.aliyun_video_id:
                logger.info(f"Aliyun video ID already exists, skipping: {item.get('title')}")
                return item

        video_url = item.get('video_url')
        if not video_url:
            logger.warning(f"Video URL is empty, skipping: {item.get('source_url')}")
            return item

        try:
            # 1. Upload the cover image to OSS (if any)
            oss_url = None  # stays None when there is no cover or the cover upload fails
            cover_url = item.get('source_thumbnail_url')
            if cover_url:
                try:
                    oss_url = self.upload_image_to_oss(
                        image_url=cover_url,
                        title=item.get('title', '')
                    )
                    # Replace the cover URL in the item with the OSS URL
                    item['thumbnail_url'] = oss_url
                    logger.info(f"Cover image uploaded to OSS: {oss_url}")
                except Exception as e:
                    logger.error(f"Failed to upload cover image to OSS: {str(e)}")
                    # If the cover upload fails, keep processing the video instead of aborting

            # 2. Upload the video by URL and get the JobId
            title = item.get('title', '')
            job_id = self.upload_media_by_url(
                video_url=video_url,
                title=title,
                cover_url=oss_url  # the OSS cover URL uploaded above, or None
            )
            logger.info(f"Created Aliyun VOD URL upload job: job_id={job_id}, title={title}")

            # 3. Wait for the upload to finish and get the VideoId
            try:
                video_id = self.wait_for_video_id(job_id)
                logger.info(f"Video upload finished: video_id={video_id}, job_id={job_id}")
                # Store the Aliyun video ID and status on the item
                item['aliyun_video_id'] = video_id
                item['aliyun_status'] = 'Success'
            except Exception as e:
                logger.error(f"Waiting for video upload to finish failed: job_id={job_id}, error={str(e)}")
                item['aliyun_video_id'] = ""
                item['aliyun_status'] = 'Uploading'
                raise  # Re-raise so the outer error handling deals with it
        except Exception as e:
            logger.error(f"Aliyun VOD URL upload failed: {str(e)}")
            item['aliyun_status'] = 'Failed'

        return item
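
# A minimal sketch of the Scrapy configuration these pipelines expect. The module path,
# priorities, and values are illustrative assumptions; adjust to the real settings.py.
#
#   ITEM_PIPELINES = {
#       "myproject.pipelines.AliyunVodPipeline": 300,
#       "myproject.pipelines.SQLitePipeline": 400,
#       "myproject.pipelines.MariaDBPipeline": 500,
#   }
#   ALIYUN_ACCESS_KEY_ID = "..."
#   ALIYUN_ACCESS_KEY_SECRET = "..."
#   ALIYUN_TEMPLATE_GROUP_ID = "..."
#   ALIYUN_OSS_BUCKET = "my-bucket"
#   ALIYUN_OSS_ENDPOINT = "oss-cn-shanghai.aliyuncs.com"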


class SQLitePipeline:
    """SQLite database pipeline."""

    def __init__(self, settings):
        """Initialize the SQLite pipeline.

        Args:
            settings: Scrapy settings object
        """
        self.db_manager = DatabaseManager(settings)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def process_item(self, item, spider):
        """Process an item and persist it to the SQLite database.

        Args:
            item: scraped item
            spider: spider instance

        Returns:
            item: the processed item
        """
        now = datetime.now()
        now_str = now.strftime('%Y-%m-%d %H:%M:%S')

        with self.db_manager.sqlite_session() as session:
            # Check whether a record with the same source_url already exists
            existing_video = session.query(VideoSQLite).filter_by(source_url=item.get('source_url')).first()
            if existing_video:
                logger.info(f"Duplicate video found: {item.get('source_url')}")
                # Update the existing record
                existing_video.title = item.get('title', '')
                existing_video.description = item.get('description', '')
                existing_video.publish_time = item.get('publish_time', '')
                existing_video.update_time = now_str
                existing_video.video_url = item.get('video_url', '')
                existing_video.source_thumbnail_url = item.get('source_thumbnail_url', '')
                existing_video.duration = str(item.get('duration', ''))
                existing_video.video_list = str(item.get('video_list', 0))
                # Only overwrite aliyun_video_id / aliyun_status / thumbnail_url when new values exist
                if item.get('aliyun_video_id'):
                    existing_video.aliyun_video_id = item['aliyun_video_id']
                if item.get('aliyun_status'):
                    existing_video.aliyun_status = item['aliyun_status']
                if item.get('thumbnail_url'):
                    existing_video.thumbnail_url = item['thumbnail_url']
                # existing_video.status = 0  # reset status to 0

                # Save the SQLite record ID on the item for downstream pipelines
                item['sqlite_id'] = existing_video.id
            else:
                # Create a new record
                sqlite_data = {
                    'title': item.get('title', ''),
                    'description': item.get('description', ''),
                    'source_url': item.get('source_url', ''),
                    'publish_time': item.get('publish_time', ''),
                    'create_time': now_str,
                    'update_time': now_str,
                    'video_url': item.get('video_url', ''),
                    'source_thumbnail_url': item.get('source_thumbnail_url', ''),
                    'thumbnail_url': item.get('thumbnail_url', ''),
                    'duration': str(item.get('duration', '')),
                    'video_list': item.get('video_list', ''),
                    'aliyun_video_id': item.get('aliyun_video_id', ''),
                    'aliyun_status': item.get('aliyun_status', ''),
                    'status': 0
                }
                new_video = VideoSQLite(**sqlite_data)
                session.add(new_video)
                session.flush()  # Populate the ID of the newly inserted record

                # Save the SQLite record ID on the item for downstream pipelines
                item['sqlite_id'] = new_video.id

        return item
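
# For reference: the pipelines above assume the VideoSQLite model (defined in .models)
# exposes at least the following columns; the exact column types live in the model itself:
#   id, title, description, source_url, publish_time, create_time, update_time,
#   video_url, source_thumbnail_url, thumbnail_url, duration, video_list,
#   aliyun_video_id, aliyun_status, status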


class MariaDBPipeline:
    """Pipeline that migrates data from SQLite to MariaDB."""

    def __init__(self, settings):
        """Initialize the pipeline.

        Args:
            settings: Scrapy settings object
        """
        self.db_manager = DatabaseManager(settings)
        self.logger = logging.getLogger(__name__)

    @classmethod
    def from_crawler(cls, crawler):
        """Create a pipeline instance from the crawler.

        Args:
            crawler: Scrapy crawler object

        Returns:
            MariaDBPipeline: pipeline instance
        """
        return cls(crawler.settings)

    def open_spider(self, spider):
        """Called when the spider is opened."""
        self.logger.info("MariaDB pipeline opened")

    def close_spider(self, spider):
        """Called when the spider is closed."""
        self.logger.info("MariaDB pipeline closed")
        self.migrate_data()

    def process_item(self, item, spider):
        """Process an item.

        Args:
            item: Scrapy item object
            spider: Scrapy spider object

        Returns:
            item: the processed item
        """
        # Nothing to do per item; the migration reads from SQLite when the spider closes
        return item

    def migrate_data(self):
        """Migrate data from SQLite to MariaDB."""
        try:
            with self.db_manager.sqlite_session() as sqlite_session, \
                 self.db_manager.mysql_session() as mysql_session:
                # 1. Read video records from SQLite (only those with an Aliyun video ID)
                sqlite_videos = sqlite_session.query(VideoSQLite).filter(
                    (VideoSQLite.aliyun_video_id != None) & (VideoSQLite.aliyun_video_id != '')  # noqa: E711
                ).all()
                # sqlite_videos = sqlite_session.query(VideoSQLite).all()

                # 2. Migrate them to MariaDB in bulk
                for video in sqlite_videos:
                    # De-duplicate by the remote video ID
                    existing_video_id = mysql_session.execute(
                        text("SELECT id FROM wz_video WHERE video_remote_id = :video_remote_id LIMIT 1"),
                        {'video_remote_id': video.aliyun_video_id}
                    )
                    if existing_video_id.first():
                        self.logger.info(f"Video already exists in the remote database: {video.title}")
                        continue

                    # Map to the wz_video table
                    wz_video = {
                        'cid': 1,
                        'title': video.title or '',
                        'css': '',
                        'thumb': video.thumbnail_url or '',
                        'keywords': '',
                        'remark': video.description or '',
                        'block': 0,
                        'url': '',
                        'status': 9,
                        'route': 0,
                        'publisher': 'spider',
                        'addtime': int(time.time()),
                        'updatetime': int(time.time()),
                        'area': '1',
                        'category': '1',
                        'theme': 0,
                        'year': '2025',
                        'video_remote_id': video.aliyun_video_id or '',
                        'video_url': '',
                        'video_list': video.video_list or 0,
                        'month': '1'
                    }
                    # Map to the wz_video_data table
                    wz_video_data = {
                        'id': None,  # set after the wz_video insert
                        'content': '',
                        'coin': 0,
                        'groups': '',
                        'pagetype': 0,
                        'maxchars': 0,
                        'template': '',
                        'allowcomment': 1,
                        'relation': ''
                    }

                    # Insert into wz_video and fetch the new row ID
                    result = mysql_session.execute(
                        text("""INSERT INTO wz_video (
                            cid, title, css, thumb, keywords, remark, block, url, status, route,
                            publisher, addtime, updatetime, area, category, theme, year,
                            video_remote_id, video_url, video_list, month
                        ) VALUES (
                            :cid, :title, :css, :thumb, :keywords, :remark, :block, :url, :status, :route,
                            :publisher, :addtime, :updatetime, :area, :category, :theme, :year,
                            :video_remote_id, :video_url, :video_list, :month
                        )"""),
                        wz_video
                    )
                    video_id = result.lastrowid

                    # Set the wz_video_data ID and insert it
                    wz_video_data['id'] = video_id
                    mysql_session.execute(
                        text("""INSERT INTO wz_video_data (
                            id, content, coin, groups, pagetype, maxchars, template, allowcomment, relation
                        ) VALUES (
                            :id, :content, :coin, :groups, :pagetype, :maxchars, :template, :allowcomment, :relation
                        )"""),
                        wz_video_data
                    )

                mysql_session.commit()
                self.logger.info(f"Successfully migrated {len(sqlite_videos)} video records to the online database")
        except Exception as e:
            self.logger.error(f"Data migration failed: {str(e)}")
            raise
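

# A minimal sketch (illustrative, not part of the crawl flow) of running the SQLite ->
# MariaDB migration step on its own, assuming the Scrapy project settings can be loaded
# via scrapy.utils.project.get_project_settings:
if __name__ == "__main__":
    from scrapy.utils.project import get_project_settings

    logging.basicConfig(level=logging.INFO)
    MariaDBPipeline(get_project_settings()).migrate_data()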