import re
import requests
import json
import os
import time
import random
from urllib.parse import urlparse, parse_qs, urljoin
from typing import Dict, List, Optional, Tuple, Any
import logging
from concurrent.futures import ThreadPoolExecutor
import hashlib
# Configure logging at import time: INFO threshold, with each record rendered
# as "timestamp - level - message". basicConfig is a no-op when the root
# logger already has handlers installed.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Logger named after this module.
logger = logging.getLogger(__name__)
class UniversalVideoSpider:
"""万能通用视频爬虫"""
def __init__(self):
self.session = requests.Session()
self.session.headers.update({
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Accept-Encoding': 'gzip, deflate',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
})
# 预定义的正则表达式模式(针对视频网站优化)
self.patterns = {
# 视频标题
'title': [
r'
]*>([^<]+)',
r']*>([^<]+)',
r'class="video-title"[^>]*>([^<]+)',
r'id="video-title"[^>]*>([^<]+)',
],
# 视频描述
'description': [
r']*>([^<]+)',
r'id="description"[^>]*>([^<]+)',
],
# 视频URL(直接视频文件)
'video_url': [
r'src="([^"]+\.(mp4|flv|avi|mov|wmv|mkv|webm|m3u8)[^"]*)"',
r'video-src="([^"]+)"',
r'data-video="([^"]+)"',
r']*src="([^"]+)"',
r'player\.load\([^{]*\{[^}]*url:\s*[\'"]([^\'"]+)[\'"]',
r'video_url:\s*[\'"]([^\'"]+)[\'"]',
r'播放地址.*?[\'"](https?://[^\'"]+\.m3u8[^\'"]*)[\'"]',
],
# 封面图片
'cover_image': [
r']*src="([^"]+)"',
r'thumbnail:\s*[\'"]([^\'"]+)[\'"]',
],
# 视频时长
'duration': [
r'duration["\']?\s*:\s*["\']?([0-9:]+)',
r'时长[::]\s*([0-9:]+)',
r']*class="duration"[^>]*>([0-9:]+)',
r'data-duration="([^"]+)"',
],
# 发布时间
'publish_time': [
r'发布时间[::]\s*([^<]+)',
r'发布于[::]\s*([^<]+)',
r'',
r'datetime="([^"]+)"',
r'publish_time["\']?\s*:\s*["\']?([^"\']+)["\']?',
],
# 播放次数
'view_count': [
r'播放[::]\s*([0-9,]+)',
r'观看[::]\s*([0-9,]+)',
r'播放量[::]\s*([0-9,]+)',
r']*class="views"[^>]*>([^<]+)',
r'view_count["\']?\s*:\s*["\']?([0-9,]+)',
],
# M3U8相关
'm3u8_url': [
r'(https?://[^\s"\'<>]+\.m3u8[^\s"\']*)',
r'var\s+url\s*=\s*["\'](https?://[^"\']+\.m3u8)["\']',
r'm3u8["\']?\s*:\s*["\'](https?://[^"\']+)["\']',
],
# JSON数据(包含视频信息)
'json_data': [
r'',
r'window\.__INITIAL_STATE__\s*=\s*({[^;]+});',
r'var\s+videoInfo\s*=\s*({[^;]+});',
],
# iframe视频嵌入
'iframe': [
r'