# -*- coding: utf-8 -*-
"""Spider plugin that scrapes listings, details and play URLs from bad.news."""
import re
from urllib.parse import quote

from base.spider import Spider


class Spider(Spider):
    """Scraper for https://bad.news built on the project's ``Spider`` base.

    The class name intentionally shadows the imported base class; that is how
    the original file declares it and callers rely on the name ``Spider``.
    """

    # Titles containing any of these substrings are treated as ads / job
    # postings and dropped from listings.
    _BLACKLIST = ('热点', '招聘', '20k', '工作制', '双休', '远程', '月薪')

    def __init__(self):
        self.name = 'Bad.news'
        self.host = 'https://bad.news'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Mobile Safari/537.36',
            'Referer': self.host + '/',
            'Origin': self.host,
            'Accept-Language': 'zh-CN,zh;q=0.9',
        }

    def getName(self):
        """Return the display name of this spider."""
        return self.name

    def init(self, extend=""):
        """Framework hook; this spider needs no extra initialisation."""
        pass

    # =========================
    # Home page categories
    # =========================
    def homeContent(self, filter):
        """Return the static category list shown on the home page.

        ``filter`` is accepted for interface compatibility and is not used.
        """
        return {
            'class': [
                {'type_id': '', 'type_name': '新出品'},
                {'type_id': '/dm', 'type_name': 'H动漫'},
                {'type_id': '/av/release', 'type_name': '日本AV'},
                {'type_id': '/tag/long-porn', 'type_name': '长视频'},
            ]
        }

    def homeVideoContent(self):
        """Return the first page of the default (empty ``tid``) category."""
        return self.categoryContent('', '1', False, {})

    # =========================
    # Listing parser
    # =========================
    def _is_blacklisted(self, title):
        """Return True when ``title`` contains any blacklisted keyword."""
        return any(word in title for word in self._BLACKLIST)

    def parse_list(self, html):
        """Extract video entries from a listing page.

        Args:
            html: Raw HTML text of a listing / search page.

        Returns:
            A list of dicts with keys ``vod_id`` (site-relative path),
            ``vod_name``, ``vod_pic`` (query string removed) and
            ``vod_remarks`` (always empty).
        """
        videos = []

        # 1. Waterfall layout: link, title attribute and a cover image taken
        #    from either data-echo-background or poster.
        waterfall = re.findall(
            r'href="([^"]+)"[^>]*title="([^"]+)"[^>]*(?:data-echo-background|poster)="([^"]+)"',
            html, re.S
        )
        for path, title, pic in waterfall:
            if self._is_blacklisted(title):
                continue
            if path.startswith('/'):
                videos.append({
                    'vod_id': path,
                    'vod_name': title.strip(),
                    'vod_pic': pic.split('?')[0],
                    'vod_remarks': ''
                })

        # 2. Table-style feed.
        # NOTE(review): the two patterns below are a bare `(.*?)`, which only
        # ever matches the empty string, so every "block" is empty and this
        # pass currently adds nothing. They look like they lost their
        # surrounding HTML tags (e.g. a <table ...>...</table> wrapper and a
        # title element); restore them from the site's markup. Left untouched
        # here because the intended tags cannot be determined from this file.
        # Paths already emitted by pass 1 are tracked in a set so the
        # duplicate check does not rescan the whole result list per block.
        seen = {video['vod_id'] for video in videos}
        for block in re.findall(r'(.*?)', html, re.S):
            # Extract the title first so ads can be rejected early.
            title_m = re.search(r'(.*?)', block, re.S)
            raw_title = re.sub('<[^>]+>', '', title_m.group(1)).strip() if title_m else ''
            # Skip blocks with no title or with an ad / blacklisted title.
            if not raw_title or self._is_blacklisted(raw_title):
                continue
            link = re.search(r'href="([^"]+)"', block)
            if not link:
                continue
            path = link.group(1)
            if not path.startswith('/') or path in seen:
                continue
            seen.add(path)
            pic_m = re.search(r'poster="([^"]+)"', block)
            videos.append({
                'vod_id': path,
                'vod_name': raw_title,
                'vod_pic': pic_m.group(1).split('?')[0] if pic_m else '',
                'vod_remarks': ''
            })
        return videos

    # =========================
    # Category listing
    # =========================
    def categoryContent(self, tid, pg, filter, extend):
        """Fetch and parse one page of a category.

        Args:
            tid: Category path such as ``'/dm'``; empty string means the
                site root listing.
            pg: Page number (int or numeric string).
            filter: Unused; kept for interface compatibility.
            extend: Unused; kept for interface compatibility.

        Returns:
            Dict with ``list`` (parsed entries), ``page`` and a fixed
            ``pagecount`` of 999 (the real page count is not parsed).
        """
        pg = int(pg)
        if tid:
            url = f'{self.host}{tid}/page-{pg}'
        elif pg == 1:
            # The root listing's first page is requested without a page suffix.
            url = self.host
        else:
            url = f'{self.host}/page-{pg}'
        res = self.fetch(url, headers=self.headers)
        return {'list': self.parse_list(res.text), 'page': pg, 'pagecount': 999}

    # =========================
    # Detail page (HTML + DM split)
    # =========================
    def detailContent(self, ids):
        """Build the detail / play-source record for one video.

        Args:
            ids: Sequence whose first element is the site-relative path.

        Returns:
            ``{'list': [record]}`` for a resolved video, or ``{'list': []}``
            when a non-DM page exposes no ``data-source`` attribute.

        NOTE(review): the three patterns in this method previously had no
        tag anchor (a bare ``(.*?)`` and patterns starting with ``]+``), so
        the title was always empty and neither the iframe nor the
        data-source lookup could match real markup. They are restored here
        as ``<title>``, ``<iframe`` and "any tag carrying data-source" —
        confirm against the site's current HTML.
        """
        path = ids[0]
        url = self.host + path
        html = self.fetch(url, headers=self.headers).text

        title_m = re.search(r'<title>(.*?)</title>', html)
        # Keep only the part before the first '-' of the page title.
        title = title_m.group(1).split('-')[0].strip() if title_m else 'Bad.news'

        # ===== DM (anime) pages: hand the embedded player page to the sniffer
        if path.startswith('/dm'):
            iframe = re.search(r'<iframe[^>]+src="([^"]+)"', html)
            # Fall back to the detail page itself when no iframe is present.
            play_url = iframe.group(1) if iframe else url
            if play_url.startswith('/'):
                play_url = self.host + play_url
            return {'list': [{
                'vod_id': play_url,
                'vod_name': title,
                'vod_play_from': 'DM-Web',
                'vod_play_url': f'播放${play_url}'
            }]}

        # ===== Regular HTML video: direct media URL in a data-source attr.
        # The tag name is deliberately not pinned; any element carrying
        # data-source is accepted.
        m = re.search(r'<\w+[^>]+data-source="([^"]+)"', html)
        if m:
            return {'list': [{
                'vod_id': path,
                'vod_name': title,
                'vod_play_from': 'HTML',
                'vod_play_url': f'播放${m.group(1)}'
            }]}
        return {'list': []}

    # =========================
    # Player
    # =========================
    def playerContent(self, flag, id, vipFlags):
        """Return playback instructions for a play source.

        Args:
            flag: Play-source name as emitted by ``detailContent``
                (``'DM-Web'`` or ``'HTML'``).
            id: URL to play or to sniff. (Name shadows the builtin but is
                part of the public signature.)
            vipFlags: Unused; kept for interface compatibility.

        Returns:
            For ``'DM-Web'``: a dict requesting parse + sniff with request
            headers and include/exclude rules. Otherwise a direct-play dict.
        """
        # DM pages are resolved by sniffing for media URLs.
        if flag == 'DM-Web':
            headers = {
                'User-Agent': self.headers['User-Agent'],
                'Referer': self.host + '/',
                'Origin': self.host,
                'Range': 'bytes=0-'
            }
            return {
                'parse': 1,
                'sniff': 1,
                'url': id,
                'header': headers,
                'sniff_include': ['.mp4', '.m3u8'],
                'sniff_exclude': [
                    '.html', '.js', '.css',
                    '.jpg', '.png', '.gif',
                    'google', 'facebook',
                    'doubleclick', 'analytics',
                    'ads', 'tracker'
                ]
            }
        # HTML sources are played directly.
        return {'parse': 0, 'url': id}

    # =========================
    # Search
    # =========================
    def searchContent(self, key, quick, pg="1"):
        """Search the site for ``key`` and return parsed results.

        Args:
            key: Search keyword supplied by the user.
            quick: Unused; kept for interface compatibility.
            pg: Unused; only the first result page is requested.

        Returns:
            ``{'list': [...]}`` with entries from ``parse_list``.
        """
        # Percent-encode the keyword so characters such as '/', '?', '#' or
        # spaces cannot alter the request path.
        url = f'{self.host}/search/q-{quote(str(key), safe="")}'
        res = self.fetch(url, headers=self.headers)
        return {'list': self.parse_list(res.text)}