# -*- coding: utf-8 -*-
"""CatVod/TVBox-style spider for https://bad.news.

Reconstructed from a whitespace-mangled source: the original file had all
newlines stripped and was truncated inside ``parse_list`` (mid-regex).
Every complete definition is restored verbatim; the truncation point is
marked with a TODO below.
"""
import re

from base.spider import Spider


class Spider(Spider):
    # NOTE: shadowing the imported base name `Spider` is the established
    # convention for CatVod/TVBox spider plug-ins — do not rename.

    def __init__(self):
        self.name = 'Bad.news'
        self.host = 'https://bad.news'
        # Mobile UA + Referer/Origin matching the host, so the site serves
        # the mobile markup that parse_list's regexes expect.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Mobile Safari/537.36',
            'Referer': self.host + '/',
            'Origin': self.host,
            'Accept-Language': 'zh-CN,zh;q=0.9'
        }

    def getName(self):
        """Return the display name of this spider."""
        return self.name

    def init(self, extend=""):
        # No extra configuration is consumed by this spider.
        pass

    # =========================
    # Home categories
    # =========================
    def homeContent(self, filter):
        """Return the fixed category list; type_id is the site path suffix."""
        return {
            'class': [
                {'type_id': '', 'type_name': '新出品'},
                {'type_id': '/dm', 'type_name': 'H动漫'},
                {'type_id': '/av/release', 'type_name': '日本AV'},
                {'type_id': '/tag/long-porn', 'type_name': '长视频'}
            ]
        }

    def homeVideoContent(self):
        # Home feed is simply page 1 of the root ('' = 新出品) category.
        # NOTE(review): categoryContent is defined in a part of the file
        # lost to truncation — confirm its signature when restoring.
        return self.categoryContent('', '1', False, {})

    # =========================
    # List parsing
    # =========================
    def parse_list(self, html):
        """Extract video entries from a listing page's HTML.

        Returns a list of dicts with vod_id / vod_name / vod_pic /
        vod_remarks keys. Entries whose title contains a black-listed
        keyword (site ads / job spam) are skipped.
        """
        videos = []
        # Keyword black-list used to drop non-video promo/job posts.
        black_list = ['热点', '招聘', '20k', '工作制', '双休', '远程', '月薪']

        # 1. Parse the waterfall/grid cards (p1): anchor href + title,
        # thumbnail from lazy-load attribute or <video poster>.
        p1 = re.findall(
            r'href="([^"]+)"[^>]*title="([^"]+)"[^>]*(?:data-echo-background|poster)="([^"]+)"',
            html,
            re.S
        )
        for path, title, pic in p1:
            # Skip entries whose title matches any black-listed keyword.
            if any(word in title for word in black_list):
                continue
            # Only site-relative links are real detail pages.
            if path.startswith('/'):
                videos.append({
                    'vod_id': path,
                    'vod_name': title.strip(),
                    'vod_pic': pic.split('?')[0],  # strip resize/query params
                    'vod_remarks': ''
                })

        # 2. Parse the table feed (p2).
        # TODO(review): the original source was truncated here, mid-way
        # through `p2 = re.findall(r'` — the pattern and the rest of the
        # method were lost. Restore the table-feed parsing from upstream
        # before relying on this method for table-style listing pages.
        return videos