# -*- coding: utf-8 -*-
"""TVBox-style spider source for the "AV研究所" video site."""
import re
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# TLS verification is disabled on every request (verify=False below),
# so silence the resulting InsecureRequestWarning noise up front.
requests.packages.urllib3.disable_warnings()
import base64
from urllib.parse import urljoin, unquote

from base.spider import Spider


class Spider(Spider):  # NOTE: intentionally shadows the base class name (loader convention)
    def getName(self):
        """Return the display name of this source."""
        return "AV研究所"

    def init(self, extend=""):
        """Set up the site URL, request headers and a retrying HTTP session.

        :param extend: optional loader-supplied extension string, passed to the base.
        """
        super().init(extend)
        self.site_url = "https://xn--cdn0308-1yjs01cc-rf0zn60cta5031amw9i.yanjiusuo0038.top"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Linux; Android 10; Mobile) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Mobile Safari/537.36",
            "Referer": self.site_url,
            "Accept-Language": "zh-CN,zh;q=0.9"
        }
        self.sess = requests.Session()
        # Retry transient 5xx responses up to 3 times with exponential backoff.
        retry = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
        self.sess.mount("https://", HTTPAdapter(max_retries=retry))
        self.sess.mount("http://", HTTPAdapter(max_retries=retry))
        self.page_size = 20
        self.total = 9999

    def fetch(self, url, timeout=10):
        """GET *url* with the shared session.

        Returns the Response, or None on any network/SSL failure
        (best-effort by design — callers must handle None).
        """
        try:
            return self.sess.get(url, headers=self.headers, timeout=timeout, verify=False)
        except Exception:
            return None

    def _abs(self, u):
        """Resolve a possibly scheme-less or relative URL *u* against the site root."""
        if not u:
            return ""
        if u.startswith("//"):
            return "https:" + u
        if u.startswith(("http://", "https://")):
            return u
        return self.site_url + (u if u.startswith("/") else "/" + u)

    def _clean(self, s):
        """Strip HTML tags from *s* and collapse all runs of whitespace to one space."""
        if not s:
            return ""
        s = re.sub(r"<[^>]+>", "", s, flags=re.S)
        return re.sub(r"\s+", " ", s).strip()

    def homeContent(self, filter):
        """Return the fixed category list shown on the home screen."""
        # Could be scraped dynamically from the site; fixed categories for now.
        cate_list = [
            {"type_name": "最新", "type_id": "latest-insert"},
            {"type_name": "最近发布", "type_id": "recent-release"},
            {"type_name": "评分榜", "type_id": "top-rating"},
            {"type_name": "收藏榜", "type_id": "top-favorites"},
        ]
        return {"class": cate_list}

    def _parse_video_list(self, html):
        video_list = []
        # match