You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Ru/c/PY1/JAV目录.py

251 lines
8.8 KiB

# -*- coding: utf-8 -*-
import re
import sys
from pyquery import PyQuery as pq
from base64 import b64decode, b64encode
from requests import Session
sys.path.append('..')
from base.spider import Spider
class Spider(Spider):
    """Spider for javmenu.com: categories, search, detail pages and play URLs.

    Pages are fetched through a ``requests`` session and parsed with PyQuery.
    Play URLs are stored base64-encoded in ``vod_play_url`` (see ``e64``) and
    decoded again in ``playerContent`` (see ``d64``).
    """

    host = "https://javmenu.com"

    # Browser-like request headers (the User-Agent advertises Chrome 133 on
    # Windows).  ``init`` adds a ``referer`` entry on a per-instance copy.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'sec-ch-ua': '"Not(A:Brand";v="99", "Google Chrome";v="133", "Chromium";v="133"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-full-version': '"133.0.6943.98"',
        'sec-ch-ua-arch': '"x86"',
        'sec-ch-ua-platform': '"Windows"',
        'sec-ch-ua-platform-version': '"19.0.0"',
        'sec-ch-ua-model': '""',
        'sec-ch-ua-full-version-list': '"Not(A:Brand";v="99.0.0.0", "Google Chrome";v="133.0.6943.98", "Chromium";v="133.0.6943.98"',
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'sec-fetch-site': 'none',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '?1',
        'sec-fetch-dest': 'document',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'priority': 'u=0, i',
    }

    # Substrings that mark an <img> URL as a watermark, placeholder or
    # loading image rather than a real thumbnail.
    _LIST_IMAGE_BLOCKLIST = ('button_logo', 'no_preview', 'loading.gif', 'loading.png')
    # Detail pages additionally skip the "website_building" placeholder.
    _COVER_IMAGE_BLOCKLIST = _LIST_IMAGE_BLOCKLIST + ('website_building',)

    # Absolute http(s) URL containing ".m3u8", terminated by whitespace,
    # a quote or an angle bracket.
    _M3U8_RE = re.compile(r'https?://[^\s"\'<>]+\.m3u8[^\s"\'<>]*')

    def init(self, extend=""):
        """Create the HTTP session used by every page request.

        ``extend`` is accepted for interface compatibility and is not used.
        """
        # Build a per-instance copy so the class-level ``headers`` dict, which
        # is shared by all instances, is never mutated.
        self.headers = {**self.headers, 'referer': f'{self.host}/'}
        self.session = Session()
        self.session.headers.update(self.headers)

    def getName(self):
        """Return the display name of this spider."""
        return "JAV目录大全"

    def isVideoFormat(self, url):
        """Not implemented; always returns ``None``."""

    def manualVideoCheck(self):
        """Not implemented; always returns ``None``."""

    def destroy(self):
        """Close the HTTP session, if ``init`` created one."""
        session = getattr(self, 'session', None)
        if session is not None:
            session.close()

    # -------------------- Public content interface --------------------

    def homeContent(self, filter):
        """Return the fixed list of categories as ``{'class': [...]}``.

        Each entry maps a display name (``type_name``) to the site path of
        the category (``type_id``).  ``filter`` is not used.
        """
        cateManual = {
            "FC2在线": "/zh/fc2/online",
            "成人动画": "/zh/hanime/online",
            "国产在线": "/zh/chinese/online",
            "有码在线": "/zh/censored/online",
            "无码在线": "/zh/uncensored/online",
            "欧美在线": "/zh/western/online",
        }
        classes = [{'type_name': k, 'type_id': v} for k, v in cateManual.items()]
        return {'class': classes}

    def homeVideoContent(self):
        """Return the videos listed on the ``/zh`` landing page."""
        data = self.getpq("/zh")
        return {'list': self.getlist(data(".video-list-item"))}

    def categoryContent(self, tid, pg, filter, extend):
        """Return one page of a category listing.

        Args:
            tid: Category path as produced by ``homeContent`` (``type_id``).
            pg: Page number; may be a string or an int.
            filter: Not used.
            extend: Not used.

        Returns:
            A dict with the parsed ``list`` and the requested ``page``.
            ``pagecount``, ``limit`` and ``total`` are hard-coded values; they
            are not read from the page.
        """
        # The first page is requested without a ``page`` query parameter.
        params = None if str(pg) == '1' else {'page': pg}
        data = self.getpq(tid, params)
        return {
            'list': self.getlist(data(".video-list-item")),
            'page': pg,
            'pagecount': 9999,
            'limit': 90,
            'total': 999999,
        }

    def detailContent(self, ids):
        """Return the detail record for the first id in ``ids``.

        Args:
            ids: Sequence of video ids; only ``ids[0]`` is used.  An id is
                either a site-relative path or an absolute URL.

        Returns:
            ``{'list': [vod]}``, or ``{'list': []}`` when ``ids`` is empty.
        """
        if not ids:
            return {'list': []}

        vod_id = ids[0]
        if vod_id.startswith('http'):
            url = vod_id
            # Store the id without the host part.
            vod_id = vod_id.replace(self.host, '')
        else:
            url = f"{self.host}{vod_id}"

        data = self.getpq(url)
        vod = {
            'vod_id': vod_id,
            # Fall back to the <title> text (up to the first " - ") when the
            # page has no <h1> text.
            'vod_name': data('h1').text() or data('title').text().split(' - ')[0],
            'vod_pic': self.getCover(data),
            'vod_content': data('.card-text').text() or '',
            'vod_director': '',
            'vod_actor': self.getActors(data),
            'vod_area': '日本',
            'vod_year': self.getYear(data('.text-muted').text()),
            'vod_remarks': self.getRemarks(data),
            'vod_play_from': 'JAV在线',
            'vod_play_url': self.getPlaylist(data, url),
        }
        return {'list': [vod]}

    def searchContent(self, key, quick, pg="1"):
        """Search the site for ``key`` and return the matching videos.

        Args:
            key: Search keyword; it is URL-encoded by the HTTP layer.
            quick: Not used.
            pg: Page number of the result list.
        """
        data = self.getpq("/zh/search", {'wd': key, 'page': pg})
        return {'list': self.getlist(data(".video-list-item")), 'page': pg}

    def playerContent(self, flag, id, vipFlags):
        """Decode a play id produced by ``getPlaylist`` into a player payload.

        ``flag`` and ``vipFlags`` are not used.
        """
        return {'parse': 0, 'url': self.d64(id), 'header': self.headers}

    # -------------------- Private helpers --------------------

    def getlist(self, data):
        """Convert the selected list items into video summary dicts.

        Items are skipped when they have no link, when the link does not
        contain ``/zh/``, or when no name can be found.
        """
        vlist = []
        for item in data.items():
            link = item('a').attr('href')
            if not link or '/zh/' not in link:
                continue
            # Ids are kept site-relative.
            if link.startswith(self.host):
                link = link.replace(self.host, '')
            name = item('.card-title').text() or item('img').attr('alt') or ''
            if not name:
                continue
            vlist.append({
                'vod_id': link,
                'vod_name': name.split(' - ')[0].strip(),
                'vod_pic': self.getListPicture(item),
                'vod_remarks': (item('.text-muted').text() or '').strip(),
                'style': {'ratio': 1.5, 'type': 'rect'},
            })
        return vlist

    @staticmethod
    def _first_image(node, blocked):
        """Return the first usable image URL under ``node``, or ``''``.

        ``data-src`` is preferred over ``src`` so lazily loaded images yield
        their real URL.  URLs containing any substring in ``blocked`` are
        skipped.
        """
        for img in node('img').items():
            pic = img.attr('data-src') or img.attr('src')
            if pic and not any(word in pic for word in blocked):
                return pic
        return ''

    def getListPicture(self, item):
        """Return the thumbnail URL of a list item, or ``''`` if none is usable."""
        return self._first_image(item, self._LIST_IMAGE_BLOCKLIST)

    def getCover(self, data):
        """Return the cover image URL of a detail page, or ``''`` if none is usable."""
        return self._first_image(data, self._COVER_IMAGE_BLOCKLIST)

    def getActors(self, data):
        """Return a comma-separated actor string, or ``'未知'`` when empty.

        Every whitespace-separated word of the ``<h1>`` text after the first
        one is taken as an actor name, followed by the text of each link whose
        href contains ``/actor/`` that is not already in the list.
        """
        actors = []
        h1_text = data('h1').text()
        if h1_text:
            actors.extend(h1_text.strip().split()[1:])
        for actor_link in data('a[href*="/actor/"]').items():
            actor_text = actor_link.text()
            if actor_text and actor_text not in actors:
                actors.append(actor_text)
        return ','.join(actors) if actors else '未知'

    def getYear(self, date_str):
        """Return the year of the first ``YYYY-MM-DD`` date in ``date_str``, or ``''``."""
        m = re.search(r'(\d{4})-\d{2}-\d{2}', date_str or '')
        return m.group(1) if m else ''

    def getRemarks(self, data):
        """Return the distinct ``.badge`` texts joined by spaces.

        Duplicates are dropped while the order of first appearance is kept,
        so the result is the same on every run.
        """
        tags = [tag.text() for tag in data('.badge').items() if tag.text()]
        return ' '.join(dict.fromkeys(tags))

    def getPlaylist(self, data, url):
        """Build the ``vod_play_url`` string for a detail page.

        Candidate URLs are collected, in this order, from ``<source src>``,
        ``<video src>`` and m3u8 links found in the text of ``<script>``
        elements.  Duplicates are dropped.  Each entry is formatted as
        ``线路N$<base64 url>`` and entries are joined with ``#``.

        Args:
            data: Parsed detail page.
            url: Page URL, used as the single entry when no candidate is found.
        """
        candidates = [node.attr('src') for node in data('source').items()]
        candidates += [node.attr('src') for node in data('video').items()]
        candidates += self._M3U8_RE.findall(data('script').text())

        # Drop missing values and duplicates, keeping first-seen order.
        unique = list(dict.fromkeys(u for u in candidates if u))
        if not unique:
            return f"在线播放${self.e64(url)}"
        return '#'.join(
            f"线路{index}${self.e64(u)}" for index, u in enumerate(unique, 1)
        )

    def getpq(self, path='', params=None):
        """Fetch a page and return it as a PyQuery document.

        Args:
            path: Absolute URL, or a path that is appended to ``self.host``.
            params: Optional mapping of query parameters; passed to the
                session so values are URL-encoded.

        Returns:
            The parsed page.  On any error the error is printed and an empty
            ``<html>`` document is returned, so callers' selectors simply
            match nothing.
        """
        url = path if path.startswith('http') else f'{self.host}{path}'
        try:
            rsp = self.session.get(url, params=params, timeout=20)
            rsp.encoding = 'utf-8'
            return pq(rsp.text)
        except Exception as e:
            print(f"getpq error: {e}")
            # A minimal document is used as the fallback; an empty string is
            # not a parseable document.
            return pq('<html></html>')

    def e64(self, text):
        """Return ``text`` base64-encoded (UTF-8), or ``''`` if it cannot be encoded."""
        try:
            return b64encode(text.encode('utf-8')).decode('utf-8')
        except (AttributeError, TypeError, ValueError):
            return ''

    def d64(self, encoded_text):
        """Return the UTF-8 text of a base64 string, or ``''`` if it cannot be decoded."""
        try:
            return b64decode(encoded_text.encode('utf-8')).decode('utf-8')
        except (AttributeError, TypeError, ValueError):
            return ''