You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Ru/c/PY1/avjoy.py

392 lines
16 KiB

# -*- coding: utf-8 -*-
import json
import re
import requests
from pyquery import PyQuery as pq
import sys
sys.path.append('..')
from base.spider import Spider
class Spider(Spider):
host = 'https://cn.avjoy.me'
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
'referer': 'https://cn.avjoy.me/',
'origin': 'https://cn.avjoy.me',
}
def init(self, extend=''):
self.proxies = json.loads(extend).get('proxy', {}) if extend else {}
self.session = requests.Session()
self.session.headers.update(self.headers)
def getName(self):
return "hohoj"
def fetch(self, url, params=None):
try:
resp = self.session.get(url, headers=self.session.headers, params=params,
proxies=self.proxies, timeout=10, allow_redirects=True)
return resp.text
except:
return ''
def fetch_resp(self, url, params=None, extra_headers=None, stream=False):
try:
hdrs = self.session.headers.copy()
if extra_headers:
hdrs.update(extra_headers)
return self.session.get(url, headers=hdrs, params=params,
proxies=self.proxies, timeout=10,
allow_redirects=True, stream=stream)
except Exception:
return None
def homeContent(self, filter):
html = self.fetch(self.host)
return {
'class': [
{'type_name': '最新上传视频', 'type_id': 'videos'},
{'type_name': '视频', 'type_id': 'videos'},
{'type_name': '类别', 'type_id': 'categories'},
{'type_name': '标签', 'type_id': 'tags'}
],
'filters': self.get_filters(),
'list': self.parse_videos_from_list_html(pq(html))
}
def get_filters(self):
return {}
def categoryContent(self, tid, pg, filter, extend):
norm = tid.lstrip('/') if not tid.startswith('http') else tid
if '?' in norm and not norm.startswith('http'):
norm = norm.split('?', 1)[0]
url = f"{self.host}/{norm}" if not norm.startswith('http') else norm
params = (extend or {}).copy()
try:
if int(pg) > 1:
params['page'] = pg
except:
pass
params.pop('o', None)
html = self.fetch(url, params)
doc = pq(html)
m_cur = re.search(r"current_url\s*=\s*\"([^\"]+)\"", html)
if m_cur:
base_path = m_cur.group(1)
if base_path.startswith('/videos/') or base_path.startswith('/search/videos/'):
url = f"{self.host}{base_path}"
html = self.fetch(url, params)
doc = pq(html)
def uniq_append(items, entry):
key = (entry.get('vod_id'), entry.get('vod_name'))
if key and key not in {(i.get('vod_id'), i.get('vod_name')) for i in items}:
items.append(entry)
if tid == 'categories':
items = []
for card in doc('div.content-left .row.content-row > div').items():
a = card.find('a').eq(0)
href = (a.attr('href') or '').strip()
name = card.find('.category-title .title-truncate').text().strip()
pic = (card.find('.thumb-overlay img').attr('src') or '').strip()
if href and name and href.startswith('/videos/'):
cat_id = href.lstrip('/')
if pic and pic.startswith('/'):
pic = f"{self.host}{pic}"
uniq_append(items, {
'vod_id': cat_id,
'vod_name': name,
'vod_pic': pic,
'vod_tag': 'folder',
'style': {"type": "rect", "ratio": 1.1}
})
for a in doc('.dropdown-menu.multi-column-dropdown a').items():
href = (a.attr('href') or '').strip()
name = a.text().strip()
if href.startswith('/videos/') and name:
uniq_append(items, {
'vod_id': href.lstrip('/'),
'vod_name': name,
'vod_pic': '',
'vod_tag': 'folder',
'style': {"type": "rect", "ratio": 1.1}
})
return {
'list': items,
'page': '1',
'pagecount': 1,
'limit': 90,
'total': len(items)
}
if tid == 'tags':
items = []
for a in doc('.popular-tag a').items():
name = a.text().strip()
href = (a.attr('href') or '').strip()
if href.startswith('/search/videos/') and name:
uniq_append(items, {
'vod_id': href.lstrip('/'),
'vod_name': name,
'vod_tag': 'folder',
'style': {"type": "rect", "ratio": 1.0}
})
for a in doc('.trending-searches a').items():
name = a.text().strip()
href = (a.attr('href') or '').strip()
if href.startswith('/search/videos/') and name:
uniq_append(items, {
'vod_id': href.lstrip('/'),
'vod_name': name,
'vod_tag': 'folder',
'style': {"type": "rect", "ratio": 1.0}
})
return {
'list': items,
'page': '1',
'pagecount': 1,
'limit': 90,
'total': len(items)
}
videos = self.parse_videos_from_list_html(doc)
if not videos:
fallback = []
for a in doc('a[href^="/video/"]').items():
href = a.attr('href')
title = a.text().strip()
img = a.parents().find('img').eq(0).attr('src')
if href and title:
uniq_append(fallback, {
'vod_id': href,
'vod_name': title,
'vod_pic': img,
'style': {"type": "rect", "ratio": 1.5}
})
videos = fallback
pagecount = 1
try:
pagecount = doc('.pagination a').length or 1
except:
pagecount = 1
return {
'list': videos,
'page': pg,
'pagecount': pagecount,
'limit': 90,
'total': 999999
}
def detailContent(self, ids):
vid = ids[0]
url = f"{self.host}{vid}" if vid.startswith('/') else f"{self.host}/{vid}"
html = self.fetch(url)
data = pq(html)
title = data('h1').text() or data('title').text() or ''
title = re.sub(r'\s*HoHoJ.*$', '', title)
title = re.sub(r'\s*\|.*$', '', title)
title = title.strip()
poster = data('video#video').attr('poster') or data('meta[property="og:image"]').attr('content')
vod_year = data('.info span').eq(-1).text()
m_vid = re.search(r"video_id\s*=\s*\"(\d+)\"", html)
video_id = m_vid.group(1) if m_vid else ''
if not video_id:
m_url_id = re.search(r"/video/(\d+)", url) or re.search(r"/video/(\d+)", html)
video_id = m_url_id.group(1) if m_url_id else ''
m_vkey = re.search(r"/embed/([a-zA-Z0-9]+)", html)
vkey = m_vkey.group(1) if m_vkey else ''
play_id = video_id or vkey
vod = {
'vod_id': vid,
'vod_name': title,
'vod_play_from': '撸出血',
'vod_play_url': f"{title}${play_id or ''}",
'vod_pic': poster,
'vod_year': vod_year,
}
tags = []
for tag in data('a.tag').items():
name = tag.text().strip()
href = tag.attr('href')
if name and href:
tags.append(f'[a=cr:{json.dumps({"id": href, "name": name})}/]{name}[/a]')
if tags:
vod['vod_content'] = ' '.join(tags)
director_name = data('a[href^="/user/"]').text().strip()
if director_name:
try:
from urllib.parse import quote
director_href = f"/search/videos/{quote(director_name)}"
except:
director_href = f"/search/videos/{director_name}"
director_link = f"[a=cr:{json.dumps({'id': director_href, 'name': director_name})}/]{director_name}[/a]"
vod['vod_content'] = (vod.get('vod_content', '') + ('\n' if vod.get('vod_content') else '') + '导演:' + director_link)
intro = (data('section.video-description').text() or '').strip()
if not intro:
intro = (data('meta[name="description"]').attr('content') or '').strip()
if intro:
vod['vod_content'] = (vod.get('vod_content', '') + ('\n' if vod.get('vod_content') else '') + '影片介绍:' + intro)
return {'list': [vod]}
def searchContent(self, key, quick, pg="1"):
params = {}
try:
if int(pg) > 1:
params['page'] = pg
except:
pass
url = f"{self.host}/search/videos/{requests.utils.quote(key)}"
html = self.fetch(url, params)
if not html:
html = self.fetch(f"{self.host}/search", {'text': key, **params})
return {'list': self.parse_videos_from_list_html(pq(html)), 'page': pg}
def playerContent(self, flag, id, vipFlags):
def pick_best_source(html_text):
sources = []
for m in re.finditer(r"<source[^>]+src=\"([^\"]+)\"[^>]*>", html_text):
frag = html_text[m.start():m.end()]
src = m.group(1)
res_m = re.search(r"res=\'?(\d+)\'?", frag)
label_m = re.search(r"label=\'([^\']+)\'", frag)
res = int(res_m.group(1)) if res_m else 0
label = label_m.group(1) if label_m else ''
sources.append((res, label, src))
if sources:
sources.sort(reverse=True)
return sources[0][2]
mv = re.search(r"<video[^>]+src=\"([^\"]+)\"", html_text)
if mv:
return mv.group(1)
mv2 = re.search(r"var\s+videoSrc\s*=\s*[\"']([^\"']+)[\"']", html_text)
if mv2:
return mv2.group(1)
doc = pq(html_text)
return doc('video source').attr('src') or doc('video').attr('src') or ''
raw = str(id).strip()
if re.match(r'^https?://', raw) and self.isVideoFormat(raw):
return {
'parse': 0,
'url': raw,
'header': {
'user-agent': self.headers['user-agent'],
'referer': self.host,
'origin': self.host,
}
}
m = re.search(r"/video/(\d+)", raw) or re.search(r"id=(\d+)", raw)
if m:
raw = m.group(1)
is_numeric = re.match(r"^\d+$", raw) is not None
video_url = ''
referer_used = ''
if is_numeric:
for path in [f"{self.host}/video/{raw}", f"{self.host}/video/{raw}/"]:
self.session.headers['referer'] = path
play_html = self.fetch(path)
video_url = pick_best_source(play_html)
if video_url:
referer_used = path
break
m_dl = re.search(r"href=\"(/download\\.php\\?id=\\d+[^\"]*label=1080p)\"", play_html)
if not m_dl:
m_dl = re.search(r"href=\"(/download\\.php\\?id=\\d+[^\"]*)\"", play_html)
if m_dl:
dl_url = f"{self.host}{m_dl.group(1)}"
resp = self.fetch_resp(dl_url, extra_headers={'referer': path}, stream=True)
if resp and resp.ok:
resp.close()
video_url = resp.url
referer_used = path
break
if not video_url:
embed_url = f"{self.host}/embed/{raw}" if not is_numeric else f"{self.host}/embed?id={raw}"
self.session.headers['referer'] = embed_url
html = self.fetch(embed_url)
v2 = pick_best_source(html)
if v2:
video_url = v2
referer_used = embed_url
return {
'parse': 0,
'url': video_url or '',
'header': {
'user-agent': self.headers['user-agent'],
'referer': referer_used or self.host,
'origin': self.host,
}
}
def parse_videos_from_list_html(self, doc: pq):
videos = []
for item in doc('.row.content-row > div').items():
link = item.find('a').eq(0).attr('href')
img = item.find('.thumb-overlay img').eq(0).attr('src')
info = item.find('.content-info').eq(0)
title = info.find('.content-title').text().strip()
duration = (item.find('.video-duration, .thumb-overlay .duration, .content-duration, .duration').eq(0).text() or '').strip()
overlay_text = (item.find('.thumb-overlay').text() or '').strip()
hd_flag = bool(item.find('.hd, .icon-hd, .hd-icon, .badge-hd, .label-hd').length) or ('HD' in overlay_text)
if not link or not title:
continue
parts = []
if hd_flag:
parts.append('HD')
if duration:
parts.append(duration)
remarks = ''.join(parts)
videos.append({
'vod_id': link,
'vod_name': re.sub(r'\s*\|.*$', '', re.sub(r'\s*HoHoJ.*$', '', title)).strip(),
'vod_pic': img,
'vod_remarks': remarks or '',
'vod_tag': '',
'style': {"type": "rect", "ratio": 1.5}
})
if not videos:
for info in doc('.content-info').items():
a = info('a').eq(0)
link = a.attr('href')
title = info('.content-title').text().strip()
if not link or not title:
continue
img = info.prev('a').find('img').attr('src') or info.prevAll('a').eq(0).find('img').attr('src')
duration = (info.parents().find('.video-duration, .thumb-overlay .duration, .content-duration, .duration').eq(0).text() or '').strip()
overlay_text = (info.parents().find('.thumb-overlay').text() or '').strip()
hd_flag = bool(info.parents().find('.hd, .icon-hd, .hd-icon, .badge-hd, .label-hd').length) or ('HD' in overlay_text)
parts = []
if hd_flag:
parts.append('HD')
if duration:
parts.append(duration)
remarks = ''.join(parts)
videos.append({
'vod_id': link,
'vod_name': re.sub(r'\s*\|.*$', '', re.sub(r'\s*HoHoJ.*$', '', title)).strip(),
'vod_pic': img,
'vod_remarks': remarks or '',
'vod_tag': '',
'style': {"type": "rect", "ratio": 1.5}
})
return videos
def isVideoFormat(self, url):
return bool(url) and (url.lower().endswith('.mp4') or url.lower().endswith('.m3u8'))
def manualVideoCheck(self):
pass
def destroy(self):
pass
def homeVideoContent(self):
pass
def localProxy(self, param):
pass
def liveContent(self, url):
pass