From 26c33ff31401b49dab3767ec8a4c3d9081335ac8 Mon Sep 17 00:00:00 2001
From: glyh
Date: Sun, 29 Sep 2024 01:04:44 +0800
Subject: [PATCH 01/10] checkpoint push: single crawler works, but __main__ broken

---
 config.yml | 24 +-
 javsp/__main__.py | 211 +++++++---------
 javsp/config.py | 6 +-
 javsp/crawlers/all.py | 30 +++
 javsp/{web => crawlers}/exceptions.py | 0
 javsp/crawlers/interface.py | 21 ++
 javsp/crawlers/proxyfree.py | 103 ++++++++
 javsp/crawlers/sites/airav.py | 124 +++++++++
 javsp/crawlers/sites/arzon.py | 105 ++++++++
 javsp/crawlers/sites/arzon_iv.py | 100 ++++++++
 javsp/crawlers/sites/avsox.py | 88 +++++++
 javsp/crawlers/sites/avwiki.py | 82 ++++++
 javsp/crawlers/sites/dl_getchu.py | 131 ++++++++++
 javsp/crawlers/sites/fanza.py | 246 ++++++++++++++++++
 javsp/crawlers/sites/fc2.py | 120 +++++++++
 javsp/crawlers/sites/fc2ppvdb.py | 92 +++++++
 javsp/crawlers/sites/gyutto.py | 106 ++++++++
 javsp/crawlers/sites/jav321.py | 117 +++++++++
 javsp/crawlers/sites/javbus.py | 129 ++++++++++
 javsp/crawlers/sites/javdb.py | 350 ++++++++++++++++++++++++++
 javsp/crawlers/sites/javlib.py | 115 +++++++++
 javsp/crawlers/sites/javmenu.py | 100 ++++++++
 javsp/crawlers/sites/mgstage.py | 127 ++++++++++
 javsp/crawlers/sites/njav.py | 150 +++++++++++
 javsp/crawlers/sites/prestige.py | 101 ++++++++
 javsp/func.py | 22 +-
 javsp/network/client.py | 43 ++++
 javsp/network/utils.py | 91 +++++++
 javsp/{web => }/translate.py | 26 +-
 javsp/web/airav.py | 118 ---------
 javsp/web/arzon.py | 100 --------
 javsp/web/arzon_iv.py | 93 -------
 javsp/web/avsox.py | 75 ------
 javsp/web/avwiki.py | 72 ------
 javsp/web/base.py | 270 --------------------
 javsp/web/dl_getchu.py | 122 ---------
 javsp/web/fanza.py | 231 -----------------
 javsp/web/fc2.py | 105 --------
 javsp/web/fc2fan.py | 80 ------
 javsp/web/fc2ppvdb.py | 76 ------
 javsp/web/gyutto.py | 87 -------
 javsp/web/jav321.py | 100 --------
 javsp/web/javbus.py | 115 ---------
 javsp/web/javdb.py | 333 ------------------------
 javsp/web/javlib.py | 141 -----------
 javsp/web/javmenu.py | 88 -------
 javsp/web/mgstage.py | 114 ---------
 javsp/web/njav.py | 134 ----------
 javsp/web/prestige.py | 83 ------
 javsp/web/proxyfree.py | 75 ------
 poetry.lock | 158 +++++++++++-
 pyproject.toml | 3 +-
 tools/config_migration.py | 13 +-
 53 files changed, 2956 insertions(+), 2790 deletions(-)
 create mode 100644 javsp/crawlers/all.py
 rename javsp/{web => crawlers}/exceptions.py (100%)
 create mode 100644 javsp/crawlers/interface.py
 create mode 100644 javsp/crawlers/proxyfree.py
 create mode 100644 javsp/crawlers/sites/airav.py
 create mode 100644 javsp/crawlers/sites/arzon.py
 create mode 100644 javsp/crawlers/sites/arzon_iv.py
 create mode 100644 javsp/crawlers/sites/avsox.py
 create mode 100644 javsp/crawlers/sites/avwiki.py
 create mode 100644 javsp/crawlers/sites/dl_getchu.py
 create mode 100644 javsp/crawlers/sites/fanza.py
 create mode 100644 javsp/crawlers/sites/fc2.py
 create mode 100644 javsp/crawlers/sites/fc2ppvdb.py
 create mode 100644 javsp/crawlers/sites/gyutto.py
 create mode 100644 javsp/crawlers/sites/jav321.py
 create mode 100644 javsp/crawlers/sites/javbus.py
 create mode 100644 javsp/crawlers/sites/javdb.py
 create mode 100644 javsp/crawlers/sites/javlib.py
 create mode 100644 javsp/crawlers/sites/javmenu.py
 create mode 100644 javsp/crawlers/sites/mgstage.py
 create mode 100644 javsp/crawlers/sites/njav.py
 create mode 100644 javsp/crawlers/sites/prestige.py
 create mode 100644 javsp/network/client.py
 create mode 100644 javsp/network/utils.py
 rename javsp/{web => }/translate.py (94%)
 delete mode 100644 javsp/web/airav.py
 delete mode 100644 javsp/web/arzon.py
 delete mode 100644 javsp/web/arzon_iv.py
 delete mode 100644 javsp/web/avsox.py
 delete mode 100644 javsp/web/avwiki.py
 delete mode 100644 javsp/web/base.py
 delete mode 100644 javsp/web/dl_getchu.py
 delete mode 100644 javsp/web/fanza.py
 delete mode 100644 javsp/web/fc2.py
 delete mode 100644 javsp/web/fc2fan.py
 delete mode 100644 javsp/web/fc2ppvdb.py
 delete mode 100644 javsp/web/gyutto.py
 delete mode 100644 javsp/web/jav321.py
 delete mode 100644 javsp/web/javbus.py
 delete mode 100644 javsp/web/javdb.py
 delete mode 100644 javsp/web/javlib.py
 delete mode 100644 javsp/web/javmenu.py
 delete mode 100644 javsp/web/mgstage.py
 delete mode 100644 javsp/web/njav.py
 delete mode 100644 javsp/web/prestige.py
 delete mode 100644 javsp/web/proxyfree.py

diff --git a/config.yml b/config.yml
index 53fac4863..7d8790195 100644
--- a/config.yml
+++ b/config.yml
@@ -25,16 +25,24 @@ network:
   # 设置代理服务器地址,支持 http, socks5/socks5h 代理,比如'http://127.0.0.1:1080'
   # null表示禁用代理
   proxy_server: null
-  # 各个站点的免代理地址。地址失效时软件会自动尝试获取新地址,你也可以手动设置
-  proxy_free:
-    avsox: 'https://avsox.click'
-    javbus: 'https://www.seedmm.help'
-    javdb: 'https://javdb368.com'
-    javlib: 'https://www.y78k.com'
   # 网络问题导致抓取数据失败时的重试次数,通常3次就差不多了
-  retry: 3
+  retries: 3
   # https://en.wikipedia.org/wiki/ISO_8601#Durations
   timeout: PT10S
+  # 对列表中的地址不使用梯子(如果启用了的话)
+  unproxied: [
+    'https://www.seedmm.help',
+    'https://javdb368.com',
+    'https://www.y78k.com',
+    'https://www.javbus.one',
+    'https://www.tellme.pw',
+  ]
+  # 各个站点的代替地址。
+  # JavSP会按顺序尝试列表里的每一个服务器,如果都不行会使用默认的主站点地址
+  fallback:
+    javbus: ['https://www.seedmm.help']
+    javdb: ['https://javdb368.com']
+    javlib: ['https://www.y78k.com']

 ################################
 crawler:
@@ -52,8 +60,6 @@ crawler:
   hardworking: true
   # 使用网页番号作为最终番号(启用时会对番号大小写等进行更正)
   respect_site_avid: true
-  # fc2fan已关站。如果你有镜像,请设置本地镜像文件夹的路径,此文件夹内要有类似'FC2-12345.html'的网页文件
-  fc2fan_local_path: null
   # 刮削一部电影后的等待时间(设置为0禁用此功能)
   # https://en.wikipedia.org/wiki/ISO_8601#Durations
   sleep_after_scraping: PT1S
diff --git a/javsp/__main__.py b/javsp/__main__.py
index 7771170e7..cf73ffd09 100644
--- a/javsp/__main__.py
+++ b/javsp/__main__.py
@@ -2,14 +2,18 @@
 import re
 import sys
 import json
+import asyncio
 import time
 import logging
 from PIL import Image
+from lxml.etree import Comment
 from pydantic import ValidationError
+from pydantic_core import Url
 from pydantic_extra_types.pendulum_dt import Duration
-import requests
 import threading
-from typing import Dict, List
+from typing import Any, Coroutine, Dict, List
+from javsp.crawlers.interface import Crawler
+from javsp.crawlers.all import crawlers

 sys.stdout.reconfigure(encoding='utf-8')

@@ -23,7 +27,7 @@

 from javsp.print import TqdmOut

-from javsp.cropper import Cropper, get_cropper
+from javsp.cropper import get_cropper


 # 将StreamHandler的stream修改为TqdmOut,以与Tqdm协同工作
@@ -41,11 +45,11 @@
 from javsp.func import *
 from javsp.image import *
 from javsp.datatype import Movie, MovieInfo
-from javsp.web.base import download
-from javsp.web.exceptions import *
-from javsp.web.translate import translate_movie_info
+from javsp.network.utils import url_download
+from javsp.crawlers.exceptions import *
+from javsp.translate import translate_movie_info

-from javsp.config import Cfg, CrawlerID
+from javsp.config import Cfg, CrawlerID, UseJavDBCover

 actressAliasMap = {}

@@ -57,86 +61,49 @@ def resolve_alias(name):
     return name  # 如果找不到别名对应的固定名字,则返回原名


-def import_crawlers():
-    """按配置文件的抓取器顺序将该字段转换为抓取器的函数列表"""
-    unknown_mods = []
-    for _, mods
in Cfg().crawler.selection.items(): - valid_mods = [] - for name in mods: - try: - # 导入fc2fan抓取器的前提: 配置了fc2fan的本地路径 - # if name == 'fc2fan' and (not os.path.isdir(Cfg().Crawler.fc2fan_local_path)): - # logger.debug('由于未配置有效的fc2fan路径,已跳过该抓取器') - # continue - import_name = 'javsp.web.' + name - __import__(import_name) - valid_mods.append(import_name) # 抓取器有效: 使用完整模块路径,便于程序实际使用 - except ModuleNotFoundError: - unknown_mods.append(name) # 抓取器无效: 仅使用模块名,便于显示 - if unknown_mods: - logger.warning('配置的抓取器无效: ' + ', '.join(unknown_mods)) - - # 爬虫是IO密集型任务,可以通过多线程提升效率 -def parallel_crawler(movie: Movie, tqdm_bar=None): +async def parallel_crawler(movie: Movie, tqdm_bar=None) -> dict[CrawlerID, MovieInfo]: """使用多线程抓取不同网站的数据""" - def wrapper(parser, info: MovieInfo, retry): + + async def wrapper(id: CrawlerID, movie: MovieInfo) -> None: """对抓取器函数进行包装,便于更新提示信息和自动重试""" - crawler_name = threading.current_thread().name - task_info = f'Crawler: {crawler_name}: {info.dvdid}' - for cnt in range(retry): - try: - parser(info) - movie_id = info.dvdid or info.cid - logger.debug(f"{crawler_name}: 抓取成功: '{movie_id}': '{info.url}'") - setattr(info, 'success', True) - if isinstance(tqdm_bar, tqdm): - tqdm_bar.set_description(f'{crawler_name}: 抓取完成') - break - except MovieNotFoundError as e: - logger.debug(e) - break - except MovieDuplicateError as e: - logger.exception(e) - break - except (SiteBlocked, SitePermissionError, CredentialError) as e: - logger.error(e) - break - except requests.exceptions.RequestException as e: - logger.debug(f'{crawler_name}: 网络错误,正在重试 ({cnt+1}/{retry}): \n{repr(e)}') - if isinstance(tqdm_bar, tqdm): - tqdm_bar.set_description(f'{crawler_name}: 网络错误,正在重试') - except Exception as e: - logger.exception(e) + try: + crawler = await crawlers[id].create() + await crawler.crawl_and_fill(movie) + movie_id = info.dvdid or info.cid + logger.debug(f"{crawler.id.value}: 抓取成功: '{movie_id}': '{info.url}'") + setattr(info, 'success', True) + if isinstance(tqdm_bar, tqdm): + tqdm_bar.set_description(f'{crawler.id.value}: 抓取完成') + except MovieNotFoundError as e: + logger.debug(e) + except MovieDuplicateError as e: + logger.exception(e) + except (SiteBlocked, SitePermissionError, CredentialError) as e: + logger.error(e) + except Exception as e: + logger.exception(e) # 根据影片的数据源获取对应的抓取器 - crawler_mods: List[CrawlerID] = Cfg().crawler.selection[movie.data_src] + crawler_to_use: List[CrawlerID] = Cfg().crawler.selection[movie.data_src] + + all_info: Dict[CrawlerID, MovieInfo] = {i: MovieInfo(movie) for i in crawler_to_use} - all_info = {i.value: MovieInfo(movie) for i in crawler_mods} # 番号为cid但同时也有有效的dvdid时,也尝试使用普通模式进行抓取 if movie.data_src == 'cid' and movie.dvdid: - crawler_mods = crawler_mods + Cfg().crawler.selection.normal + crawler_to_use += Cfg().crawler.selection.normal for i in all_info.values(): i.dvdid = None for i in Cfg().crawler.selection.normal: all_info[i] = MovieInfo(movie.dvdid) - thread_pool = [] - for mod_partial, info in all_info.items(): - mod = f"javsp.web.{mod_partial}" - parser = getattr(sys.modules[mod], 'parse_data') - # 将all_info中的info实例传递给parser,parser抓取完成后,info实例的值已经完成更新 - # TODO: 抓取器如果带有parse_data_raw,说明它已经自行进行了重试处理,此时将重试次数设置为1 - if hasattr(sys.modules[mod], 'parse_data_raw'): - th = threading.Thread(target=wrapper, name=mod, args=(parser, info, 1)) - else: - th = threading.Thread(target=wrapper, name=mod, args=(parser, info, Cfg().network.retry)) - th.start() - thread_pool.append(th) - # 等待所有线程结束 - timeout = Cfg().network.retry * Cfg().network.timeout.total_seconds() - for th in 
thread_pool: - th: threading.Thread - th.join(timeout=timeout) + + co_pool: list[Coroutine[Any, Any, None]] = [] + for crawler_id, info in all_info.items(): + co_pool.append(wrapper(crawler_id, info)) + + # 等待所有协程结束 + asyncio.gather(*co_pool) + # 根据抓取结果更新影片类型判定 if movie.data_src == 'cid' and movie.dvdid: titles = [all_info[i].title for i in Cfg().crawler.selection[movie.data_src]] @@ -148,22 +115,22 @@ def wrapper(parser, info: MovieInfo, retry): movie.data_src = 'normal' movie.cid = None all_info = {k: v for k, v in all_info.items() if k not in Cfg().crawler.selection['cid']} + # 删除抓取失败的站点对应的数据 all_info = {k:v for k,v in all_info.items() if hasattr(v, 'success')} for info in all_info.values(): del info.success - # 删除all_info中键名中的'web.' - all_info = {k[4:]:v for k,v in all_info.items()} + return all_info -def info_summary(movie: Movie, all_info: Dict[str, MovieInfo]): +def info_summary(movie: Movie, all_info: Dict[CrawlerID, MovieInfo]): """汇总多个来源的在线数据生成最终数据""" final_info = MovieInfo(movie) ########## 部分字段配置了专门的选取逻辑,先处理这些字段 ########## # genre - if 'javdb' in all_info and all_info['javdb'].genre: - final_info.genre = all_info['javdb'].genre + if 'javdb' in all_info and all_info[CrawlerID.javdb].genre: + final_info.genre = all_info[CrawlerID.javdb].genre ########## 移除所有抓取器数据中,标题尾部的女优名 ########## if Cfg().summarizer.title.remove_trailing_actor_name: @@ -197,7 +164,7 @@ def info_summary(movie: Movie, all_info: Dict[str, MovieInfo]): setattr(final_info, attr, incoming) absorbed.append(attr) if absorbed: - logger.debug(f"从'{name}'中获取了字段: " + ' '.join(absorbed)) + logger.debug(f"从'{name.value}'中获取了字段: " + ' '.join(absorbed)) # 使用网站的番号作为番号 if Cfg().crawler.respect_site_avid: id_weight = {} @@ -216,7 +183,7 @@ def info_summary(movie: Movie, all_info: Dict[str, MovieInfo]): else: final_info.cid = final_id # javdb封面有水印,优先采用其他站点的封面 - javdb_cover = getattr(all_info.get('javdb'), 'cover', None) + javdb_cover = getattr(all_info.get(CrawlerID.javdb), 'cover', None) if javdb_cover is not None: match Cfg().crawler.use_javdb_cover: case UseJavDBCover.fallback: @@ -402,7 +369,7 @@ def should_use_ai_crop_match(label): fanart_cropped = add_label_to_poster(fanart_cropped, UNCENSORED_MARK_FILE, LabelPostion.BOTTOM_LEFT) fanart_cropped.save(movie.poster_file) -def RunNormalMode(all_movies): +async def RunNormalMode(all_movies): """普通整理模式""" def check_step(result, msg='步骤错误'): """检查一个整理步骤的结果,并负责更新tqdm的进度""" @@ -427,7 +394,7 @@ def check_step(result, msg='步骤错误'): inner_bar = tqdm(total=total_step, desc='步骤', ascii=True, leave=False) # 依次执行各个步骤 inner_bar.set_description(f'启动并发任务') - all_info = parallel_crawler(movie, inner_bar) + all_info = await parallel_crawler(movie, inner_bar) msg = f'为其配置的{len(Cfg().crawler.selection[movie.data_src])}个抓取器均未获取到影片信息' check_step(all_info, msg) @@ -447,9 +414,9 @@ def check_step(result, msg='步骤错误'): inner_bar.set_description('下载封面图片') if Cfg().summarizer.cover.highres: - cover_dl = download_cover(movie.info.covers, movie.fanart_file, movie.info.big_covers) + cover_dl = await download_cover(movie.info.covers, movie.fanart_file, movie.info.big_covers) else: - cover_dl = download_cover(movie.info.covers, movie.fanart_file) + cover_dl = await download_cover(movie.info.covers, movie.fanart_file) check_step(cover_dl, '下载封面图片失败') cover, pic_path = cover_dl # 确保实际下载的封面的url与即将写入到movie.info中的一致 @@ -476,12 +443,12 @@ def check_step(result, msg='步骤错误'): fanart_destination = f"{extrafanartdir}/{id}.png" try: - info = download(pic_url, fanart_destination) + info = await url_download(Url(pic_url), 
fanart_destination) if valid_pic(fanart_destination): filesize = get_fmt_size(pic_path) width, height = get_pic_size(pic_path) - elapsed = time.strftime("%M:%S", time.gmtime(info['elapsed'])) - speed = get_fmt_size(info['rate']) + '/s' + elapsed = str(info.elapsed) + speed = f"{info.get_rate()}Mbps" logger.info(f"已下载剧照{pic_url} {id}.png: {width}x{height}, {filesize} [{elapsed}, {speed}]") else: check_step(False, f"下载剧照{id}: {pic_url}失败") @@ -512,38 +479,29 @@ def check_step(result, msg='步骤错误'): return return_movies -def download_cover(covers, fanart_path, big_covers=[]): +async def download_cover(covers, fanart_path, big_covers=[]): """下载封面图片""" # 优先下载高清封面 for url in big_covers: pic_path = get_pic_path(fanart_path, url) - for _ in range(Cfg().network.retry): - try: - info = download(url, pic_path) - if valid_pic(pic_path): - filesize = get_fmt_size(pic_path) - width, height = get_pic_size(pic_path) - elapsed = time.strftime("%M:%S", time.gmtime(info['elapsed'])) - speed = get_fmt_size(info['rate']) + '/s' - logger.info(f"已下载高清封面: {width}x{height}, {filesize} [{elapsed}, {speed}]") - return (url, pic_path) - except requests.exceptions.HTTPError: - # HTTPError通常说明猜测的高清封面地址实际不可用,因此不再重试 - break + info = await url_download(Url(url), pic_path) + if valid_pic(pic_path): + filesize = get_fmt_size(pic_path) + width, height = get_pic_size(pic_path) + elapsed = str(info.elapsed) + speed = f"{info.get_rate()}Mbps" + logger.info(f"已下载高清封面: {width}x{height}, {filesize} [{elapsed}, {speed}]") + return (url, pic_path) # 如果没有高清封面或高清封面下载失败 for url in covers: pic_path = get_pic_path(fanart_path, url) - for _ in range(Cfg().network.retry): - try: - download(url, pic_path) - if valid_pic(pic_path): - logger.debug(f"已下载封面: '{url}'") - return (url, pic_path) - else: - logger.debug(f"图片无效或已损坏: '{url}',尝试更换下载地址") - break - except Exception as e: - logger.debug(e, exc_info=True) + await url_download(Url(url), pic_path) + if valid_pic(pic_path): + logger.debug(f"已下载封面: '{url}'") + return (url, pic_path) + else: + logger.debug(f"图片无效或已损坏: '{url}',尝试更换下载地址") + break logger.error(f"下载封面图片失败") logger.debug('big_covers:'+str(big_covers) + ', covers'+str(covers)) return None @@ -558,14 +516,7 @@ def get_pic_path(fanart_path, url): pic_path = fanart_base + "." 
+ pic_extend return pic_path -def error_exit(success, err_info): - """检查业务逻辑是否成功完成,如果失败则报错退出程序""" - if not success: - logger.error(err_info) - sys.exit(1) - - -def entry(): +async def aentry(): try: Cfg() except ValidationError as e: @@ -583,22 +534,28 @@ def entry(): # 检查更新 version_info = 'JavSP ' + getattr(sys, 'javsp_version', '未知版本/从代码运行') logger.debug(version_info.center(60, '=')) - check_update(Cfg().other.check_update, Cfg().other.auto_update) + await check_update(Cfg().other.check_update, Cfg().other.auto_update) root = get_scan_dir(Cfg().scanner.input_directory) - error_exit(root, '未选择要扫描的文件夹') + if root is None: + logger.error('未选择要扫描的文件夹') + sys.exit(1) # 导入抓取器,必须在chdir之前 - import_crawlers() os.chdir(root) print(f'扫描影片文件...') recognized = scan_movies(root) movie_count = len(recognized) recognize_fail = [] - error_exit(movie_count, '未找到影片文件') + if movie_count == 0: + logger.error('未找到影片文件') + sys.exit(1) logger.info(f'扫描影片文件:共找到 {movie_count} 部影片') - RunNormalMode(recognized + recognize_fail) + await RunNormalMode(recognized + recognize_fail) sys.exit(0) +def entry(): + asyncio.run(aentry(), debug=True) + if __name__ == "__main__": entry() diff --git a/javsp/config.py b/javsp/config.py index 3fbc8f071..e87b5dc28 100644 --- a/javsp/config.py +++ b/javsp/config.py @@ -39,9 +39,10 @@ class CrawlerID(str, Enum): class Network(BaseConfig): proxy_server: Url | None - retry: NonNegativeInt = 3 + retries: NonNegativeInt = 3 timeout: Duration - proxy_free: Dict[CrawlerID, Url] + unproxied: List[Url] + fallback: Dict[CrawlerID, List[str]] class CrawlerSelect(BaseConfig): def items(self) -> List[tuple[str, list[CrawlerID]]]: @@ -109,7 +110,6 @@ class Crawler(BaseConfig): required_keys: list[MovieInfoField] hardworking: bool respect_site_avid: bool - fc2fan_local_path: Path | None sleep_after_scraping: Duration use_javdb_cover: UseJavDBCover normalize_actress_name: bool diff --git a/javsp/crawlers/all.py b/javsp/crawlers/all.py new file mode 100644 index 000000000..8c262ecc1 --- /dev/null +++ b/javsp/crawlers/all.py @@ -0,0 +1,30 @@ +from collections.abc import Coroutine +from typing import Any, Dict +from javsp.config import CrawlerID +from javsp.crawlers.interface import Crawler +from javsp.crawlers.sites import \ + airav, arzon, arzon_iv, avsox, avwiki, dl_getchu, fanza, fc2, fc2ppvdb, \ + gyutto, jav321, javbus, javdb, javlib, javmenu, mgstage, njav, prestige + +__all__ = ['crawlers'] + +crawlers: Dict[CrawlerID, type[Crawler]] = { + CrawlerID.airav: airav. AiravCrawler, + CrawlerID.arzon: arzon. ArzonCrawler, + CrawlerID.arzon_iv: arzon_iv. ArzonIvCrawler, + CrawlerID.avsox: avsox. AvsoxCrawler, + CrawlerID.avwiki: avwiki. AvWikiCrawler, + CrawlerID.dl_getchu: dl_getchu.DlGetchuCrawler, + CrawlerID.fanza: fanza. FanzaCrawler, + CrawlerID.fc2: fc2. Fc2Crawler, + CrawlerID.fc2ppvdb: fc2ppvdb. Fc2PpvDbCrawler, + CrawlerID.gyutto: gyutto. GyuttoCrawler, + CrawlerID.jav321: jav321. Jav321Crawler, + CrawlerID.javbus: javbus. JavbusCrawler, + CrawlerID.javdb: javdb. JavDbCrawler, + CrawlerID.javlib: javlib. JavLibCrawler, + CrawlerID.javmenu: javmenu. JavMenuCrawler, + CrawlerID.mgstage: mgstage. MgstageCrawler, + CrawlerID.njav: njav. NjavCrawler, + CrawlerID.prestige: prestige. 
PrestigeCrawler, +} diff --git a/javsp/web/exceptions.py b/javsp/crawlers/exceptions.py similarity index 100% rename from javsp/web/exceptions.py rename to javsp/crawlers/exceptions.py diff --git a/javsp/crawlers/interface.py b/javsp/crawlers/interface.py new file mode 100644 index 000000000..a641b0a27 --- /dev/null +++ b/javsp/crawlers/interface.py @@ -0,0 +1,21 @@ +from httpx import AsyncClient +from javsp.config import CrawlerID +from javsp.datatype import MovieInfo +from abc import ABC, abstractmethod +from typing import Self + + +class Crawler(ABC): + base_url: str + client: AsyncClient + id: CrawlerID + + + @classmethod + @abstractmethod + async def create(cls) -> Self: + pass + + @abstractmethod + async def crawl_and_fill(self, movie: MovieInfo) -> None: + pass diff --git a/javsp/crawlers/proxyfree.py b/javsp/crawlers/proxyfree.py new file mode 100644 index 000000000..cafcfe062 --- /dev/null +++ b/javsp/crawlers/proxyfree.py @@ -0,0 +1,103 @@ +"""获取各个网站的免代理地址""" +from collections.abc import Callable, Coroutine +import re +import sys +from typing import Any, Dict + +from pydantic_core import Url +from pydantic_extra_types.pendulum_dt import Duration +from lxml import html + +from javsp.config import CrawlerID +from javsp.network.utils import test_connect +from javsp.network.client import get_client + + +async def _get_avsox_urls() -> list: + link = 'https://tellme.pw/avsox' + client = get_client(Url(link)) + resp = await client.get(link) + tree = html.fromstring(resp.text) + urls = tree.xpath('//h4/strong/a/@href') + return urls + + +async def _get_javbus_urls() -> list: + link = 'https://www.javbus.one/' + client = get_client(Url(link)) + resp = await client.get(link) + text = resp.text + urls = re.findall(r'防屏蔽地址:(https://(?:[\d\w][-\d\w]{1,61}[\d\w]\.){1,2}[a-z]{2,})', text, re.I | re.A) + return urls + + +async def _get_javlib_urls() -> list: + link = 'https://github.com/javlibcom' + client = get_client(Url(link)) + resp = await client.get(link) + tree = html.fromstring(resp.text) + text = tree.xpath("//div[@class='p-note user-profile-bio mb-3 js-user-profile-bio f4']")[0].text_content() + match = re.search(r'[\w\.]+', text, re.A) + if match: + domain = f'https://www.{match.group(0)}.com' + return [domain] + + +async def _get_javdb_urls() -> list: + root_link = 'https://jav524.app' + client = get_client(Url(root_link)) + resp = await client.get(root_link) + tree = html.fromstring(resp.text) + js_links = tree.xpath("//script[@src]/@src") + for link in js_links: + if '/js/index' in link: + link = root_link + link + resp = await client.get(link) + text = resp.text + match = re.search(r'\$officialUrl\s*=\s*"(https://(?:[\d\w][-\d\w]{1,61}[\d\w]\.){1,2}[a-z]{2,})"', text, flags=re.I | re.A) + if match: + return [match.group(1)] + +proxy_free_fns: Dict[CrawlerID, Callable[[], Coroutine[Any, Any, list[str]]]]= { + CrawlerID.avsox: _get_avsox_urls, + CrawlerID.javdb: _get_javdb_urls, + CrawlerID.javbus: _get_javbus_urls, + CrawlerID.javlib: _get_javlib_urls, +} + +def _choose_one(urls: list[str]) -> str: + for url in urls: + if test_connect(url, Duration(seconds=5)): + return url + return '' + +async def get_proxy_free_url(site_name: CrawlerID, prefer_url: str | None=None) -> str: + """获取指定网站的免代理地址 + Args: + site_name (str): 站点名称 + prefer_url (str, optional): 优先测试此url是否可用 + Returns: + str: 指定站点的免代理地址(失败时为空字符串) + """ + if prefer_url and test_connect(prefer_url, Duration(seconds=5)): + return prefer_url + + if site_name in proxy_free_fns: + try: + urls = await proxy_free_fns[site_name]() + 
return _choose_one(urls) + except: + return '' + else: + raise Exception("Dont't know how to get proxy-free url for " + site_name) + + + +if __name__ == "__main__": + + async def test_main(): + print('javdb:\t', await _get_javdb_urls()) + print('javlib:\t', await _get_javlib_urls()) + + import asyncio + asyncio.run(test_main()) diff --git a/javsp/crawlers/sites/airav.py b/javsp/crawlers/sites/airav.py new file mode 100644 index 000000000..5afd46998 --- /dev/null +++ b/javsp/crawlers/sites/airav.py @@ -0,0 +1,124 @@ +"""从airav抓取数据""" +import re +from html import unescape + +from javsp.crawlers.exceptions import MovieNotFoundError +from javsp.network.client import get_client +from javsp.network.utils import resolve_site_fallback +from javsp.config import Cfg, CrawlerID +from javsp.datatype import MovieInfo +from javsp.crawlers.interface import Crawler + + +class AiravCrawler(Crawler): + id = CrawlerID.airav + + @classmethod + async def create(cls): + self = cls() + url = await resolve_site_fallback(self.id, 'https://www.airav.wiki') + self.base_url = str(url) + self.client = get_client(url) + self.client.headers['Accept-Language'] = 'zh-TW,zh;q=0.9' + return self + + async def search_movie(self, dvdid: str): + """通过搜索番号获取指定的影片在网站上的ID""" + # 部分影片的ID并不直接等于番号(如012717-360),此时需要尝试通过搜索来寻找影片 + page = 0 + count = 1 + result = [] + while len(result) < count: + url = f'{self.base_url}/api/video/list?lang=zh-TW&lng=zh-TW&search={dvdid}&page={page}' + response = await self.client.get(url) + resp = response.json() + # {"offset": 2460, "count": 12345, "result": [...], "status": "ok"} + if resp['result']: + result.extend(resp['result']) + count = resp['count'] + page += 1 + else: # 结果为空,结束循环 + break + # 如果什么都没搜索到,直接返回 + if not result: + raise MovieNotFoundError(__name__, dvdid) + # 排序,以优先选择更符合预期的结果(如'012717_472'对应的'1pondo_012717_472'和'_1pondo_012717_472') + result.sort(key=lambda x:x['barcode']) + # 从所有搜索结果中选择最可能的番号,返回它的URL + target = dvdid.replace('-', '_') + for item in result: + # {'vid': '', 'slug': '', 'name': '', 'url': '', 'view': '', 'img_url': '', 'barcode': ''} + barcode = item['barcode'].replace('-', '_') + if target in barcode: + return item['barcode'] + raise MovieNotFoundError(__name__, dvdid, result) + + + + async def crawl_and_fill(self, movie: MovieInfo) -> None: + """解析指定番号的影片数据""" + # airav也提供简体,但是为了尽量保持女优名等与其他站点一致,抓取繁体的数据 + url = f'{self.base_url}/api/video/barcode/{movie.dvdid}?lng=zh-TW' + response = await self.client.get(url) + resp_json = response.json() + # 只在番号是纯数字时,尝试进行搜索,否则可能导致搜索到错误的影片信息 + if resp_json['count'] == 0 and re.match(r'\d{6}[-_]\d{2,3}', movie.dvdid): + barcode = await self.search_movie(movie.dvdid) + if barcode: + url = f'{self.base_url}/api/video/barcode/{barcode}?lng=zh-TW' + response = await self.client.get(url) + resp_json = response.json() + + if resp_json['count'] == 0: + raise MovieNotFoundError(__name__, movie.dvdid, resp_json) + + # 从API返回的数据中提取需要的字段 + # TODO: 数据中含有更多信息(如女优的中文&日文名对照),可能有助于未来功能扩展 + data = resp_json['result'] + dvdid = data['barcode'] + movie.dvdid = dvdid + movie.url = self.base_url + '/video/' + dvdid + # plot和title中可能含有HTML的转义字符,需要进行解转义处理 + movie.plot = unescape(data['description']) or None + movie.cover = data['img_url'] + # airav的genre是以搜索关键词的形式组织的,没有特定的genre_id + movie.genre = [i['name'] for i in data['tags']] + movie.title = unescape(data['name']) + movie.actress = [i['name'] for i in data['actors']] + movie.publish_date = data['publish_date'] + movie.preview_pics = data['images'] or [] + if data['factories']: + movie.producer = 
data['factories'][0]['name'] + + if Cfg().crawler.hardworking: + # 注意这里用的是获取的dvdid,而不是传入的movie.dvdid(如'1pondo_012717_472'与'012717_472') + video_url = f"{self.base_url}/api/video/getVideoMedia?barcode={dvdid}&vid={data['vid']}" + response = await self.client.get(video_url) + resp = response.json() + # 如果失败,结果如 {'msg': 'fail', 'status': 'fail'} + if 'data' in resp: + # 除url外还有url_cdn, url_hlx, url_hls_cdn字段,后两者为m3u8格式。目前将url作为预览视频的地址 + # TODO: 发现部分影片(如080719-976)的传统格式预览片错误 + movie.preview_video = resp['data'].get('url') + + # airav上部分影片会被标记为'馬賽克破壞版'等,这些影片的title、plot和genre都不再准确 + for keyword in ('馬賽克破壞版', '馬賽克破解版', '無碼流出版'): + if movie.title and keyword in movie.title: + movie.title = None + movie.genre = [] + if movie.plot and keyword in movie.plot: + movie.plot = None + movie.genre = [] + if not any([movie.title, movie.plot, movie.genre]): + break + +if __name__ == "__main__": + + async def test_main(): + crawler = await AiravCrawler.create() + movie = MovieInfo("DSAD-938") + await crawler.crawl_and_fill(movie) + print(movie) + + import asyncio + asyncio.run(test_main()) diff --git a/javsp/crawlers/sites/arzon.py b/javsp/crawlers/sites/arzon.py new file mode 100644 index 000000000..f4887f4d7 --- /dev/null +++ b/javsp/crawlers/sites/arzon.py @@ -0,0 +1,105 @@ +"""从arzon抓取数据""" +import re + +from javsp.network.utils import resolve_site_fallback +from javsp.network.client import get_client +from javsp.crawlers.interface import Crawler +from javsp.config import CrawlerID +from javsp.crawlers.exceptions import * +from javsp.datatype import MovieInfo +from lxml import html + +class ArzonCrawler(Crawler): + id = CrawlerID.arzon + + @classmethod + async def create(cls): + self = cls() + url = await resolve_site_fallback(self.id, "https://www.arzon.jp") + self.base_url = str(url) + self.client = get_client(url) + # https://www.arzon.jp/index.php?action=adult_customer_agecheck&agecheck=1&redirect=https%3A%2F%2Fwww.arzon.jp%2F + skip_verify_url = f"{self.base_url}/index.php?action=adult_customer_agecheck&agecheck=1" + await self.client.get(skip_verify_url) + return self + + async def crawl_and_fill(self, movie: MovieInfo) -> None: + """解析指定番号的影片数据""" + full_id = movie.dvdid + url = f'{self.base_url}/itemlist.html?t=&m=all&s=&q={full_id}' + # url = f'{base_url}/imagelist.html?q={full_id}' + + r = await self.client.get(url) + if r.status_code == 404: + raise MovieNotFoundError(__name__, movie.dvdid) + # https://stackoverflow.com/questions/15830421/xml-unicode-strings-with-encoding-declaration-are-not-supported + data = html.fromstring(r.content) + + urls = data.xpath("//h2/a/@href") + if len(urls) == 0: + raise MovieNotFoundError(__name__, movie.dvdid) + + item_url = self.base_url + urls[0] + e = await self.client.get(item_url) + item = html.fromstring(e.content) + + title = item.xpath("//div[@class='detail_title_new2']//h1/text()")[0] + cover = item.xpath("//td[@align='center']//a/img/@src")[0] + item_text = item.xpath("//div[@class='item_text']/text()") + plot = [item.strip() for item in item_text if item.strip() != ''][0] + preview_pics_arr = item.xpath("//div[@class='detail_img']//img/@src") + # 使用列表推导式添加 "http:" 并去除 "m_" + preview_pics = [("https:" + url).replace("m_", "") for url in preview_pics_arr] + + container = item.xpath("//div[@class='item_register']/table//tr") + for row in container: + key = row.xpath("./td[1]/text()")[0] + contents = row.xpath("./td[2]//text()") + content = [item.strip() for item in contents if item.strip() != ''] + index = 0 + value = content[index] if content and index 
< len(content) else None + if key == "AV女優:": + movie.actress = content + if key == "AVメーカー:": + movie.producer = value + if key == "AVレーベル:": + video_type = value + if key == "シリーズ:": + movie.serial = value + if key == "監督:": + movie.director = value + if key == "発売日:" and value: + movie.publish_date = re.search(r"\d{4}/\d{2}/\d{2}", value).group(0).replace("/", "-") + if key == "収録時間:" and value: + movie.duration = re.search(r'([\d.]+)分', value).group(1) + if key == "品番:": + dvd_id = value + elif key == "タグ:": + genre = value + + genres = '' + if video_type: + genres = [video_type] + if(genre != None): + genres.append(genre) + + movie.genre = genres + movie.url = item_url + movie.title = title + movie.plot = plot + movie.cover = f'https:{cover}' + movie.preview_pics = preview_pics + +if __name__ == "__main__": + + async def test_main(): + crawler = await ArzonCrawler.create() + movie = MovieInfo("CSCT-011") + try: + await crawler.crawl_and_fill(movie) + print(movie) + except Exception as e: + print(repr(e)) + + import asyncio + asyncio.run(test_main()) diff --git a/javsp/crawlers/sites/arzon_iv.py b/javsp/crawlers/sites/arzon_iv.py new file mode 100644 index 000000000..a84c97aea --- /dev/null +++ b/javsp/crawlers/sites/arzon_iv.py @@ -0,0 +1,100 @@ +"""从arzon_iv抓取数据""" +import re + + +from javsp.network.utils import resolve_site_fallback +from javsp.network.client import get_client +from javsp.crawlers.interface import Crawler +from javsp.config import CrawlerID +from javsp.crawlers.exceptions import * +from javsp.datatype import MovieInfo +from lxml import html + +class ArzonIvCrawler(Crawler): + id = CrawlerID.arzon_iv + + @classmethod + async def create(cls): + self = cls() + url = await resolve_site_fallback(self.id, "https://www.arzon.jp") + self.base_url = str(url) + self.client = get_client(url) + # https://www.arzon.jp/index.php?action=adult_customer_agecheck&agecheck=1&redirect=https%3A%2F%2Fwww.arzon.jp%2F + skip_verify_url = f"{self.base_url}/index.php?action=adult_customer_agecheck&agecheck=1" + await self.client.get(skip_verify_url) + return self + + async def crawl_and_fill(self, movie: MovieInfo) -> None: + """解析指定番号的影片数据""" + full_id = movie.dvdid + url = f'{self.base_url}/imagelist.html?q={full_id}' + # url = f'{base_url}/imagelist.html?q={full_id}' + + r = await self.client.get(url) + if r.status_code == 404: + raise MovieNotFoundError(__name__, movie.dvdid) + # https://stackoverflow.com/questions/15830421/xml-unicode-strings-with-encoding-declaration-are-not-supported + data = html.fromstring(r.content) + + urls = data.xpath("//h2/a/@href") + if len(urls) == 0: + raise MovieNotFoundError(__name__, movie.dvdid) + + item_url = self.base_url + urls[0] + e = await self.client.get(item_url) + item = html.fromstring(e.content) + + title = item.xpath("//div[@class='detail_title_new']//h1/text()")[0] + cover = item.xpath("//td[@align='center']//a/img/@src")[0] + item_text = item.xpath("//div[@class='item_text']/text()") + plot = [item.strip() for item in item_text if item.strip() != ''][0] + + container = item.xpath("//div[@class='item_register']/table//tr") + for row in container: + key = row.xpath("./td[1]/text()")[0] + contents = row.xpath("./td[2]//text()") + content = [item.strip() for item in contents if item.strip() != ''] + index = 0 + value = content[index] if content and index < len(content) else None + if key == "タレント:": + movie.actress = content + if key == "イメージメーカー:": + movie.producer = value + if key == "イメージレーベル:": + video_type = value + if key == "監督:": + 
movie.director = value + if key == "発売日:" and value: + movie.publish_date = re.search(r"\d{4}/\d{2}/\d{2}", value).group(0).replace("/", "-") + if key == "収録時間:" and value: + movie.duration = re.search(r'([\d.]+)分', value).group(1) + if key == "品番:": + dvd_id = value + elif key == "タグ:": + genre = value + + genres = '' + if video_type: + genres = [video_type] + if(genre != None): + genres.append(genre) + + movie.genre = genres + movie.url = item_url + movie.title = title + movie.plot = plot + movie.cover = f'https:{cover}' + +if __name__ == "__main__": + + async def test_main(): + crawler = await ArzonIvCrawler.create() + movie = MovieInfo("KIDM-1137B") + try: + await crawler.crawl_and_fill(movie) + print(movie) + except Exception as e: + print(repr(e)) + + import asyncio + asyncio.run(test_main()) diff --git a/javsp/crawlers/sites/avsox.py b/javsp/crawlers/sites/avsox.py new file mode 100644 index 000000000..47b0ea32d --- /dev/null +++ b/javsp/crawlers/sites/avsox.py @@ -0,0 +1,88 @@ +"""从avsox抓取数据""" + +from javsp.crawlers.exceptions import MovieNotFoundError +from javsp.datatype import MovieInfo +from javsp.network.utils import resolve_site_fallback +from javsp.network.client import get_client +from javsp.crawlers.interface import Crawler +from javsp.config import CrawlerID +from lxml import html + +class AvsoxCrawler(Crawler): + id = CrawlerID.avsox + + @classmethod + async def create(cls): + self = cls() + url = await resolve_site_fallback(self.id, "https://avsox.click/") + self.base_url = str(url) + self.client = get_client(url) + return self + + async def crawl_and_fill(self, movie: MovieInfo) -> None: + full_id: str = movie.dvdid + if full_id.startswith('FC2-'): + full_id = full_id.replace('FC2-', 'FC2-PPV-') + resp = await self.client.get(f'{self.base_url}tw/search/{full_id}') + tree = html.fromstring(resp.text) + tree.make_links_absolute(str(resp.url), resolve_base_href=True) + ids = tree.xpath("//div[@class='photo-info']/span/date[1]/text()") + urls = tree.xpath("//a[contains(@class, 'movie-box')]/@href") + ids_lower = list(map(str.lower, ids)) + if full_id.lower() not in ids_lower: + raise MovieNotFoundError(__name__, movie.dvdid, ids) + + url = urls[ids_lower.index(full_id.lower())] + url = url.replace('/tw/', '/cn/', 1) + + # 提取影片信息 + resp = await self.client.get(url) + # with open('file.html', 'wb') as f: + # f.write(resp.content) + tree = html.fromstring(resp.text) + container = tree.xpath("/html/body/div[@class='container']")[0] + title = container.xpath("h3/text()")[0] + cover = container.xpath("//a[@class='bigImage']/@href")[0] + info = container.xpath("div/div[@class='col-md-3 info']")[0] + dvdid = info.xpath("p/span[@style]/text()")[0] + publish_date = info.xpath("p/span[text()='发行时间:']")[0].tail.strip() + duration = info.xpath("p/span[text()='长度:']")[0].tail.replace('分钟', '').strip() + producer, serial = None, None + producer_tag = info.xpath("p[text()='制作商: ']")[0].getnext().xpath("a") + if producer_tag: + producer = producer_tag[0].text_content() + serial_tag = info.xpath("p[text()='系列:']") + if serial_tag: + serial = serial_tag[0].getnext().xpath("a/text()")[0] + genre = info.xpath("p/span[@class='genre']/a/text()") + actress = container.xpath("//a[@class='avatar-box']/span/text()") + + movie.dvdid = dvdid.replace('FC2-PPV-', 'FC2-') + movie.url = url + movie.title = title.replace(dvdid, '').strip() + movie.cover = cover + movie.publish_date = publish_date + movie.duration = duration + movie.genre = genre + movie.actress = actress + if full_id.startswith('FC2-'): + 
# avsox把FC2作品的拍摄者归类到'系列'而制作商固定为'FC2-PPV',这既不合理也与其他的站点不兼容,因此进行调整 + movie.producer = serial + else: + movie.producer = producer + movie.serial = serial + + +if __name__ == "__main__": + + async def test_main(): + crawler = await AvsoxCrawler.create() + movie = MovieInfo("082713-417") + try: + await crawler.crawl_and_fill(movie) + print(movie) + except Exception as e: + print(repr(e)) + + import asyncio + asyncio.run(test_main()) diff --git a/javsp/crawlers/sites/avwiki.py b/javsp/crawlers/sites/avwiki.py new file mode 100644 index 000000000..7bc2041e5 --- /dev/null +++ b/javsp/crawlers/sites/avwiki.py @@ -0,0 +1,82 @@ +"""从av-wiki抓取数据""" + +from javsp.crawlers.exceptions import MovieNotFoundError +from javsp.datatype import MovieInfo +from javsp.crawlers.interface import Crawler +from javsp.network.utils import resolve_site_fallback +from javsp.network.client import get_client +from javsp.config import CrawlerID +from lxml import html + +class AvWikiCrawler(Crawler): + id = CrawlerID.avwiki + + @classmethod + async def create(cls): + self = cls() + url = await resolve_site_fallback(self.id, 'https://av-wiki.net') + self.base_url = str(url) + self.client = get_client(url) + return self + + async def crawl_and_fill(self, movie: MovieInfo) -> None: + """从网页抓取并解析指定番号的数据 + Args: + movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内 + """ + movie.url = url = f'{self.base_url}/{movie.dvdid}' + + resp = await self.client.get(url) + if resp.status_code == 404: + raise MovieNotFoundError(__name__, movie.dvdid) + tree = html.fromstring(resp.content) + + cover_tag = tree.xpath("//header/div/a[@class='image-link-border']/img") + if cover_tag: + try: + srcset = cover_tag[0].get('srcset').split(', ') + src_set_urls = {} + for src in srcset: + url, width = src.split() + width = int(width.rstrip('w')) + src_set_urls[width] = url + max_pic = sorted(src_set_urls.items(), key=lambda x:x[0], reverse=True) + movie.cover = max_pic[0][1] + except: + movie.cover = cover_tag[0].get('src') + body = tree.xpath("//section[@class='article-body']")[0] + title = body.xpath("div/p/text()")[0] + title = title.replace(f"【{movie.dvdid}】", '') + cite_url = body.xpath("div/cite/a/@href")[0] + cite_url = cite_url.split('?aff=')[0] + info = body.xpath("dl[@class='dltable']")[0] + dt_txt_ls, dd_tags = info.xpath("dt/text()"), info.xpath("dd") + data = {} + for dt_txt, dd in zip(dt_txt_ls, dd_tags): + dt_txt = dt_txt.strip() + a_tag = dd.xpath('a') + if len(a_tag) == 0: + dd_txt = dd.text.strip() + else: + dd_txt = [i.text.strip() for i in a_tag] + if isinstance(dd_txt, list) and dt_txt != 'AV女優名': # 只有女优名以列表的数据格式保留 + dd_txt = dd_txt[0] + data[dt_txt] = dd_txt + + ATTR_MAP = {'メーカー': 'producer', 'AV女優名': 'actress', 'メーカー品番': 'dvdid', 'シリーズ': 'serial', '配信開始日': 'publish_date'} + for key, attr in ATTR_MAP.items(): + setattr(movie, attr, data.get(key)) + movie.title = title + movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片 + + +if __name__ == "__main__": + + async def test_main(): + crawler = await AvWikiCrawler.create() + movie = MovieInfo("259LUXU-593") + await crawler.crawl_and_fill(movie) + print(movie) + + import asyncio + asyncio.run(test_main()) diff --git a/javsp/crawlers/sites/dl_getchu.py b/javsp/crawlers/sites/dl_getchu.py new file mode 100644 index 000000000..c2ab0814f --- /dev/null +++ b/javsp/crawlers/sites/dl_getchu.py @@ -0,0 +1,131 @@ +"""从dl.getchu官网抓取数据""" +import re +import logging + +from javsp.config import CrawlerID +from javsp.crawlers.exceptions import MovieNotFoundError +from javsp.crawlers.interface import 
Crawler +from javsp.network.client import get_client +from javsp.network.utils import resolve_site_fallback +from javsp.crawlers.exceptions import * +from javsp.datatype import MovieInfo +from lxml import html +from lxml.html import HtmlElement + +def get_movie_title(tree: HtmlElement): + container = tree.xpath("//form[@action='https://dl.getchu.com/cart/']/div/table[2]") + if len(container) > 0: + container = container[0] + rows = container.xpath('.//tr') + title = '' + for row in rows: + for cell in row.xpath('.//td/div'): + # 获取单元格文本内容 + if cell.text: + title = str(cell.text).strip() + return title + + +def get_movie_img(tree: HtmlElement, getchu_id: str): + img_src = '' + container = tree.xpath(f'//img[contains(@src, "{getchu_id}top.jpg")]') + if len(container) > 0: + container = container[0] + img_src = container.get('src') + return img_src + + +def get_movie_preview(tree: HtmlElement, getchu_id: str): + preview_pics = [] + container = tree.xpath(f'//img[contains(@src, "{getchu_id}_")]') + if len(container) > 0: + for c in container: + preview_pics.append(c.get('src')) + return preview_pics + + +DURATION_PATTERN = re.compile(r'(?:動画)?(\d+)分') + + +class DlGetchuCrawler(Crawler): + id = CrawlerID.dl_getchu + + @classmethod + async def create(cls): + self = cls() + url = await resolve_site_fallback(self.id, 'https://dl.getchu.com') + self.base_url = str(url) + self.client = get_client(url) + return self + + async def crawl_and_fill(self, movie: MovieInfo) -> None: + """解析指定番号的影片数据""" + # 去除番号中的'GETCHU'字样 + id_uc = movie.dvdid.upper() + if not id_uc.startswith('GETCHU-'): + raise ValueError('Invalid GETCHU number: ' + movie.dvdid) + getchu_id = id_uc.replace('GETCHU-', '') + # 抓取网页 + url = f'{self.base_url}/i/item{getchu_id}' + r = await self.client.get(url) + if r.status_code == 404: + raise MovieNotFoundError(__name__, movie.dvdid) + tree = html.fromstring(r.text) + container = tree.xpath("//form[@action='https://dl.getchu.com/cart/']/div/table[3]") + if len(container) > 0: + container = container[0] + # 将表格提取为键值对 + rows = container.xpath('.//table/tr') + kv_rows = [i for i in rows if len(i) == 2] + data = {} + for row in kv_rows: + # 获取单元格文本内容 + key = row.xpath("td[@class='bluetext']/text()")[0] + # 是否包含a标签: 有的属性是用表示的,不是text + a_tags = row.xpath("td[2]/a") + if a_tags: + value = [i.text for i in a_tags] + else: + # 获取第2个td标签的内容(下标从1开始计数) + value = row.xpath("td[2]/text()") + data[key] = value + + for key, value in data.items(): + if key == 'サークル': + movie.producer = value[0] + elif key == '作者': + # 暂时没有在getchu找到多个actress的片子 + movie.actress = [i.strip() for i in value] + elif key == '画像数&ページ数': + match = DURATION_PATTERN.search(' '.join(value)) + if match: + movie.duration = match.group(1) + elif key == '配信開始日': + movie.publish_date = value[0].replace('/', '-') + elif key == '趣向': + movie.genre = value + elif key == '作品内容': + idx = -1 + for i, line in enumerate(value): + if line.lstrip().startswith('※'): + idx = i + break + movie.plot = ''.join(value[:idx]) + + movie.title = get_movie_title(tree) + movie.cover = get_movie_img(tree, getchu_id) + movie.preview_pics = get_movie_preview(tree, getchu_id) + movie.dvdid = id_uc + movie.url = url + + +if __name__ == "__main__": + + async def test_main(): + crawler = await DlGetchuCrawler.create() + movie = MovieInfo('getchu-4041026') + await crawler.crawl_and_fill(movie) + print(movie) + + import asyncio + asyncio.run(test_main()) diff --git a/javsp/crawlers/sites/fanza.py b/javsp/crawlers/sites/fanza.py new file mode 100644 index 
000000000..66b895df5 --- /dev/null +++ b/javsp/crawlers/sites/fanza.py @@ -0,0 +1,246 @@ +"""从fanza抓取数据""" + +import re +import json +import logging +from typing import Dict, List, Tuple + +from httpx import Response + + +from javsp.crawlers.exceptions import MovieNotFoundError, SiteBlocked +from javsp.crawlers.interface import Crawler +from javsp.config import CrawlerID +from javsp.network.utils import resolve_site_fallback +from javsp.network.client import get_client +from javsp.config import Cfg +from javsp.datatype import MovieInfo + +from lxml import html +from lxml.html import HtmlElement + +logger = logging.getLogger(__name__) + + +_PRODUCT_PRIORITY = {'digital': 10, 'mono': 5, 'monthly': 2, 'rental': 1} +_TYPE_PRIORITY = {'videoa': 10, 'anime': 8, 'nikkatsu': 6, 'doujin': 4, 'dvd': 3, 'ppr': 2, 'paradisetv': 1} +def sort_search_result(result: List[Dict]): + """排序搜索结果""" + scores = {i['url']:(_PRODUCT_PRIORITY.get(i['product'], 0), _TYPE_PRIORITY.get(i['type'], 0)) for i in result} + sorted_result = sorted(result, key=lambda x:scores[x['url']], reverse=True) + return sorted_result + + +def resp2html_wrapper(resp: Response) -> HtmlElement: + tree = html.fromstring(resp.text) + if 'not available in your region' in tree.text_content(): + raise SiteBlocked('FANZA不允许从当前IP所在地区访问,请检查你的网络和代理服务器设置') + elif '/login/' in str(resp.url): + raise SiteBlocked('FANZA要求当前IP登录账号才可访问,请尝试更换为日本IP') + return tree + + + + +def parse_anime_page(movie: MovieInfo, tree: HtmlElement): + """解析动画影片的页面布局""" + title = tree.xpath("//h1[@id='title']/text()")[0] + container = tree.xpath("//table[@class='mg-b12']/tr/td")[0] + cover = container.xpath("//img[@name='package-image']/@src")[0] + date_str = container.xpath("//td[text()='発売日:']/following-sibling::td/text()")[0].strip() + publish_date = date_str.replace('/', '-') + duration_tag = container.xpath("//td[text()='収録時間:']/following-sibling::td/text()") + if duration_tag: + movie.duration = duration_tag[0].strip().replace('分', '') + serial_tag = container.xpath("//td[text()='シリーズ:']/following-sibling::td/a/text()") + if serial_tag: + movie.serial = serial_tag[0].strip() + producer_tag = container.xpath("//td[text()='メーカー:']/following-sibling::td/a/text()") + if producer_tag: + movie.producer = producer_tag[0].strip() + genre_tags = container.xpath("//td[text()='ジャンル:']/following-sibling::td/a[contains(@href,'article=keyword')]") + genre, genre_id = [], [] + for tag in genre_tags: + genre.append(tag.text.strip()) + genre_id.append(tag.get('href').split('=')[-1].strip('/')) + cid = container.xpath("//td[text()='品番:']/following-sibling::td/text()")[0].strip() + plot = container.xpath("//div[@class='mg-b20 lh4']/p")[0].text_content().strip() + preview_pics = container.xpath("//a[@name='sample-image']/img/@data-lazy") + score_img = container.xpath("//td[text()='平均評価:']/following-sibling::td/img/@src")[0] + score = int(score_img.split('/')[-1].split('.')[0]) # 00, 05 ... 
50 + + movie.cid = cid + movie.title = title + movie.cover = cover + movie.publish_date = publish_date + movie.genre = genre + movie.genre_id = genre_id + movie.plot = plot + movie.score = f'{score/5:.2f}' # 转换为10分制 + movie.preview_pics = preview_pics + movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片 + + +# parse_dvd_page = parse_videoa_page # 118wtktabf067 +# parse_ppr_page = parse_videoa_page +# parse_nikkatsu_page = parse_videoa_page +# parse_doujin_page = parse_anime_page + +class FanzaCrawler(Crawler): + id = CrawlerID.fanza + + async def get_urls_of_cid(self, cid: str) -> Tuple[str, str]: + """搜索cid可能的影片URL""" + r = await self.client.get(f"{self.base_url}search/?redirect=1&enc=UTF-8&category=&searchstr={cid}&commit.x=0&commit.y=0") + if r.status_code == 404: + raise MovieNotFoundError(__name__, cid) + r.raise_for_status() + tree = resp2html_wrapper(r) + result = tree.xpath("//ul[@id='list']/li/div/p/a/@href") + parsed_result = {} + for url in result: + items = url.split('/') + type_, cid = None, None + for i, part in enumerate(items): + if part == '-': + product, type_ = items[i-2], items[i-1] + elif part.startswith('cid='): + cid = part[4:] + new_url = '/'.join(i for i in items if not i.startswith('?')) + '/' + parsed_result.setdefault(cid, []).append({'product': product, 'type': type_, 'url': new_url}) + break + if cid not in parsed_result: + if len(result) > 0: + logger.debug(f"Unknown URL in search result: " + ', '.join(result)) + raise MovieNotFoundError(__name__, cid) + sorted_result = sort_search_result(parsed_result[cid]) + return sorted_result + + @classmethod + async def create(cls): + self = cls() + url = await resolve_site_fallback(self.id, 'https://www.dmm.co.jp') + self.base_url = str(url) + self.client = get_client(url) + + # 初始化Request实例(要求携带已通过R18认证的cookies,否则会被重定向到认证页面) + self.client.cookies = {'age_check_done': '1'} + self.client.headers['Accept-Language'] = 'ja,en-US;q=0.9' + return self + + + async def crawl_and_fill(self, movie: MovieInfo) -> None: + """解析指定番号的影片数据""" + default_url = f'{self.base_url}digital/videoa/-/detail/=/cid={movie.cid}/' + r0 = await self.client.get(default_url) + if r0.status_code == 404: + urls = await self.get_urls_of_cid(movie.cid) + for d in urls: + func_name = f"parse_{d['type']}_page" + if func_name in globals(): + parse_func = globals()[func_name] + else: + logger.debug(f"不知道怎么解析 fanza {d['type']} 的页面: {d['url']}") + continue + r = await self.client.get(d['url']) + tree = resp2html_wrapper(r) + try: + parse_func(movie, tree) + movie.url = d['url'] + break + except: + logger.debug(f"Fail to parse {d['url']}", exc_info=True) + if d is urls[-1]: + logger.warning(f"在fanza查找到的cid={movie.cid}的影片页面均解析失败") + raise + else: + tree = resp2html_wrapper(r0) + await self.parse_videoa_page(movie, tree) + movie.url = default_url + + async def parse_videoa_page(self, movie: MovieInfo, tree: HtmlElement): + """解析AV影片的页面布局""" + title = tree.xpath("//div[@class='hreview']/h1/text()")[0] + # 注意: 浏览器在渲染时会自动加上了'tbody'字段,但是原始html网页中并没有,因此xpath解析时还是要按原始网页的来 + container = tree.xpath("//table[@class='mg-b12']/tr/td")[0] + cover = container.xpath("//div[@id='sample-video']/a/@href")[0] + # 采用'配信開始日'作为发布日期: https://www.zhihu.com/question/57513172/answer/153219083 + date_tag = container.xpath("//td[text()='配信開始日:']/following-sibling::td/text()") + if date_tag: + movie.publish_date = date_tag[0].strip().replace('/', '-') + duration_str = container.xpath("//td[text()='収録時間:']/following-sibling::td/text()")[0].strip() + match = re.search(r'\d+', duration_str) 
+ if match: + movie.duration = match.group(0) + # 女优、导演、系列:字段不存在时,匹配将得到空列表。暂未发现有名字不显示在a标签中的情况 + actress = container.xpath("//span[@id='performer']/a/text()") + director_tag = container.xpath("//td[text()='監督:']/following-sibling::td/a/text()") + if director_tag: + movie.director = director_tag[0].strip() + serial_tag = container.xpath("//td[text()='シリーズ:']/following-sibling::td/a/text()") + if serial_tag: + movie.serial = serial_tag[0].strip() + producer_tag = container.xpath("//td[text()='メーカー:']/following-sibling::td/a/text()") + if producer_tag: + movie.producer = producer_tag[0].strip() + # label: 大意是某个系列策划用同样的番号,例如ABS打头的番号label是'ABSOLUTELY PERFECT',暂时用不到 + # label_tag = container.xpath("//td[text()='レーベル:']/following-sibling::td/a/text()") + # if label_tag: + # label = label_tag[0].strip() + # fanza会把促销信息也写进genre……因此要根据tag指向的链接类型进行筛选 + genre_tags = container.xpath("//td[text()='ジャンル:']/following-sibling::td/a[contains(@href,'?keyword=') or contains(@href,'article=keyword')]") + genre, genre_id = [], [] + for tag in genre_tags: + genre.append(tag.text.strip()) + genre_id.append(tag.get('href').split('=')[-1].strip('/')) + cid = container.xpath("//td[text()='品番:']/following-sibling::td/text()")[0].strip() + plot = container.xpath("//div[contains(@class, 'mg-b20 lh4')]/text()")[0].strip() + preview_pics = container.xpath("//a[@name='sample-image']/img/@src") + score_tag = container.xpath("//p[@class='d-review__average']/strong/text()") + if score_tag: + match = re.search(r'\d+', score_tag[0].strip()) + if match: + score = float(match.group()) * 2 + movie.score = f'{score:.2f}' + else: + score_img = container.xpath("//td[text()='平均評価:']/following-sibling::td/img/@src")[0] + movie.score = int(score_img.split('/')[-1].split('.')[0]) # 00, 05 ... 50 + + if Cfg().crawler.hardworking: + # 预览视频是动态加载的,不在静态网页中 + video_url = f'{self.base_url}service/digitalapi/-/html5_player/=/cid={movie.cid}' + resp = await self.client.get(video_url) + tree2 = html.fromstring(resp.text) + # 目前用到js脚本的地方不多,所以不使用专门的js求值模块,先用正则提取文本然后用json解析数据 + script = tree2.xpath("//script[contains(text(),'getElementById(\"dmmplayer\")')]/text()")[0].strip() + match = re.search(r'\{.*\}', script) + # 主要是为了捕捉json.loads的异常,但是也借助try-except判断是否正则表达式是否匹配 + try: + data = json.loads(match.group()) + video_url = data.get('src') + if video_url and video_url.startswith('//'): + video_url = 'https:' + video_url + movie.preview_video = video_url + except Exception as e: + logger.debug('解析视频地址时异常: ' + repr(e)) + + movie.cid = cid + movie.title = title + movie.cover = cover + movie.actress = actress + movie.genre = genre + movie.genre_id = genre_id + movie.plot = plot + movie.preview_pics = preview_pics + movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片 + +if __name__ == "__main__": + + async def test_main(): + crawler = await FanzaCrawler.create() + movie = MovieInfo(cid="d_aisoft3356") + await crawler.crawl_and_fill(movie) + print(movie) + + import asyncio + asyncio.run(test_main()) diff --git a/javsp/crawlers/sites/fc2.py b/javsp/crawlers/sites/fc2.py new file mode 100644 index 000000000..0ce072b90 --- /dev/null +++ b/javsp/crawlers/sites/fc2.py @@ -0,0 +1,120 @@ +"""从FC2官网抓取数据""" +import logging + +from lxml import html + + +from javsp.crawlers.exceptions import * +from javsp.config import Cfg +from javsp.lib import strftime_to_minutes +from javsp.datatype import MovieInfo +from javsp.crawlers.interface import Crawler +from javsp.network.utils import resolve_site_fallback +from javsp.network.client import get_client +from javsp.config 
import CrawlerID + + +logger = logging.getLogger(__name__) + +class Fc2Crawler(Crawler): + id = CrawlerID.fc2 + + @classmethod + async def create(cls): + self = cls() + url = await resolve_site_fallback(self.id, 'https://adult.contents.fc2.com') + self.base_url = str(url) + self.client = get_client(url) + return self + + async def get_movie_score(self, fc2_id: str) -> float | None: + """通过评论数据来计算FC2的影片评分(10分制),无法获得评分时返回None""" + resp = await self.client.get(f'{self.base_url}/article/{fc2_id}/review') + tree = html.fromstring(resp.text) + review_tags = tree.xpath("//ul[@class='items_comment_headerReviewInArea']/li") + reviews = {} + for tag in review_tags: + score = int(tag.xpath("div/span/text()")[0]) + vote = int(tag.xpath("span")[0].text_content()) + reviews[score] = vote + total_votes = sum(reviews.values()) + if (total_votes >= 2): # 至少也该有两个人评价才有参考意义一点吧 + summary = sum([k*v for k, v in reviews.items()]) + final_score = summary / total_votes * 2 # 乘以2转换为10分制 + return final_score + + + async def crawl_and_fill(self, movie: MovieInfo) -> None: + """解析指定番号的影片数据""" + # 去除番号中的'FC2'字样 + id_uc = movie.dvdid.upper() + if not id_uc.startswith('FC2-'): + raise ValueError('Invalid FC2 number: ' + movie.dvdid) + fc2_id = id_uc.replace('FC2-', '') + # 抓取网页 + url = f'{self.base_url}/article/{fc2_id}/' + resp = await self.client.get(url) + if '/id.fc2.com/' in str(resp.url): + raise SiteBlocked('FC2要求当前IP登录账号才可访问,请尝试更换为日本IP') + tree = html.fromstring(resp.text) + container = tree.xpath("//div[@class='items_article_left']") + if len(container) > 0: + container = container[0] + else: + raise MovieNotFoundError(__name__, movie.dvdid) + # FC2 标题增加反爬乱码,使用数组合并标题 + title_arr = container.xpath("//div[@class='items_article_headerInfo']/h3/text()") + title = ''.join(title_arr) + thumb_tag = container.xpath("//div[@class='items_article_MainitemThumb']")[0] + thumb_pic = thumb_tag.xpath("span/img/@src")[0] + duration_str = thumb_tag.xpath("span/p[@class='items_article_info']/text()")[0] + # FC2没有制作商和发行商的区分,作为个人市场,影片页面的'by'更接近于制作商 + producer = container.xpath("//li[text()='by ']/a/text()")[0] + genre = container.xpath("//a[@class='tag tagTag']/text()") + date_str = container.xpath("//div[@class='items_article_Releasedate']/p/text()")[0] + publish_date = date_str[-10:].replace('/', '-') # '販売日 : 2017/11/30' + preview_pics = container.xpath("//ul[@data-feed='sample-images']/li/a/@href") + + if Cfg().crawler.hardworking: + # 通过评论数据来计算准确的评分 + score = await self.get_movie_score(fc2_id) + if score: + movie.score = f'{score:.2f}' + # 预览视频是动态加载的,不在静态网页中 + desc_frame_url = container.xpath("//section[@class='items_article_Contents']/iframe/@src")[0] + key = desc_frame_url.split('=')[-1] # /widget/article/718323/description?ac=60fc08fa... 
+ api_url = f'{self.base_url}/api/v2/videos/{fc2_id}/sample?key={key}' + resp = await self.client.get(api_url) + j = resp.json() + movie.preview_video = j['path'] + else: + # 获取影片评分。影片页面的评分只能粗略到星级,且没有分数,要通过类名来判断,如'items_article_Star5'表示5星 + score_tag_attr = container.xpath("//a[@class='items_article_Stars']/p/span/@class")[0] + score = int(score_tag_attr[-1]) * 2 + movie.score = f'{score:.2f}' + + movie.dvdid = id_uc + movie.url = url + movie.title = title + movie.genre = genre + movie.producer = producer + movie.duration = str(strftime_to_minutes(duration_str)) + movie.publish_date = publish_date + movie.preview_pics = preview_pics + # FC2的封面是220x220的,和正常封面尺寸、比例都差太多。如果有预览图片,则使用第一张预览图作为封面 + if movie.preview_pics: + movie.cover = preview_pics[0] + else: + movie.cover = thumb_pic + + +if __name__ == "__main__": + + async def test_main(): + crawler = await Fc2Crawler.create() + movie = MovieInfo("FC2-718323") + await crawler.crawl_and_fill(movie) + print(movie) + + import asyncio + asyncio.run(test_main()) diff --git a/javsp/crawlers/sites/fc2ppvdb.py b/javsp/crawlers/sites/fc2ppvdb.py new file mode 100644 index 000000000..fbba590c2 --- /dev/null +++ b/javsp/crawlers/sites/fc2ppvdb.py @@ -0,0 +1,92 @@ +"""从FC2PPVDB抓取数据""" + +# BUG: This crawler doesn't work, seemed due to cloudflare + +from typing import List + + +from javsp.crawlers.exceptions import * +from javsp.lib import strftime_to_minutes +from javsp.datatype import MovieInfo +from javsp.network.utils import resolve_site_fallback +from javsp.network.client import get_client +from javsp.crawlers.interface import Crawler +from javsp.config import CrawlerID +from lxml import html + + +class Fc2PpvDbCrawler(Crawler): + id = CrawlerID.fc2ppvdb + + @classmethod + async def create(cls): + self = cls() + url = await resolve_site_fallback(self.id, 'https://fc2ppvdb.com') + self.base_url = str(url) + self.client = get_client(url) + return self + + async def crawl_and_fill(self, movie: MovieInfo) -> None: + """解析指定番号的影片数据""" + + def get_list_first(list: List): + return list[0] if list and len(list) > 0 else None + + # 去除番号中的'FC2'字样 + id_uc = movie.dvdid.upper() + if not id_uc.startswith('FC2-'): + raise ValueError('Invalid FC2 number: ' + movie.dvdid) + fc2_id = id_uc.replace('FC2-', '') + # 抓取网页 + url = f'{self.base_url}/articles/{fc2_id}' + resp = await self.client.get(url) + tree = html.fromstring(resp.content) + # html = get_html(url) + container = tree.xpath("//div[@class='container lg:px-5 px-2 py-12 mx-auto']/div[1]") + if len(container) > 0: + container = container[0] + else: + raise MovieNotFoundError(__name__, movie.dvdid) + + title = container.xpath("//h2/a/text()") + thumb_pic = container.xpath(f"//img[@alt='{fc2_id}']/@src") + duration_str = container.xpath("//div[starts-with(text(),'収録時間:')]/span/text()") + actress = container.xpath("//div[starts-with(text(),'女優:')]/span/a/text()") + genre = container.xpath("//div[starts-with(text(),'タグ:')]/span/a/text()") + publish_date = container.xpath("//div[starts-with(text(),'販売日:')]/span/text()") + publisher = container.xpath("//div[starts-with(text(),'販売者:')]/span/a/text()") + uncensored_str = container.xpath("//div[starts-with(text(),'モザイク:')]/span/text()") + uncensored_str_f = get_list_first(uncensored_str); + uncensored = True if uncensored_str_f == '無' else False if uncensored_str_f == '有' else None + preview_pics = None + preview_video = container.xpath("//a[starts-with(text(),'サンプル動画')]/@href") + + movie.dvdid = id_uc + movie.url = url + movie.title = get_list_first(title) + 
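+ # 注意:get_list_first在xpath匹配结果为空时返回None,因此下面通过它取值的字段可能为空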
movie.genre = genre + movie.actress = actress + movie.duration = str(strftime_to_minutes(get_list_first(duration_str))) + movie.publish_date = get_list_first(publish_date) + movie.publisher = get_list_first(publisher) + movie.uncensored = uncensored + movie.preview_pics = preview_pics + movie.preview_video = get_list_first(preview_video) + + # FC2的封面是220x220的,和正常封面尺寸、比例都差太多。如果有预览图片,则使用第一张预览图作为封面 + if movie.preview_pics: + movie.cover = preview_pics[0] + else: + movie.cover = get_list_first(thumb_pic) + + +if __name__ == "__main__": + + async def test_main(): + crawler = await Fc2PpvDbCrawler.create() + movie = MovieInfo('FC2-4497837') + await crawler.crawl_and_fill(movie) + print(movie) + + import asyncio + asyncio.run(test_main()) diff --git a/javsp/crawlers/sites/gyutto.py b/javsp/crawlers/sites/gyutto.py new file mode 100644 index 000000000..b30200284 --- /dev/null +++ b/javsp/crawlers/sites/gyutto.py @@ -0,0 +1,106 @@ +"""从https://gyutto.com/官网抓取数据""" +import logging +import time + +from javsp.crawlers.exceptions import MovieNotFoundError +from javsp.datatype import MovieInfo +from javsp.network.utils import resolve_site_fallback +from javsp.network.client import get_client +from javsp.crawlers.interface import Crawler +from javsp.config import CrawlerID +from lxml import html +from lxml.html import HtmlElement + +logger = logging.getLogger(__name__) + +def get_movie_title(tree: HtmlElement) -> str: + container = tree.xpath("//h1") + if len(container) > 0: + container = container[0] + title = container.text + + return title + +def get_movie_img(tree: HtmlElement, index = 1) -> list[str]: + images = [] + container = tree.xpath("//a[@class='highslide']/img") + if len(container) > 0: + if index == 0: + return container[0].get('src') + + for row in container: + images.append(row.get('src')) + + return images + +class GyuttoCrawler(Crawler): + id = CrawlerID.gyutto + + @classmethod + async def create(cls): + self = cls() + url = await resolve_site_fallback(self.id, 'http://gyutto.com') + self.base_url = str(url) + self.client = get_client(url) + return self + + async def crawl_and_fill(self, movie: MovieInfo) -> None: + """解析指定番号的影片数据""" + # 去除番号中的'gyutto'字样 + id_uc = movie.dvdid.upper() + if not id_uc.startswith('GYUTTO-'): + raise ValueError('Invalid gyutto number: ' + movie.dvdid) + gyutto_id = id_uc.replace('GYUTTO-', '') + # 抓取网页 + url = f'{self.base_url}/i/item{gyutto_id}?select_uaflag=1' + r = await self.client.get(url) + if r.status_code == 404: + raise MovieNotFoundError(__name__, movie.dvdid) + tree = html.fromstring(r.text) + container = tree.xpath("//dl[@class='BasicInfo clearfix']") + + producer = None + genre = None + date = None + publish_date = None + + for row in container: + key = row.xpath(".//dt/text()") + if key[0] == "サークル": + producer = ''.join(row.xpath(".//dd/a/text()")) + elif key[0] == "ジャンル": + genre = row.xpath(".//dd/a/text()") + elif key[0] == "配信開始日": + date = row.xpath(".//dd/text()") + date_str = ''.join(date) + date_time = time.strptime(date_str, "%Y年%m月%d日") + publish_date = time.strftime("%Y-%m-%d", date_time) + + plot = tree.xpath("//div[@class='unit_DetailLead']/p/text()")[0] + + movie.title = get_movie_title(tree) + movie.cover = get_movie_img(tree, 0) + movie.preview_pics = get_movie_img(tree) + movie.dvdid = id_uc + movie.url = url + movie.producer = producer + # movie.actress = actress + # movie.duration = duration + movie.publish_date = publish_date + movie.genre = genre + movie.plot = plot + + +if __name__ == "__main__": + + async def 
test_main(): + crawler = await GyuttoCrawler.create() + movie = MovieInfo('gyutto-266923') + try: + await crawler.crawl_and_fill(movie) + print(movie) + except Exception as e: + print(repr(e)) + + import asyncio + asyncio.run(test_main()) diff --git a/javsp/crawlers/sites/jav321.py b/javsp/crawlers/sites/jav321.py new file mode 100644 index 000000000..6a20a98ec --- /dev/null +++ b/javsp/crawlers/sites/jav321.py @@ -0,0 +1,117 @@ +"""从jav321抓取数据""" +import re +import logging + + +from javsp.crawlers.exceptions import MovieNotFoundError +from javsp.datatype import MovieInfo +from javsp.network.utils import resolve_site_fallback +from javsp.network.client import get_client +from javsp.crawlers.interface import Crawler +from javsp.config import CrawlerID +from lxml import html + + +logger = logging.getLogger(__name__) + +class Jav321Crawler(Crawler): + id = CrawlerID.jav321 + + @classmethod + async def create(cls): + self = cls() + url = await resolve_site_fallback(self.id, 'https://www.jav321.com') + self.base_url = str(url) + self.client = get_client(url) + return self + + async def crawl_and_fill(self, movie: MovieInfo) -> None: + + """解析指定番号的影片数据""" + resp = await self.client.post(f'{self.base_url}/search', data={'sn': movie.dvdid}) + tree = html.fromstring(resp.text) + page_url = tree.xpath("//ul[@class='dropdown-menu']/li/a/@href")[0] + #TODO: 注意cid是dmm的概念。如果影片来自MGSTAGE,这里的cid很可能是jav321自己添加的,例如 345SIMM-542 + cid = page_url.split('/')[-1] # /video/ipx00177 + # 如果从URL匹配到的cid是'search',说明还停留在搜索页面,找不到这部影片 + if cid == 'search': + raise MovieNotFoundError(__name__, movie.dvdid) + title = tree.xpath("//div[@class='panel-heading']/h3/text()")[0] + info = tree.xpath("//div[@class='col-md-9']")[0] + # jav321的不同信息字段间没有明显分隔,只能通过url来匹配目标标签 + company_tags = info.xpath("a[contains(@href,'/company/')]/text()") + if company_tags: + movie.producer = company_tags[0] + # actress, actress_pics + # jav321现在连女优信息都没有了,首页通过女优栏跳转过去也全是空白 + actress, actress_pics = [], {} + actress_tags = tree.xpath("//div[@class='thumbnail']/a[contains(@href,'/star/')]/img") + for tag in actress_tags: + name = tag.tail.strip() + pic_url = tag.get('src') + actress.append(name) + # jav321的女优头像完全是应付了事:即使女优实际没有头像,也会有一个看起来像模像样的url, + # 因而无法通过url判断女优头像图片是否有效。有其他选择时最好不要使用jav321的女优头像数据 + actress_pics[name] = pic_url + # genre, genre_id + genre_tags = info.xpath("a[contains(@href,'/genre/')]") + genre, genre_id = [], [] + for tag in genre_tags: + genre.append(tag.text) + genre_id.append(tag.get('href').split('/')[-2]) # genre/4025/1 + dvdid = info.xpath("b[text()='品番']")[0].tail.replace(': ', '').upper() + publish_date = info.xpath("b[text()='配信開始日']")[0].tail.replace(': ', '') + duration_str = info.xpath("b[text()='収録時間']")[0].tail + match = re.search(r'\d+', duration_str) + if match: + movie.duration = match.group(0) + # 仅部分影片有评分且评分只能粗略到星级而没有分数,要通过星级的图片来判断,如'/img/35.gif'表示3.5星 + score_tag = info.xpath("//b[text()='平均評価']/following-sibling::img/@data-original") + if score_tag: + score = int(score_tag[0][5:7])/5 # /10*2 + movie.score = str(score) + serial_tag = info.xpath("a[contains(@href,'/series/')]/text()") + if serial_tag: + movie.serial = serial_tag[0] + preview_video_tag = info.xpath("//video/source/@src") + if preview_video_tag: + movie.preview_video = preview_video_tag[0] + plot_tag = info.xpath("//div[@class='panel-body']/div[@class='row']/div[@class='col-md-12']/text()") + if plot_tag: + movie.plot = plot_tag[0] + preview_pics = tree.xpath("//div[@class='col-xs-12 col-md-12']/p/a/img[@class='img-responsive']/@src") + if 
len(preview_pics) == 0: + # 尝试搜索另一种布局下的封面,需要使用onerror过滤掉明明没有封面时网站往里面塞的默认URL + preview_pics = tree.xpath("//div/div/div[@class='col-md-3']/img[@onerror and @class='img-responsive']/@src") + # 有的图片链接里有多个//,网站质量堪忧…… + preview_pics = [i[:8] + i[8:].replace('//', '/') for i in preview_pics] + # 磁力和ed2k链接是依赖js脚本加载的,无法通过静态网页来解析 + + movie.url = page_url + movie.cid = cid + movie.dvdid = dvdid + movie.title = title + movie.actress = actress + movie.actress_pics = actress_pics + movie.genre = genre + movie.genre_id = genre_id + movie.publish_date = publish_date + # preview_pics的第一张图始终是封面,剩下的才是预览图 + if len(preview_pics) > 0: + movie.cover = preview_pics[0] + movie.preview_pics = preview_pics[1:] + + +if __name__ == "__main__": + + async def test_main(): + crawler = await Jav321Crawler.create() + movie = MovieInfo('SCUTE-1177') + try: + await crawler.crawl_and_fill(movie) + print(movie) + except Exception as e: + print(repr(e)) + + import asyncio + asyncio.run(test_main()) diff --git a/javsp/crawlers/sites/javbus.py b/javsp/crawlers/sites/javbus.py new file mode 100644 index 000000000..b3efaa8dd --- /dev/null +++ b/javsp/crawlers/sites/javbus.py @@ -0,0 +1,129 @@ +"""从JavBus抓取数据""" +import logging + + +from javsp.crawlers.exceptions import * +from javsp.func import * +from javsp.config import CrawlerID +from javsp.datatype import MovieInfo, GenreMap + +from javsp.crawlers.exceptions import MovieNotFoundError +from javsp.network.utils import resolve_site_fallback +from javsp.network.client import get_client + +from javsp.crawlers.interface import Crawler +from lxml import html + + +logger = logging.getLogger(__name__) + +class JavbusCrawler(Crawler): + id = CrawlerID.javbus + genre_map: GenreMap + + @classmethod + async def create(cls): + self = cls() + url = await resolve_site_fallback(self.id, 'https://www.javbus.com') + self.base_url = str(url) + self.client = get_client(url) + self.client.cookies = {'age': 'verified', 'dv': '1'} + self.genre_map = GenreMap('data/genre_javbus.csv') + return self + + async def crawl_and_fill(self, movie: MovieInfo) -> None: + + """从网页抓取并解析指定番号的数据 + Args: + movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内 + """ + url = f'{self.base_url}/{movie.dvdid}' + resp = await self.client.get(url) + + tree = html.fromstring(resp.content) + # 疑似JavBus检测到类似爬虫的行为时会要求登录,不过发现目前不需要登录也可以从重定向前的网页中提取信息 + # 引入登录验证后状态码不再准确,因此还要额外通过检测标题来确认是否发生了404 + page_title = tree.xpath('/html/head/title/text()') + if page_title and page_title[0].startswith('404 Page Not Found!'): + raise MovieNotFoundError(__name__, movie.dvdid) + + container = tree.xpath("//div[@class='container']")[0] + title = container.xpath("h3/text()")[0] + cover = container.xpath("//a[@class='bigImage']/img/@src")[0] + preview_pics = container.xpath("//div[@id='sample-waterfall']/a/@href") + info = container.xpath("//div[@class='col-md-3 info']")[0] + dvdid = info.xpath("p/span[text()='識別碼:']")[0].getnext().text + publish_date = info.xpath("p/span[text()='發行日期:']")[0].tail.strip() + duration = info.xpath("p/span[text()='長度:']")[0].tail.replace('分鐘', '').strip() + director_tag = info.xpath("p/span[text()='導演:']") + if director_tag: # xpath没有匹配时将得到空列表 + movie.director = director_tag[0].getnext().text.strip() + producer_tag = info.xpath("p/span[text()='製作商:']") + if producer_tag: + text = producer_tag[0].getnext().text + if text: + movie.producer = text.strip() + publisher_tag = info.xpath("p/span[text()='發行商:']") + if publisher_tag: + movie.publisher = publisher_tag[0].getnext().text.strip() + serial_tag = 
info.xpath("p/span[text()='系列:']") + if serial_tag: + movie.serial = serial_tag[0].getnext().text + # genre, genre_id + genre_tags = info.xpath("//span[@class='genre']/label/a") + genre, genre_id = [], [] + for tag in genre_tags: + tag_url = tag.get('href') + pre_id = tag_url.split('/')[-1] + genre.append(tag.text) + if 'uncensored' in tag_url: + movie.uncensored = True + genre_id.append('uncensored-' + pre_id) + else: + movie.uncensored = False + genre_id.append(pre_id) + # JavBus的磁力链接是依赖js脚本加载的,无法通过静态网页来解析 + # actress, actress_pics + actress, actress_pics = [], {} + actress_tags = tree.xpath("//a[@class='avatar-box']/div/img") + for tag in actress_tags: + name = tag.get('title') + pic_url = tag.get('src') + actress.append(name) + if not pic_url.endswith('nowprinting.gif'): # 略过默认的头像 + actress_pics[name] = pic_url + # 整理数据并更新movie的相应属性 + movie.url = f'{self.base_url}/{movie.dvdid}' + movie.dvdid = dvdid + movie.title = title.replace(dvdid, '').strip() + movie.cover = cover + movie.preview_pics = preview_pics + if publish_date != '0000-00-00': # 丢弃无效的发布日期 + movie.publish_date = publish_date + movie.duration = duration if int(duration) else None + movie.genre = genre + movie.genre_id = genre_id + movie.actress = actress + movie.actress_pics = actress_pics + + async def crawl_and_fill_cleaned(self, movie: MovieInfo): + """解析指定番号的影片数据并进行清洗""" + await self.crawl_and_fill(movie) + movie.genre_norm = self.genre_map.map(movie.genre_id) + movie.genre_id = None # 没有别的地方需要再用到,清空genre id(暗示已经完成转换) + + +if __name__ == "__main__": + + async def test_main(): + crawler = await JavbusCrawler.create() + print(crawler.client.headers) + movie = MovieInfo('NANP-030') + # try: + await crawler.crawl_and_fill_cleaned(movie) + print(movie) + # except Exception as e: + # print(repr(e)) + + import asyncio + asyncio.run(test_main()) diff --git a/javsp/crawlers/sites/javdb.py b/javsp/crawlers/sites/javdb.py new file mode 100644 index 000000000..ab23e18bd --- /dev/null +++ b/javsp/crawlers/sites/javdb.py @@ -0,0 +1,350 @@ +"""从JavDB抓取数据""" +import os +import re +import logging + +from httpx import Cookies + +from javsp.func import * +from javsp.avid import guess_av_type +from javsp.config import CrawlerID +from javsp.datatype import MovieInfo, GenreMap +from javsp.chromium import get_browsers_cookies + +from javsp.crawlers.exceptions import CredentialError, MovieDuplicateError, MovieNotFoundError, SiteBlocked, SitePermissionError, WebsiteError +from javsp.network.utils import resolve_site_fallback +from javsp.network.client import get_client + +from javsp.crawlers.interface import Crawler +from lxml import html + +logger = logging.getLogger(__name__) + +class JavDbCrawler(Crawler): + id = CrawlerID.javdb + genre_map: GenreMap + cookies_pool: list[Cookies] + + @classmethod + async def create(cls): + self = cls() + url = await resolve_site_fallback(self.id, 'https://www.javdb.com') + self.base_url = str(url) + self.client = get_client(url) + self.client.headers['Accept-Language'] = 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en-US;q=0.7,en;q=0.6,ja;q=0.5' + self.genre_map = GenreMap('data/genre_javdb.csv') + self.cookies_pool = [] + return self + + async def get_html_wrapper(self, url: str): + """包装外发的request请求并负责转换为可xpath的html,同时处理Cookies无效等问题""" + r = await self.client.get(url) + if r.status_code == 200: + # 发生重定向可能仅仅是域名重定向,因此还要检查url以判断是否被跳转到了登录页 + if r.history and '/login' in str(r.url): + # 仅在需要时去读取Cookies + if len(self.cookies_pool) == 0: + try: + self.cookies_pool = get_browsers_cookies() + except (PermissionError, OSError) as e: 
+ logger.warning(f"无法从浏览器Cookies文件获取JavDB的登录凭据({e}),可能是安全软件在保护浏览器Cookies文件", exc_info=True) + cookies_pool = [] + except Exception as e: + logger.warning(f"获取JavDB的登录凭据时出错({e}),你可能使用的是国内定制版等非官方Chrome系浏览器", exc_info=True) + cookies_pool = [] + if len(self.cookies_pool) > 0: + item = self.cookies_pool.pop() + # 更换Cookies时需要创建新的request实例,否则cloudscraper会保留它内部第一次发起网络访问时获得的Cookies + self.client.cookies = item['cookies'] + cookies_source = (item['profile'], item['site']) + logger.debug(f'未携带有效Cookies而发生重定向,尝试更换Cookies为: {cookies_source}') + return self.get_html_wrapper(url) + else: + raise CredentialError('JavDB: 所有浏览器Cookies均已过期') + elif r.history and 'pay' in str(r.url).split('/')[-1]: + raise SitePermissionError(f"JavDB: 此资源被限制为仅VIP可见: '{r.history[0].url}'") + else: + + return html.fromstring(r.text) + elif r.status_code in (403, 503): + tree = html.fromstring(r.text) + code_tag = tree.xpath("//span[@class='code-label']/span") + error_code = code_tag[0].text if code_tag else None + if error_code: + if error_code == '1020': + block_msg = f'JavDB: {r.status_code} 禁止访问: 站点屏蔽了来自日本地区的IP地址,请使用其他地区的代理服务器' + else: + block_msg = f'JavDB: {r.status_code} 禁止访问: {url} (Error code: {error_code})' + else: + block_msg = f'JavDB: {r.status_code} 禁止访问: {url}' + raise SiteBlocked(block_msg) + else: + raise WebsiteError(f'JavDB: {r.status_code} 非预期状态码: {url}') + + + async def get_user_info(self, site: str, cookies: Cookies): + """获取cookies对应的JavDB用户信息""" + try: + self.client.cookies = cookies + resp = await self.client.get(f'https://{site}/users/profile') + + html_str = resp.text + tree = html.fromstring(html_str) + except Exception as e: + logger.info('JavDB: 获取用户信息时出错') + logger.debug(e, exc_info=1) + return + # 扫描浏览器得到的Cookies对应的临时域名可能会过期,因此需要先判断域名是否仍然指向JavDB的站点 + if 'JavDB' in html_str: + email = tree.xpath("//div[@class='user-profile']/ul/li[1]/span/following-sibling::text()")[0].strip() + username = tree.xpath("//div[@class='user-profile']/ul/li[2]/span/following-sibling::text()")[0].strip() + return email, username + else: + logger.debug('JavDB: 域名已过期: ' + site) + + + async def get_valid_cookies(self): + """扫描浏览器,获取一个可用的Cookies""" + # 经测试,Cookies所发往的域名不需要和登录时的域名保持一致,只要Cookies有效即可在多个域名间使用 + for d in self.cookies_pool: + info = await self.get_user_info(d['site'], d['cookies']) + if info: + return d['cookies'] + else: + logger.debug(f"{d['profile']}, {d['site']}: Cookies无效") + + + async def crawl_and_fill(self, movie: MovieInfo) -> None: + """从网页抓取并解析指定番号的数据 + Args: + movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内 + """ + # JavDB搜索番号时会有多个搜索结果,从中查找匹配番号的那个 + tree = await self.get_html_wrapper(f'{self.base_url}/search?q={movie.dvdid}') + ids = list(map(str.lower, tree.xpath("//div[@class='video-title']/strong/text()"))) + movie_urls = tree.xpath("//a[@class='box']/@href") + match_count = len([i for i in ids if i == movie.dvdid.lower()]) + if match_count == 0: + raise MovieNotFoundError(__name__, movie.dvdid, ids) + elif match_count == 1: + index = ids.index(movie.dvdid.lower()) + new_url = movie_urls[index] + try: + html2 = await self.get_html_wrapper(new_url) + except (SitePermissionError, CredentialError): + # 不开VIP不让看,过分。决定榨出能获得的信息,毕竟有时候只有这里能找到标题和封面 + box = tree.xpath("//a[@class='box']")[index] + movie.url = new_url + movie.title = box.get('title') + movie.cover = box.xpath("div/img/@src")[0] + score_str = box.xpath("div[@class='score']/span/span")[0].tail + score = re.search(r'([\d.]+)分', score_str).group(1) + movie.score = "{:.2f}".format(float(score)*2) + movie.publish_date = 
box.xpath("div[@class='meta']/text()")[0].strip() + return + else: + raise MovieDuplicateError(__name__, movie.dvdid, match_count) + + container = html2.xpath("/html/body/section/div/div[@class='video-detail']")[0] + info = container.xpath("//nav[@class='panel movie-panel-info']")[0] + title = container.xpath("h2/strong[@class='current-title']/text()")[0] + show_orig_title = container.xpath("//a[contains(@class, 'meta-link') and not(contains(@style, 'display: none'))]") + if show_orig_title: + movie.ori_title = container.xpath("h2/span[@class='origin-title']/text()")[0] + cover = container.xpath("//img[@class='video-cover']/@src")[0] + preview_pics = container.xpath("//a[@class='tile-item'][@data-fancybox='gallery']/@href") + preview_video_tag = container.xpath("//video[@id='preview-video']/source/@src") + if preview_video_tag: + preview_video = preview_video_tag[0] + if preview_video.startswith('//'): + preview_video = 'https:' + preview_video + movie.preview_video = preview_video + dvdid = info.xpath("div/span")[0].text_content() + publish_date = info.xpath("div/strong[text()='日期:']")[0].getnext().text + duration = info.xpath("div/strong[text()='時長:']")[0].getnext().text.replace('分鍾', '').strip() + director_tag = info.xpath("div/strong[text()='導演:']") + if director_tag: + movie.director = director_tag[0].getnext().text_content().strip() + av_type = guess_av_type(movie.dvdid) + if av_type != 'fc2': + producer_tag = info.xpath("div/strong[text()='片商:']") + else: + producer_tag = info.xpath("div/strong[text()='賣家:']") + if producer_tag: + movie.producer = producer_tag[0].getnext().text_content().strip() + publisher_tag = info.xpath("div/strong[text()='發行:']") + if publisher_tag: + movie.publisher = publisher_tag[0].getnext().text_content().strip() + serial_tag = info.xpath("div/strong[text()='系列:']") + if serial_tag: + movie.serial = serial_tag[0].getnext().text_content().strip() + score_tag = info.xpath("//span[@class='score-stars']") + if score_tag: + score_str = score_tag[0].tail + score = re.search(r'([\d.]+)分', score_str).group(1) + movie.score = "{:.2f}".format(float(score)*2) + genre_tags = info.xpath("//strong[text()='類別:']/../span/a") + genre, genre_id = [], [] + for tag in genre_tags: + pre_id = tag.get('href').split('/')[-1] + genre.append(tag.text) + genre_id.append(pre_id) + # 判定影片有码/无码 + subsite = pre_id.split('?')[0] + movie.uncensored = {'uncensored': True, 'tags':False}.get(subsite) + # JavDB目前同时提供男女优信息,根据用来标识性别的符号筛选出女优 + actors_tag = info.xpath("//strong[text()='演員:']/../span")[0] + all_actors = actors_tag.xpath("a/text()") + genders = actors_tag.xpath("strong/text()") + actress = [i for i in all_actors if genders[all_actors.index(i)] == '♀'] + magnet = container.xpath("//div[@class='magnet-name column is-four-fifths']/a/@href") + + movie.dvdid = dvdid + movie.url = self.base_url + movie.title = title.replace(dvdid, '').strip() + movie.cover = cover + movie.preview_pics = preview_pics + movie.publish_date = publish_date + movie.duration = duration + movie.genre = genre + movie.genre_id = genre_id + movie.actress = actress + movie.magnet = [i.replace('[javdb.com]','') for i in magnet] + + + async def crawl_and_fill_cleaned(self, movie: MovieInfo): + """解析指定番号的影片数据并进行清洗""" + try: + await self.crawl_and_fill(movie) + # 检查封面URL是否真的存在对应图片 + if movie.cover is not None: + r = await self.client.head(movie.cover) + if r.status_code != 200: + movie.cover = None + except SiteBlocked: + raise + logger.error('JavDB: 可能触发了反爬虫机制,请稍后再试') + if movie.genre_id and (not 
movie.genre_id[0].startswith('fc2?')): + movie.genre_norm = self.genre_map.map(movie.genre_id) + movie.genre_id = None # 没有别的地方需要再用到,清空genre id(表明已经完成转换) + + + async def collect_actress_alias(self, type=0, use_original=True): + """ + 收集女优的别名 + type: 0-有码, 1-无码, 2-欧美 + use_original: 是否使用原名而非译名,True-田中レモン,False-田中檸檬 + """ + import json + import time + import random + + actressAliasMap = {} + + actressAliasFilePath = "data/actress_alias.json" + # 检查文件是否存在 + if not os.path.exists(actressAliasFilePath): + # 如果文件不存在,创建文件并写入空字典 + with open(actressAliasFilePath, "w", encoding="utf-8") as file: + json.dump({}, file) + + typeList = ["censored", "uncensored", "western"] + page_url = f"{self.base_url}/actors/{typeList[type]}" + while True: + try: + tree = await self.get_html_wrapper(page_url) + actors = tree.xpath("//div[@class='box actor-box']/a") + + count = 0 + for actor in actors: + count += 1 + actor_name = actor.xpath("strong/text()")[0].strip() + actor_url = actor.xpath("@href")[0] + # actor_url = f"https://javdb.com{actor_url}" # 构造演员主页的完整URL + + # 进入演员主页,获取更多信息 + actor_html = await self.get_html_wrapper(actor_url) + # 解析演员所有名字信息 + names_span = actor_html.xpath("//span[@class='actor-section-name']")[0] + aliases_span_list = actor_html.xpath("//span[@class='section-meta']") + aliases_span = aliases_span_list[0] + + names_list = [name.strip() for name in names_span.text.split(",")] + if len(aliases_span_list) > 1: + aliases_list = [ + alias.strip() for alias in aliases_span.text.split(",") + ] + else: + aliases_list = [] + + # 将信息添加到actressAliasMap中 + actressAliasMap[names_list[-1 if use_original else 0]] = ( + names_list + aliases_list + ) + print( + f"{count} --- {names_list[-1 if use_original else 0]}: {names_list + aliases_list}" + ) + + if count == 10: + # 将数据写回文件 + with open(actressAliasFilePath, "r", encoding="utf-8") as file: + existing_data = json.load(file) + + # 合并现有数据和新爬取的数据 + existing_data.update(actressAliasMap) + + # 将合并后的数据写回文件 + with open(actressAliasFilePath, "w", encoding="utf-8") as file: + json.dump(existing_data, file, ensure_ascii=False, indent=2) + + actressAliasMap = {} # 重置actressAliasMap + + print( + f"已爬取 {count} 个女优,数据已更新并写回文件:", + actressAliasFilePath, + ) + + # 重置计数器 + count = 0 + + time.sleep(max(1, 10 * random.random())) # 随机等待 1-10 秒 + + # 判断是否有下一页按钮 + next_page_link = tree.xpath( + "//a[@rel='next' and @class='pagination-next']/@href" + ) + if not next_page_link: + break # 没有下一页,结束循环 + else: + next_page_url = f"{next_page_link[0]}" + page_url = next_page_url + + except SiteBlocked: + raise + + with open(actressAliasFilePath, "r", encoding="utf-8") as file: + existing_data = json.load(file) + + # 合并现有数据和新爬取的数据 + existing_data.update(actressAliasMap) + + # 将合并后的数据写回文件 + with open(actressAliasFilePath, "w", encoding="utf-8") as file: + json.dump(existing_data, file, ensure_ascii=False, indent=2) + + print(f"已爬取 {count} 个女优,数据已更新并写回文件:", actressAliasFilePath) + + +if __name__ == "__main__": + + async def test_main(): + crawler = await JavDbCrawler.create() + movie = MovieInfo('FC2-2735981') + try: + await crawler.crawl_and_fill_cleaned(movie) + print(movie) + except Exception as e: + print(repr(e)) + + import asyncio + asyncio.run(test_main()) diff --git a/javsp/crawlers/sites/javlib.py b/javsp/crawlers/sites/javlib.py new file mode 100644 index 000000000..c71a5f336 --- /dev/null +++ b/javsp/crawlers/sites/javlib.py @@ -0,0 +1,115 @@ +"""从JavLibrary抓取数据""" +import logging +from urllib.parse import urlsplit + +from httpx._transports import base + +from 
javsp.crawlers.exceptions import MovieDuplicateError, MovieNotFoundError +from javsp.datatype import MovieInfo +from javsp.network.utils import resolve_site_fallback +from javsp.network.client import get_client +from javsp.crawlers.interface import Crawler +from javsp.config import CrawlerID +from lxml import html + +logger = logging.getLogger(__name__) + +class JavLibCrawler(Crawler): + id = CrawlerID.jav321 + + @classmethod + async def create(cls): + self = cls() + url = await resolve_site_fallback(self.id, 'https://www.javlibrary.com') + self.base_url = str(url) + self.client = get_client(url) + return self + + # TODO: 发现JavLibrary支持使用cid搜索,会直接跳转到对应的影片页面,也许可以利用这个功能来做cid到dvdid的转换 + async def crawl_and_fill(self, movie: MovieInfo) -> None: + """解析指定番号的影片数据""" + url = new_url = f'{self.base_url}/cn/vl_searchbyid.php?keyword={movie.dvdid}' + resp = await self.client.get(url) + tree = html.fromstring(resp.text) + if resp.history and urlsplit(str(resp.url)).netloc == urlsplit(self.base_url).netloc: + # 出现301重定向通常且新老地址netloc相同时,说明搜索到了影片且只有一个结果 + new_url = resp.url + else: # 如果有多个搜索结果则不会自动跳转,此时需要程序介入选择搜索结果 + video_tags = tree.xpath("//div[@class='video'][@id]/a") + # 通常第一部影片就是我们要找的,但是以免万一还是遍历所有搜索结果 + pre_choose = [] + for tag in video_tags: + tag_dvdid = tag.xpath("div[@class='id']/text()")[0] + if tag_dvdid.upper() == movie.dvdid.upper(): + pre_choose.append(tag) + pre_choose_urls = [i.get('href') for i in pre_choose] + match_count = len(pre_choose) + if match_count == 0: + raise MovieNotFoundError(__name__, movie.dvdid) + elif match_count == 1: + new_url = pre_choose_urls[0] + elif match_count == 2: + no_blueray = [] + for tag in pre_choose: + if 'ブルーレイディスク' not in tag.get('title'): # Blu-ray Disc + no_blueray.append(tag) + no_blueray_count = len(no_blueray) + if no_blueray_count == 1: + new_url = no_blueray[0].get('href') + logger.debug(f"'{movie.dvdid}': 存在{match_count}个同番号搜索结果,已自动选择封面比例正确的一个: {new_url}") + else: + # 两个结果中没有谁是蓝光影片,说明影片番号重复了 + raise MovieDuplicateError(__name__, movie.dvdid, match_count, pre_choose_urls) + else: + # 存在不同影片但是番号相同的情况,如MIDV-010 + raise MovieDuplicateError(__name__, movie.dvdid, match_count, pre_choose_urls) + # 重新抓取网页 + resp = await self.client.get(new_url) + tree = html.fromstring(resp.text) + container = tree.xpath("/html/body/div/div[@id='rightcolumn']")[0] + title_tag = container.xpath("div/h3/a/text()") + title = title_tag[0] + cover = container.xpath("//img[@id='video_jacket_img']/@src")[0] + info = container.xpath("//div[@id='video_info']")[0] + dvdid = info.xpath("div[@id='video_id']//td[@class='text']/text()")[0] + publish_date = info.xpath("div[@id='video_date']//td[@class='text']/text()")[0] + duration = info.xpath("div[@id='video_length']//span[@class='text']/text()")[0] + director_tag = info.xpath("//span[@class='director']/a/text()") + if director_tag: + movie.director = director_tag[0] + producer = info.xpath("//span[@class='maker']/a/text()")[0] + publisher_tag = info.xpath("//span[@class='label']/a/text()") + if publisher_tag: + movie.publisher = publisher_tag[0] + score_tag = info.xpath("//span[@class='score']/text()") + if score_tag: + movie.score = score_tag[0].strip('()') + genre = info.xpath("//span[@class='genre']/a/text()") + actress = info.xpath("//span[@class='star']/a/text()") + + movie.dvdid = dvdid + movie.url = new_url + movie.title = title.replace(dvdid, '').strip() + if cover.startswith('//'): # 补全URL中缺少的协议段 + cover = 'https:' + cover + movie.cover = cover + movie.publish_date = publish_date + movie.duration = duration + 
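+ # JavLibrary页面上的maker对应制作商;label(发行商)、director等可选字段已在上面按需填充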
movie.producer = producer + movie.genre = genre + movie.actress = actress + + +if __name__ == "__main__": + + async def test_main(): + crawler = await JavLibCrawler.create() + movie = MovieInfo('IPX-177') + try: + await crawler.crawl_and_fill(movie) + print(movie) + except Exception as e: + print(repr(e)) + + import asyncio + asyncio.run(test_main()) diff --git a/javsp/crawlers/sites/javmenu.py b/javsp/crawlers/sites/javmenu.py new file mode 100644 index 000000000..6553d86a1 --- /dev/null +++ b/javsp/crawlers/sites/javmenu.py @@ -0,0 +1,100 @@ +"""从JavMenu抓取数据""" +import logging + +from javsp.crawlers.exceptions import MovieNotFoundError +from javsp.datatype import MovieInfo +from javsp.network.utils import resolve_site_fallback +from javsp.network.client import get_client +from javsp.crawlers.interface import Crawler +from javsp.config import CrawlerID +from lxml import html + +logger = logging.getLogger(__name__) + +class JavMenuCrawler(Crawler): + id = CrawlerID.javmenu + + @classmethod + async def create(cls): + self = cls() + url = await resolve_site_fallback(self.id, 'https://www.javmenu.com') + self.base_url = str(url) + self.client = get_client(url) + return self + + async def crawl_and_fill(self, movie: MovieInfo) -> None: + """从网页抓取并解析指定番号的数据 + Args: + movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内 + """ + # JavMenu网页做得很不走心,将就了 + url = f'{self.base_url}zh/{movie.dvdid}' + r = await self.client.get(url) + if r.history: + # 被重定向到主页说明找不到影片资源 + raise MovieNotFoundError(__name__, movie.dvdid) + + tree = html.fromstring(r.text) + container = tree.xpath("//div[@class='col-md-9 px-0']")[0] + title = container.xpath("div[@class='col-12 mb-3']/h1/strong/text()")[0] + # 竟然还在标题里插广告,真的疯了。要不是我已经写了抓取器,才懒得维护这个破站 + title = title.replace(' | JAV目錄大全 | 每日更新', '') + title = title.replace(' 免費在線看', '').replace(' 免費AV在線看', '') + cover_tag = container.xpath("//div[@class='single-video']") + if len(cover_tag) > 0: + video_tag = cover_tag[0].find('video') + # URL首尾竟然也有空格…… + movie.cover = video_tag.get('data-poster').strip() + # 预览影片改为blob了,无法获取 + # movie.preview_video = video_tag.find('source').get('src').strip() + else: + cover_img_tag = container.xpath("//img[@class='lazy rounded']/@data-src") + if cover_img_tag: + movie.cover = cover_img_tag[0].strip() + info = container.xpath("//div[@class='card-body']")[0] + publish_date = info.xpath("div/span[contains(text(), '日期:')]")[0].getnext().text + duration = info.xpath("div/span[contains(text(), '时长:')]")[0].getnext().text.replace('分钟', '') + producer = info.xpath("div/span[contains(text(), '製作:')]/following-sibling::a/span/text()") + if producer: + movie.producer = producer[0] + genre_tags = info.xpath("//a[@class='genre']") + genre, genre_id = [], [] + for tag in genre_tags: + items = tag.get('href').split('/') + pre_id = items[-3] + '/' + items[-1] + genre.append(tag.text.strip()) + genre_id.append(pre_id) + # genre的链接中含有censored字段,但是无法用来判断影片是否有码,因为完全不可靠…… + actress = info.xpath("div/span[contains(text(), '女优:')]/following-sibling::*/a/text()") or None + magnet_table = container.xpath("//table[contains(@class, 'magnet-table')]/tbody") + if magnet_table: + magnet_links = magnet_table[0].xpath("tr/td/a/@href") + # 它的FC2数据是从JavDB抓的,JavDB更换图片服务器后它也跟上了,似乎数据更新频率还可以 + movie.magnet = [i.replace('[javdb.com]','') for i in magnet_links] + preview_pics = container.xpath("//a[@data-fancybox='gallery']/@href") + + if (not movie.cover) and preview_pics: + movie.cover = preview_pics[0] + movie.url = url + movie.title = title.replace(movie.dvdid, '').strip() + 
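+ # preview_pics取自页面的fancybox图集;若此前未能取得封面,上面已用第一张预览图代替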
movie.preview_pics = preview_pics + movie.publish_date = publish_date + movie.duration = duration + movie.genre = genre + movie.genre_id = genre_id + movie.actress = actress + + +if __name__ == "__main__": + + async def test_main(): + crawler = await JavMenuCrawler.create() + movie = MovieInfo('FC2-718323') + # try: + await crawler.crawl_and_fill(movie) + print(movie) + # except Exception as e: + # print(repr(e)) + + import asyncio + asyncio.run(test_main()) diff --git a/javsp/crawlers/sites/mgstage.py b/javsp/crawlers/sites/mgstage.py new file mode 100644 index 000000000..bd9d76840 --- /dev/null +++ b/javsp/crawlers/sites/mgstage.py @@ -0,0 +1,127 @@ +"""从蚊香社-mgstage抓取数据""" +import re +import logging + + +from javsp.crawlers.exceptions import MovieNotFoundError, SiteBlocked +from javsp.datatype import MovieInfo +from javsp.network.utils import resolve_site_fallback +from javsp.network.client import get_client +from javsp.crawlers.interface import Crawler +from javsp.config import Cfg, CrawlerID +from lxml import html + + +logger = logging.getLogger(__name__) + +class MgstageCrawler(Crawler): + id = CrawlerID.mgstage + + @classmethod + async def create(cls): + self = cls() + url = await resolve_site_fallback(self.id, 'https://www.mgstage.com') + self.base_url = str(url) + self.client = get_client(url) + # 初始化Request实例(要求携带已通过R18认证的cookies,否则会被重定向到认证页面) + self.client.cookies = {'adc': '1'} + return self + + async def crawl_and_fill(self, movie: MovieInfo) -> None: + """解析指定番号的影片数据""" + url = f'{self.base_url}/product/product_detail/{movie.dvdid}/' + resp = await self.client.get(url) + if resp.status_code == 403: + raise SiteBlocked('mgstage不允许从当前IP所在地区访问,请尝试更换为日本地区代理') + # url不存在时会被重定向至主页。history非空时说明发生了重定向 + elif resp.history: + raise MovieNotFoundError(__name__, movie.dvdid) + + tree = html.fromstring(resp.text) + # mgstage的文本中含有大量的空白字符('\n \t'),需要使用strip去除 + title = tree.xpath("//div[@class='common_detail_cover']/h1/text()")[0].strip() + container = tree.xpath("//div[@class='detail_left']")[0] + cover = container.xpath("//a[@id='EnlargeImage']/@href")[0] + # 有链接的女优和仅有文本的女优匹配方法不同,因此分别匹配以后合并列表 + actress_text = container.xpath("//th[text()='出演:']/following-sibling::td/text()") + actress_link = container.xpath("//th[text()='出演:']/following-sibling::td/a/text()") + actress = [i.strip() for i in actress_text + actress_link] + actress = [i for i in actress if i] # 移除空字符串 + producer = container.xpath("//th[text()='メーカー:']/following-sibling::td/a/text()")[0].strip() + duration_str = container.xpath("//th[text()='収録時間:']/following-sibling::td/text()")[0] + match = re.search(r'\d+', duration_str) + if match: + movie.duration = match.group(0) + dvdid = container.xpath("//th[text()='品番:']/following-sibling::td/text()")[0] + date_str = container.xpath("//th[text()='配信開始日:']/following-sibling::td/text()")[0] + publish_date = date_str.replace('/', '-') + serial_tag = container.xpath("//th[text()='シリーズ:']/following-sibling::td/a/text()") + if serial_tag: + movie.serial = serial_tag[0].strip() + # label: 大意是某个系列策划用同样的番号,例如ABS打头的番号label是'ABSOLUTELY PERFECT',暂时用不到 + # label = container.xpath("//th[text()='レーベル:']/following-sibling::td/text()")[0].strip() + genre_tags = container.xpath("//th[text()='ジャンル:']/following-sibling::td/a") + genre = [i.text.strip() for i in genre_tags] + score_str = container.xpath("//td[@class='review']/span")[0].tail.strip() + match = re.search(r'^[\.\d]+', score_str) + if match: + score = float(match.group()) * 2 + movie.score = f'{score:.2f}' + # 
plot可能含有嵌套格式,为了保留plot中的换行关系,手动处理plot中的各个标签 + plots = [] + plot_p_tags = container.xpath("//dl[@id='introduction']/dd/p[not(@class='more')]") + for p in plot_p_tags: + children = p.getchildren() + # 没有children时表明plot不含有格式,此时简单地提取文本就可以 + if not children: + plots.append(p.text_content()) + continue + for child in children: + if child.tag == 'br' and plots[-1] != '\n': + plots.append('\n') + else: + if child.text: + plots.append(child.text) + if child.tail: + plots.append(child.tail) + plot = ''.join(plots).strip() + preview_pics = container.xpath("//a[@class='sample_image']/@href") + + if Cfg().crawler.hardworking: + # 预览视频是点击按钮后再加载的,不在静态网页中 + btn_url = container.xpath("//a[@class='button_sample']/@href")[0] + video_pid = btn_url.split('/')[-1] + req_url = f'{self.base_url}/sampleplayer/sampleRespons.php?pid={video_pid}' + resp = await self.client.get(req_url) + j = resp.json() + video_url = j.get('url') + if video_url: + # /sample/shirouto/siro/3093/SIRO-3093_sample.ism/request?uid=XXX&pid=XXX + preview_video = video_url.split('.ism/')[0] + '.mp4' + movie.preview_video = preview_video + + movie.dvdid = dvdid + movie.url = url + movie.title = title + movie.cover = cover + movie.actress = actress + movie.producer = producer + movie.publish_date = publish_date + movie.genre = genre + movie.plot = plot + movie.preview_pics = preview_pics + movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片 + + +if __name__ == "__main__": + async def test_main(): + crawler = await MgstageCrawler.create() + movie = MovieInfo('ABF-153') + # try: + await crawler.crawl_and_fill(movie) + print(movie) + # except Exception as e: + # print(repr(e)) + + import asyncio + asyncio.run(test_main()) diff --git a/javsp/crawlers/sites/njav.py b/javsp/crawlers/sites/njav.py new file mode 100644 index 000000000..5787397c9 --- /dev/null +++ b/javsp/crawlers/sites/njav.py @@ -0,0 +1,150 @@ +"""从NJAV抓取数据""" +import re +import logging +from typing import List + +from javsp.crawlers.exceptions import MovieNotFoundError +from javsp.datatype import MovieInfo +from javsp.network.utils import resolve_site_fallback +from javsp.network.client import get_client +from javsp.crawlers.interface import Crawler +from javsp.config import CrawlerID +from javsp.lib import strftime_to_minutes +from lxml import html + + +logger = logging.getLogger(__name__) + +def get_list_first(list: List): + return list[0] if list and len(list) > 0 else None + +class NjavCrawler(Crawler): + id = CrawlerID.njav + + @classmethod + async def create(cls): + self = cls() + url = await resolve_site_fallback(self.id, 'https://www.njav.tv/') + self.base_url = str(url) + self.client = get_client(url) + return self + + async def search_video(self, movie: MovieInfo) -> str: + id_uc = movie.dvdid + # 抓取网页 + url = f'{self.base_url}ja/search?keyword={id_uc}' + resp = await self.client.get(url) + tree = html.fromstring(resp.text) + list = tree.xpath("//div[@class='box-item']/div[@class='detail']/a") + video_url = None + for item in list: + search_title = item.xpath("text()")[0] + if id_uc in search_title: + video_url = item.xpath("@href") + break + if id_uc.startswith("FC2-"): + fc2id = id_uc.replace('FC2-', '') + if "FC2" in search_title and fc2id in search_title: + video_url = item.xpath("@href") + break + + return get_list_first(video_url) + async def crawl_and_fill(self, movie: MovieInfo) -> None: + """解析指定番号的影片数据""" + # 抓取网页 + url = await self.search_video(movie) + url = self.base_url + "ja/" + url + if not url: + raise MovieNotFoundError(__name__, movie.dvdid) + resp = 
await self.client.get(url) + tree = html.fromstring(resp.text) + container = tree.xpath("//div[@class='container']/div/div[@class='col']") + if len(container) > 0: + container = container[0] + else: + raise MovieNotFoundError(__name__, movie.dvdid) + + title = container.xpath("//div[@class='d-flex justify-content-between align-items-start']/div/h1/text()")[0] + thumb_pic = container.xpath("//div[@id='player']/@data-poster") + plot = " ".join(container.xpath("//div[@class='description']/p/text()")) + magnet = container.xpath("//div[@class='magnet']/a/@href") + real_id = None + publish_date = None + duration_str = None + uncensored = None + preview_pics = None + preview_video = None + serial = None + publisher = None + producer = None + genre = [] + actress = [] + + for item in container.xpath("//div[@class='detail-item']/div"): + item_title = item.xpath('span/text()')[0] + if "タグ:" in item_title: + genre += item.xpath("span")[1].xpath("a/text()") + elif "ジャンル:" in item_title: + genre += item.xpath("span")[1].xpath("a/text()") + elif "レーベル:" in item_title: + genre += item.xpath("span")[1].xpath("a/text()") + elif "女優:" in item_title: + actress = item.xpath("span")[1].xpath("a/text()") + elif "シリーズ:" in item_title: + serial = get_list_first(item.xpath("span")[1].xpath("a/text()")) + elif "メーカー:" in item_title: + producer = get_list_first(item.xpath("span")[1].xpath("a/text()")) + elif "コード:" in item_title: + real_id = get_list_first(item.xpath("span")[1].xpath("text()")) + elif "公開日:" in item_title: + publish_date = get_list_first(item.xpath("span")[1].xpath("text()")) + elif "再生時間:" in item_title: + duration_str = get_list_first(item.xpath("span")[1].xpath("text()")) + + # 清除标题里的番号字符 + keywords = [real_id, " "] + if movie.dvdid.startswith("FC2"): + keywords += ["FC2","PPV","-"] + [movie.dvdid.split("-")[-1]] + for keyword in keywords: + title = re.sub(re.escape(keyword), "", title, flags=re.I) + + # 判断是否无码 + uncensored_arr = magnet + [title] + for uncensored_str in uncensored_arr: + if 'uncensored' in uncensored_str.lower(): + uncensored = True + + movie.url = url + movie.title = title + movie.genre = genre + movie.actress = actress + movie.duration = str(strftime_to_minutes(duration_str)) + movie.publish_date = publish_date + movie.publisher = publisher + movie.producer = producer + movie.uncensored = uncensored + movie.preview_pics = preview_pics + movie.preview_video = preview_video + movie.plot = plot + movie.serial = serial + movie.magnet = magnet + + # FC2的封面是220x220的,和正常封面尺寸、比例都差太多。如果有预览图片,则使用第一张预览图作为封面 + if movie.preview_pics: + movie.cover = preview_pics[0] + else: + movie.cover = get_list_first(thumb_pic) + +if __name__ == "__main__": + + async def test_main(): + crawler = await NjavCrawler.create() + movie = MovieInfo('012023_002') + try: + await crawler.crawl_and_fill(movie) + print(movie) + except Exception as e: + print(repr(e)) + + import asyncio + asyncio.run(test_main()) diff --git a/javsp/crawlers/sites/prestige.py b/javsp/crawlers/sites/prestige.py new file mode 100644 index 000000000..bc0734554 --- /dev/null +++ b/javsp/crawlers/sites/prestige.py @@ -0,0 +1,101 @@ +"""从蚊香社-prestige抓取数据""" +import re +import logging + + + +from javsp.crawlers.exceptions import MovieNotFoundError, SiteBlocked +from javsp.datatype import MovieInfo +from javsp.network.utils import resolve_site_fallback +from javsp.network.client import get_client +from javsp.crawlers.interface import Crawler +from javsp.config import CrawlerID +from lxml import html + + +logger = logging.getLogger(__name__) + 
+ +class PrestigeCrawler(Crawler): + id = CrawlerID.prestige + + @classmethod + async def create(cls): + self = cls() + url = await resolve_site_fallback(self.id, 'https://www.prestige-av.com') + self.base_url = str(url) + self.client = get_client(url) + # prestige要求访问者携带已通过R18认证的cookies才能够获得完整数据,否则会被重定向到认证页面 + # (其他多数网站的R18认证只是在网页上遮了一层,完整数据已经传回,不影响爬虫爬取) + self.client.cookies = {'__age_auth__': 'true'} + return self + + async def crawl_and_fill(self, movie: MovieInfo) -> None: + """从网页抓取并解析指定番号的数据 + Args: + movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内 + """ + url = f'{self.base_url}/goods/goods_detail.php?sku={movie.dvdid}' + resp = await self.client.get(url) + if resp.status_code == 500: + # 500错误表明prestige没有这部影片的数据,不是网络问题,因此不再重试 + raise MovieNotFoundError(__name__, movie.dvdid) + elif resp.status_code == 403: + raise SiteBlocked('prestige不允许从当前IP所在地区访问,请尝试更换为日本地区代理') + resp.raise_for_status() + tree = html.fromstring(resp.text) + container_tags = tree.xpath("//section[@class='px-4 mb-4 md:px-8 md:mb-16']") + if not container_tags: + raise MovieNotFoundError(__name__, movie.dvdid) + + container = container_tags[0] + title = container.xpath("h1/span")[0].tail.strip() + cover = container.xpath("//div[@class='c-ratio-image mr-8']/picture/source/img/@src")[0] + cover = cover.split('?')[0] + actress = container.xpath("//p[text()='出演者:']/following-sibling::div/p/a/text()") + # 移除女优名中的空格,使女优名与其他网站保持一致 + actress = [i.strip().replace(' ', '') for i in actress] + duration_str = container.xpath("//p[text()='収録時間:']")[0].getnext().text_content() + match = re.search(r'\d+', duration_str) + if match: + movie.duration = match.group(0) + date_url = container.xpath("//p[text()='発売日:']/following-sibling::div/a/@href")[0] + publish_date = date_url.split('?date=')[-1] + producer = container.xpath("//p[text()='メーカー:']/following-sibling::div/a/text()")[0].strip() + dvdid = container.xpath("//p[text()='品番:']/following-sibling::div/p/text()")[0] + genre_tags = container.xpath("//p[text()='ジャンル:']/following-sibling::div/a") + genre = [tag.text.strip() for tag in genre_tags] + serial = container.xpath("//p[text()='レーベル:']/following-sibling::div/a/text()")[0].strip() + plot = container.xpath("//h2[text()='商品紹介']/following-sibling::p")[0].text.strip() + preview_pics = container.xpath("//h2[text()='サンプル画像']/following-sibling::div/div/picture/source/img/@src") + preview_pics = [i.split('?')[0] for i in preview_pics] + + # prestige改版后已经无法获取高清封面,此前已经获取的高清封面地址也已失效 + movie.url = url + movie.dvdid = dvdid + movie.title = title + movie.cover = cover + movie.actress = actress + movie.publish_date = publish_date + movie.producer = producer + movie.genre = genre + movie.serial = serial + movie.plot = plot + movie.preview_pics = preview_pics + movie.uncensored = False # prestige服务器在日本且面向日本国内公开发售,不会包含无码片 + + + +if __name__ == "__main__": + + async def test_main(): + crawler = await PrestigeCrawler.create() + movie = MovieInfo('ABP-647') + try: + await crawler.crawl_and_fill(movie) + print(movie) + except Exception as e: + print(repr(e)) + + import asyncio + asyncio.run(test_main()) diff --git a/javsp/func.py b/javsp/func.py index 042afea5c..6232747fd 100644 --- a/javsp/func.py +++ b/javsp/func.py @@ -16,6 +16,8 @@ from pathlib import Path import importlib.metadata as meta +from pydantic_core import Url + # 判断系统是否可以使用tk USE_GUI = True try: @@ -23,7 +25,7 @@ except ImportError: USE_GUI = False -from javsp.web.base import * +from javsp.network.utils import get_client, url_download from javsp.lib import re_escape, resource_path @@ 
-150,7 +152,7 @@ def split_by_punc(s): return ls -def check_update(allow_check=True, auto_update=True): +async def check_update(allow_check=True, auto_update=True): """检查版本更新""" def print_header(title, info=[]): @@ -181,7 +183,9 @@ def print_header(title, info=[]): release_url = 'https://github.com/Yuukiy/JavSP/releases/latest' print('正在检查更新...', end='') try: - data = request_get(api_url, timeout=3).json() + client = get_client(Url(api_url)) + resp = await client.get(api_url) + data = resp.json() latest_version = data['tag_name'] release_time = utc2local(data['published_at']) release_date = release_time.isoformat().split('T')[0] @@ -233,7 +237,7 @@ def print_header(title, info=[]): if auto_update: try: logger.info('尝试自动更新到新版本: ' + latest_version + " (按'Ctrl+C'取消)") - download_update(data) + await download_update(data) except KeyboardInterrupt: logger.info('用户取消更新') except Exception as e: @@ -243,7 +247,7 @@ def print_header(title, info=[]): print() # 输出空行,作为新旧程序的分隔 -def download_update(rel_info): +async def download_update(rel_info): """下载版本更新 Args: @@ -253,7 +257,8 @@ def download_update(rel_info): down_url = rel_info['assets'][0]['browser_download_url'] asset_name = rel_info['assets'][0]['name'] desc = '下载更新' if shutil.get_terminal_size().columns < 120 else '下载更新: '+asset_name - download(down_url, asset_name, desc=desc) + await url_download(Url(down_url), asset_name, desc=desc) + # download(down_url, asset_name, desc=desc) if os.path.exists(asset_name): # 备份原有的程序 basepath, ext = os.path.splitext(sys.executable) @@ -270,8 +275,3 @@ def download_update(rel_info): p.wait() p.terminate() sys.exit(0) - - -if __name__ == "__main__": - setattr(sys, 'javsp_version', 'v0') - check_update() diff --git a/javsp/network/client.py b/javsp/network/client.py new file mode 100644 index 000000000..813167233 --- /dev/null +++ b/javsp/network/client.py @@ -0,0 +1,43 @@ +"""网络请求的统一接口""" + +from typing import Dict +from pydantic_core import Url + +from httpx import AsyncClient, AsyncHTTPTransport + +from javsp.config import Cfg + +headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'} + +def get_proxy(unproxied: bool): + if Cfg().network.proxy_server is None or unproxied: + return None + else: + return str(Cfg().network.proxy_server) + +client_dictionary: Dict[str, AsyncClient] = {} +def get_client(url: Url) -> AsyncClient: + if url.host is None: + raise Exception(f"Unknown url {url}") + else: + index = url.host + if index in client_dictionary: + return client_dictionary[index] + else: + unproxied = url.host in Cfg().network.unproxied + + transport = AsyncHTTPTransport( + proxy=get_proxy(unproxied), + retries=Cfg().network.retries) + + client = AsyncClient( + transport=transport, + # 必须使用copy(),否则各个模块对headers的修改都将会指向本模块中定义的headers变量,导致只有最后一个对headers的修改生效 + headers=headers.copy(), + timeout=Cfg().network.timeout.total_seconds(), + follow_redirects=True, + ) + + client_dictionary[index] = client + + return client diff --git a/javsp/network/utils.py b/javsp/network/utils.py new file mode 100644 index 000000000..6f73338e4 --- /dev/null +++ b/javsp/network/utils.py @@ -0,0 +1,91 @@ +from datetime import timedelta +import time +from tqdm.asyncio import tqdm +from typing import Any, Coroutine, NamedTuple +import aiofiles +from pretty_errors import os +from pydantic.types import ByteSize +from pydantic_core import Url + +from pydantic_extra_types.pendulum_dt import Duration + +from javsp.config import Cfg, CrawlerID +from 
javsp.network.client import get_client + +class DownloadInfo(NamedTuple): + size: ByteSize + elapsed: timedelta + + def get_rate(self) -> float: + """get rate of this download, unit: Mbps""" + return self.size.to("mbit") / self.elapsed.total_seconds() + +async def url_download(url: Url, target_path: str, desc: str | None = None) -> DownloadInfo: + url_str = str(url) + + if url.scheme == 'file': + path: str = url.path + start_time: float = time.time() + async with aiofiles.open(path, "rb") as src: + async with aiofiles.open(target_path, "wb") as dest: + await dest.write(await src.read()) + filesize = os.path.getsize(path) + elapsed = time.time() - start_time + return DownloadInfo(ByteSize(filesize), Duration(seconds=elapsed)) + + if not desc: + desc = url_str.split('/')[-1] + + client = get_client(url) + + # REF: https://www.python-httpx.org/advanced/clients/#monitoring-download-progress + async with aiofiles.open(target_path, 'wb') as download_file: + # NOTE: Create a client for each request for now, need further refactor + async with client.stream("GET", url_str) as response: + total = int(response.headers["Content-Length"]) + + with tqdm(total=total, unit_scale=True, unit_divisor=1024, unit="B") as progress: + num_bytes_downloaded = response.num_bytes_downloaded + for chunk in response.iter_bytes(): + await download_file.write(chunk) + progress.update(response.num_bytes_downloaded - num_bytes_downloaded) + num_bytes_downloaded = response.num_bytes_downloaded + + return DownloadInfo(ByteSize(response.num_bytes_downloaded), response.elapsed) + +# def resp2html(resp: Response) -> lxml.html.HtmlElement: +# +# """将request返回的response转换为经lxml解析后的document""" +# +# html = lxml.html.fromstring(resp.text) +# html.make_links_absolute(str(resp.url), resolve_base_href=True) +# return html +# +async def test_connect(url_str: str, timeout: Duration) -> bool: + """测试与指定url的连接,不使用映射,但使用代理""" + try: + + client = get_client(Url(url_str)) + response = \ + await client.get( + url_str, + timeout=timeout.total_seconds(), + follow_redirects=True, + ) + return response.status_code == 200 + except: + return False + +async def resolve_site_fallback(cr_id: CrawlerID, default: str) -> Url: + if cr_id not in Cfg().network.fallback: + return Url(default) + + tasks: list[tuple[str, Coroutine[Any, Any, bool]]] = [] + for fallback in Cfg().network.fallback[cr_id]: + tasks.append((fallback, test_connect(fallback, Duration(seconds=3)))) + + for (fallback, task) in tasks: + if await task: + return Url(fallback) + + return Url(default) diff --git a/javsp/web/translate.py b/javsp/translate.py similarity index 94% rename from javsp/web/translate.py rename to javsp/translate.py index 2e762cb15..1f202209a 100644 --- a/javsp/web/translate.py +++ b/javsp/translate.py @@ -6,7 +6,7 @@ import random import logging from pydantic_core import Url -import requests +import httpx from hashlib import md5 @@ -15,7 +15,7 @@ from javsp.config import BaiduTranslateEngine, BingTranslateEngine, Cfg, ClaudeTranslateEngine, GoogleTranslateEngine, OpenAITranslateEngine, TranslateEngine from javsp.datatype import MovieInfo -from javsp.web.base import read_proxy +from javsp.network.client import get_proxy logger = logging.getLogger(__name__) @@ -49,13 +49,7 @@ def translate_movie_info(info: MovieInfo): return False return True -def translate(texts, engine: Union[ - BaiduTranslateEngine, - BingTranslateEngine, - ClaudeTranslateEngine, - OpenAITranslateEngine, - None - ], actress=[]): +def translate(texts, engine: TranslateEngine, actress=[]): """ 
翻译入口:对错误进行处理并且统一返回格式 @@ -146,7 +140,7 @@ def baidu_translate(texts, app_id, api_key, to='zh'): wait = 1.0 - (now - last_access) if wait > 0: time.sleep(wait) - r = requests.post(api_url, params=payload, headers=headers) + r = httpx.post(api_url, params=payload, headers=headers) result = r.json() baidu_translate._last_access = time.perf_counter() return result @@ -163,7 +157,7 @@ def bing_translate(texts, api_key, to='zh-Hans'): 'X-ClientTraceId': str(uuid.uuid4()) } body = [{'text': texts}] - r = requests.post(api_url, params=params, headers=headers, json=body) + r = httpx.post(api_url, params=params, headers=headers, json=body) result = r.json() return result @@ -175,12 +169,12 @@ def google_trans(texts, to='zh_CN'): # client参数的选择: https://github.com/lmk123/crx-selection-translate/issues/223#issue-184432017 global _google_trans_wait url = f"https://translate.google.com.hk/translate_a/single?client=gtx&dt=t&dj=1&ie=UTF-8&sl=auto&tl={to}&q={texts}" - proxies = read_proxy() - r = requests.get(url, proxies=proxies) + proxies = get_proxy(False) + r = httpx.get(url, proxies=proxies) while r.status_code == 429: logger.warning(f"HTTP {r.status_code}: {r.reason}: Google翻译请求超限,将等待{_google_trans_wait}秒后重试") time.sleep(_google_trans_wait) - r = requests.get(url, proxies=proxies) + r = httpx.get(url, proxies=proxies) if r.status_code == 429: _google_trans_wait += random.randint(60, 90) if r.status_code == 200: @@ -204,7 +198,7 @@ def claude_translate(texts, api_key, to="zh_CN"): "max_tokens": 1024, "messages": [{"role": "user", "content": texts}], } - r = requests.post(api_url, headers=headers, json=data) + r = httpx.post(api_url, headers=headers, json=data) if r.status_code == 200: result = r.json().get("content", [{}])[0].get("text", "").strip() else: @@ -236,7 +230,7 @@ def openai_translate(texts, url: Url, api_key: str, model: str, to="zh_CN"): "temperature": 0, "max_tokens": 1024, } - r = requests.post(api_url, headers=headers, json=data) + r = httpx.post(api_url, headers=headers, json=data) if r.status_code == 200: if 'error' in r.json(): result = { diff --git a/javsp/web/airav.py b/javsp/web/airav.py deleted file mode 100644 index 22e9fdbf7..000000000 --- a/javsp/web/airav.py +++ /dev/null @@ -1,118 +0,0 @@ -"""从airav抓取数据""" -import re -import logging -from html import unescape - - -from javsp.web.base import Request -from javsp.web.exceptions import * -from javsp.config import Cfg -from javsp.datatype import MovieInfo - -# 初始化Request实例 -request = Request(use_scraper=True) -request.headers['Accept-Language'] = 'zh-TW,zh;q=0.9' -# 近期airav服务器似乎不稳定,时好时坏,单次查询平均在17秒左右,timeout时间增加到20秒 -request.timeout = 20 - - -logger = logging.getLogger(__name__) -base_url = 'https://www.airav.wiki' - - -def search_movie(dvdid): - """通过搜索番号获取指定的影片在网站上的ID""" - # 部分影片的ID并不直接等于番号(如012717-360),此时需要尝试通过搜索来寻找影片 - page = 0 - count = 1 - result = [] - while len(result) < count: - url = f'{base_url}/api/video/list?lang=zh-TW&lng=zh-TW&search={dvdid}&page={page}' - r = request.get(url).json() - # {"offset": 2460, "count": 12345, "result": [...], "status": "ok"} - if r['result']: - result.extend(r['result']) - count = r['count'] - page += 1 - else: # 结果为空,结束循环 - break - # 如果什么都没搜索到,直接返回 - if not result: - raise MovieNotFoundError(__name__, dvdid) - # 排序,以优先选择更符合预期的结果(如'012717_472'对应的'1pondo_012717_472'和'_1pondo_012717_472') - result.sort(key=lambda x:x['barcode']) - # 从所有搜索结果中选择最可能的番号,返回它的URL - target = dvdid.replace('-', '_') - for item in result: - # {'vid': '', 'slug': '', 'name': '', 'url': '', 'view': '', 'img_url': '', 
'barcode': ''} - barcode = item['barcode'].replace('-', '_') - if target in barcode: - return item['barcode'] - raise MovieNotFoundError(__name__, dvdid, result) - - -def parse_data(movie: MovieInfo): - """解析指定番号的影片数据""" - # airav也提供简体,但是为了尽量保持女优名等与其他站点一致,抓取繁体的数据 - url = f'{base_url}/api/video/barcode/{movie.dvdid}?lng=zh-TW' - resp = request.get(url).json() - # 只在番号是纯数字时,尝试进行搜索,否则可能导致搜索到错误的影片信息 - if resp['count'] == 0 and re.match(r'\d{6}[-_]\d{2,3}', movie.dvdid): - barcode = search_movie(movie.dvdid) - if barcode: - url = f'{base_url}/api/video/barcode/{barcode}?lng=zh-TW' - resp = request.get(url).json() - if resp['count'] == 0: - raise MovieNotFoundError(__name__, movie.dvdid, resp) - - # 从API返回的数据中提取需要的字段 - # TODO: 数据中含有更多信息(如女优的中文&日文名对照),可能有助于未来功能扩展 - data = resp['result'] - dvdid = data['barcode'] - movie.dvdid = dvdid - movie.url = base_url + '/video/' + dvdid - # plot和title中可能含有HTML的转义字符,需要进行解转义处理 - movie.plot = unescape(data['description']) or None - movie.cover = data['img_url'] - # airav的genre是以搜索关键词的形式组织的,没有特定的genre_id - movie.genre = [i['name'] for i in data['tags']] - movie.title = unescape(data['name']) - movie.actress = [i['name'] for i in data['actors']] - movie.publish_date = data['publish_date'] - movie.preview_pics = data['images'] or [] - if data['factories']: - movie.producer = data['factories'][0]['name'] - - if Cfg().crawler.hardworking: - # 注意这里用的是获取的dvdid,而不是传入的movie.dvdid(如'1pondo_012717_472'与'012717_472') - video_url = f"{base_url}/api/video/getVideoMedia?barcode={dvdid}&vid={data['vid']}" - resp = request.get(video_url).json() - # 如果失败,结果如 {'msg': 'fail', 'status': 'fail'} - if 'data' in resp: - # 除url外还有url_cdn, url_hlx, url_hls_cdn字段,后两者为m3u8格式。目前将url作为预览视频的地址 - # TODO: 发现部分影片(如080719-976)的传统格式预览片错误 - movie.preview_video = resp['data'].get('url') - - # airav上部分影片会被标记为'馬賽克破壞版'等,这些影片的title、plot和genre都不再准确 - for keyword in ('馬賽克破壞版', '馬賽克破解版', '無碼流出版'): - if movie.title and keyword in movie.title: - movie.title = None - movie.genre = [] - if movie.plot and keyword in movie.plot: - movie.plot = None - movie.genre = [] - if not any([movie.title, movie.plot, movie.genre]): - break - - -if __name__ == "__main__": - import pretty_errors - pretty_errors.configure(display_link=True) - logger.root.handlers[1].level = logging.DEBUG - - movie = MovieInfo('DSAD-938') - try: - parse_data(movie) - print(movie) - except CrawlerError as e: - logger.error(e, exc_info=1) diff --git a/javsp/web/arzon.py b/javsp/web/arzon.py deleted file mode 100644 index 433949018..000000000 --- a/javsp/web/arzon.py +++ /dev/null @@ -1,100 +0,0 @@ -"""从arzon抓取数据""" -import os -import sys -import logging -import re - -from javsp.web.base import request_get -from javsp.web.exceptions import * -from javsp.datatype import MovieInfo -import requests -from lxml import html - -logger = logging.getLogger(__name__) -base_url = "https://www.arzon.jp" - -def get_cookie(): - # https://www.arzon.jp/index.php?action=adult_customer_agecheck&agecheck=1&redirect=https%3A%2F%2Fwww.arzon.jp%2F - skip_verify_url = "http://www.arzon.jp/index.php?action=adult_customer_agecheck&agecheck=1" - session = requests.Session() - session.get(skip_verify_url, timeout=(12, 7)) - return session.cookies.get_dict() - -def parse_data(movie: MovieInfo): - """解析指定番号的影片数据""" - full_id = movie.dvdid - cookies = get_cookie() - url = f'{base_url}/itemlist.html?t=&m=all&s=&q={full_id}' - # url = f'{base_url}/imagelist.html?q={full_id}' - r = request_get(url, cookies, delay_raise=True) - if r.status_code == 404: - raise 
MovieNotFoundError(__name__, movie.dvdid) - # https://stackoverflow.com/questions/15830421/xml-unicode-strings-with-encoding-declaration-are-not-supported - data = html.fromstring(r.content) - - urls = data.xpath("//h2/a/@href") - if len(urls) == 0: - raise MovieNotFoundError(__name__, movie.dvdid) - - item_url = base_url + urls[0] - e = request_get(item_url, cookies, delay_raise=True) - item = html.fromstring(e.content) - - title = item.xpath("//div[@class='detail_title_new2']//h1/text()")[0] - cover = item.xpath("//td[@align='center']//a/img/@src")[0] - item_text = item.xpath("//div[@class='item_text']/text()") - plot = [item.strip() for item in item_text if item.strip() != ''][0] - preview_pics_arr = item.xpath("//div[@class='detail_img']//img/@src") - # 使用列表推导式添加 "http:" 并去除 "m_" - preview_pics = [("https:" + url).replace("m_", "") for url in preview_pics_arr] - - container = item.xpath("//div[@class='item_register']/table//tr") - for row in container: - key = row.xpath("./td[1]/text()")[0] - contents = row.xpath("./td[2]//text()") - content = [item.strip() for item in contents if item.strip() != ''] - index = 0 - value = content[index] if content and index < len(content) else None - if key == "AV女優:": - movie.actress = content - if key == "AVメーカー:": - movie.producer = value - if key == "AVレーベル:": - video_type = value - if key == "シリーズ:": - movie.serial = value - if key == "監督:": - movie.director = value - if key == "発売日:" and value: - movie.publish_date = re.search(r"\d{4}/\d{2}/\d{2}", value).group(0).replace("/", "-") - if key == "収録時間:" and value: - movie.duration = re.search(r'([\d.]+)分', value).group(1) - if key == "品番:": - dvd_id = value - elif key == "タグ:": - genre = value - - genres = '' - if video_type: - genres = [video_type] - if(genre != None): - genres.append(genre) - - movie.genre = genres - movie.url = item_url - movie.title = title - movie.plot = plot - movie.cover = f'https:{cover}' - movie.preview_pics = preview_pics - -if __name__ == "__main__": - import pretty_errors - pretty_errors.configure(display_link=True) - logger.root.handlers[1].level = logging.DEBUG - - movie = MovieInfo('csct-011') - try: - parse_data(movie) - print(movie) - except CrawlerError as e: - logger.error(e, exc_info=1) diff --git a/javsp/web/arzon_iv.py b/javsp/web/arzon_iv.py deleted file mode 100644 index 3ea7a322f..000000000 --- a/javsp/web/arzon_iv.py +++ /dev/null @@ -1,93 +0,0 @@ -"""从arzon抓取数据""" -import os -import sys -import logging -import re - -from javsp.web.base import request_get -from javsp.web.exceptions import * -from javsp.datatype import MovieInfo -import requests -from lxml import html - -logger = logging.getLogger(__name__) -base_url = "https://www.arzon.jp" - -def get_cookie(): - # https://www.arzon.jp/index.php?action=adult_customer_agecheck&agecheck=1&redirect=https%3A%2F%2Fwww.arzon.jp%2F - skip_verify_url = "http://www.arzon.jp/index.php?action=adult_customer_agecheck&agecheck=1" - session = requests.Session() - session.get(skip_verify_url, timeout=(12, 7)) - return session.cookies.get_dict() - -def parse_data(movie: MovieInfo): - """解析指定番号的影片数据""" - full_id = movie.dvdid - cookies = get_cookie() - url = f'{base_url}/imagelist.html?q={full_id}' - r = request_get(url, cookies, delay_raise=True) - if r.status_code == 404: - raise MovieNotFoundError(__name__, movie.dvdid) - # https://stackoverflow.com/questions/15830421/xml-unicode-strings-with-encoding-declaration-are-not-supported - data = html.fromstring(r.content) - - urls = data.xpath("//h2/a/@href") - if len(urls) 
== 0: - raise MovieNotFoundError(__name__, movie.dvdid) - - item_url = base_url + urls[0] - e = request_get(item_url, cookies, delay_raise=True) - item = html.fromstring(e.content) - - title = item.xpath("//div[@class='detail_title_new']//h1/text()")[0] - cover = item.xpath("//td[@align='center']//a/img/@src")[0] - item_text = item.xpath("//div[@class='item_text']/text()") - plot = [item.strip() for item in item_text if item.strip() != ''][0] - - container = item.xpath("//div[@class='item_register']/table//tr") - for row in container: - key = row.xpath("./td[1]/text()")[0] - contents = row.xpath("./td[2]//text()") - content = [item.strip() for item in contents if item.strip() != ''] - index = 0 - value = content[index] if content and index < len(content) else None - if key == "タレント:": - movie.actress = content - if key == "イメージメーカー:": - movie.producer = value - if key == "イメージレーベル:": - video_type = value - if key == "監督:": - movie.director = value - if key == "発売日:" and value: - movie.publish_date = re.search(r"\d{4}/\d{2}/\d{2}", value).group(0).replace("/", "-") - if key == "収録時間:" and value: - movie.duration = re.search(r'([\d.]+)分', value).group(1) - if key == "品番:": - dvd_id = value - elif key == "タグ:": - genre = value - - genres = '' - if video_type: - genres = [video_type] - if(genre != None): - genres.append(genre) - - movie.genre = genres - movie.url = item_url - movie.title = title - movie.plot = plot - movie.cover = f'https:{cover}' - -if __name__ == "__main__": - import pretty_errors - pretty_errors.configure(display_link=True) - logger.root.handlers[1].level = logging.DEBUG - - movie = MovieInfo('KIDM-1137B') - try: - parse_data(movie) - print(movie) - except CrawlerError as e: - logger.error(e, exc_info=1) diff --git a/javsp/web/avsox.py b/javsp/web/avsox.py deleted file mode 100644 index ea96d6cc3..000000000 --- a/javsp/web/avsox.py +++ /dev/null @@ -1,75 +0,0 @@ -"""从avsox抓取数据""" -import logging - -from javsp.web.base import get_html -from javsp.web.exceptions import * -from javsp.config import Cfg, CrawlerID -from javsp.datatype import MovieInfo - - -logger = logging.getLogger(__name__) -base_url = str(Cfg().network.proxy_free[CrawlerID.avsox]) - - -def parse_data(movie: MovieInfo): - """解析指定番号的影片数据""" - # avsox无法直接跳转到影片的网页,因此先搜索再从搜索结果中寻找目标网页 - full_id = movie.dvdid - if full_id.startswith('FC2-'): - full_id = full_id.replace('FC2-', 'FC2-PPV-') - html = get_html(f'{base_url}tw/search/{full_id}') - ids = html.xpath("//div[@class='photo-info']/span/date[1]/text()") - urls = html.xpath("//a[contains(@class, 'movie-box')]/@href") - ids_lower = list(map(str.lower, ids)) - if full_id.lower() in ids_lower: - url = urls[ids_lower.index(full_id.lower())] - url = url.replace('/tw/', '/cn/', 1) - else: - raise MovieNotFoundError(__name__, movie.dvdid, ids) - - # 提取影片信息 - html = get_html(url) - container = html.xpath("/html/body/div[@class='container']")[0] - title = container.xpath("h3/text()")[0] - cover = container.xpath("//a[@class='bigImage']/@href")[0] - info = container.xpath("div/div[@class='col-md-3 info']")[0] - dvdid = info.xpath("p/span[@style]/text()")[0] - publish_date = info.xpath("p/span[text()='发行时间:']")[0].tail.strip() - duration = info.xpath("p/span[text()='长度:']")[0].tail.replace('分钟', '').strip() - producer, serial = None, None - producer_tag = info.xpath("p[text()='制作商: ']")[0].getnext().xpath("a") - if producer_tag: - producer = producer_tag[0].text_content() - serial_tag = info.xpath("p[text()='系列:']") - if serial_tag: - serial = 
serial_tag[0].getnext().xpath("a/text()")[0] - genre = info.xpath("p/span[@class='genre']/a/text()") - actress = container.xpath("//a[@class='avatar-box']/span/text()") - - movie.dvdid = dvdid.replace('FC2-PPV-', 'FC2-') - movie.url = url - movie.title = title.replace(dvdid, '').strip() - movie.cover = cover - movie.publish_date = publish_date - movie.duration = duration - movie.genre = genre - movie.actress = actress - if full_id.startswith('FC2-'): - # avsox把FC2作品的拍摄者归类到'系列'而制作商固定为'FC2-PPV',这既不合理也与其他的站点不兼容,因此进行调整 - movie.producer = serial - else: - movie.producer = producer - movie.serial = serial - - -if __name__ == "__main__": - import pretty_errors - pretty_errors.configure(display_link=True) - logger.root.handlers[1].level = logging.DEBUG - - movie = MovieInfo('082713-417') - try: - parse_data(movie) - print(movie) - except CrawlerError as e: - logger.error(e, exc_info=1) diff --git a/javsp/web/avwiki.py b/javsp/web/avwiki.py deleted file mode 100644 index fbd4ecbb3..000000000 --- a/javsp/web/avwiki.py +++ /dev/null @@ -1,72 +0,0 @@ -"""从av-wiki抓取数据""" -import logging - - -from javsp.web.base import * -from javsp.web.exceptions import * -from javsp.datatype import MovieInfo - -logger = logging.getLogger(__name__) -base_url = 'https://av-wiki.net' - - -def parse_data(movie: MovieInfo): - """从网页抓取并解析指定番号的数据 - Args: - movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内 - """ - movie.url = url = f'{base_url}/{movie.dvdid}' - resp = request_get(url, delay_raise=True) - if resp.status_code == 404: - raise MovieNotFoundError(__name__, movie.dvdid) - html = resp2html(resp) - - cover_tag = html.xpath("//header/div/a[@class='image-link-border']/img") - if cover_tag: - try: - srcset = cover_tag[0].get('srcset').split(', ') - src_set_urls = {} - for src in srcset: - url, width = src.split() - width = int(width.rstrip('w')) - src_set_urls[width] = url - max_pic = sorted(src_set_urls.items(), key=lambda x:x[0], reverse=True) - movie.cover = max_pic[0][1] - except: - movie.cover = cover_tag[0].get('src') - body = html.xpath("//section[@class='article-body']")[0] - title = body.xpath("div/p/text()")[0] - title = title.replace(f"【{movie.dvdid}】", '') - cite_url = body.xpath("div/cite/a/@href")[0] - cite_url = cite_url.split('?aff=')[0] - info = body.xpath("dl[@class='dltable']")[0] - dt_txt_ls, dd_tags = info.xpath("dt/text()"), info.xpath("dd") - data = {} - for dt_txt, dd in zip(dt_txt_ls, dd_tags): - dt_txt = dt_txt.strip() - a_tag = dd.xpath('a') - if len(a_tag) == 0: - dd_txt = dd.text.strip() - else: - dd_txt = [i.text.strip() for i in a_tag] - if isinstance(dd_txt, list) and dt_txt != 'AV女優名': # 只有女优名以列表的数据格式保留 - dd_txt = dd_txt[0] - data[dt_txt] = dd_txt - - ATTR_MAP = {'メーカー': 'producer', 'AV女優名': 'actress', 'メーカー品番': 'dvdid', 'シリーズ': 'serial', '配信開始日': 'publish_date'} - for key, attr in ATTR_MAP.items(): - setattr(movie, attr, data.get(key)) - movie.title = title - movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片 - - -if __name__ == "__main__": - import pretty_errors - pretty_errors.configure(display_link=True) - - movie = MovieInfo('259LUXU-593') - try: - parse_data(movie) - print(movie) - except CrawlerError as e: - logger.error(e, exc_info=1) diff --git a/javsp/web/base.py b/javsp/web/base.py deleted file mode 100644 index 717b5168a..000000000 --- a/javsp/web/base.py +++ /dev/null @@ -1,270 +0,0 @@ -"""网络请求的统一接口""" -import os -import sys -import time -import shutil -import logging -import requests -import contextlib -import cloudscraper -import lxml.html -from tqdm import tqdm -from lxml 
import etree -from lxml.html.clean import Cleaner -from requests.models import Response - - -from javsp.config import Cfg -from javsp.web.exceptions import * - - -__all__ = ['Request', 'get_html', 'post_html', 'request_get', 'resp2html', 'is_connectable', 'download', 'get_resp_text', 'read_proxy'] - - -headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'} - -logger = logging.getLogger(__name__) -# 删除js脚本相关的tag,避免网页检测到没有js运行环境时强行跳转,影响调试 -cleaner = Cleaner(kill_tags=['script', 'noscript']) - -def read_proxy(): - if Cfg().network.proxy_server is None: - return {} - else: - proxy = str(Cfg().network.proxy_server) - return {'http': proxy, 'https': proxy} - -# 与网络请求相关的功能汇总到一个模块中以方便处理,但是不同站点的抓取器又有自己的需求(针对不同网站 -# 需要使用不同的UA、语言等)。每次都传递参数很麻烦,而且会面临函数参数越加越多的问题。因此添加这个 -# 处理网络请求的类,它带有默认的属性,但是也可以在各个抓取器模块里进行进行定制 -class Request(): - """作为网络请求出口并支持各个模块定制功能""" - def __init__(self, use_scraper=False) -> None: - # 必须使用copy(),否则各个模块对headers的修改都将会指向本模块中定义的headers变量,导致只有最后一个对headers的修改生效 - self.headers = headers.copy() - self.cookies = {} - - self.proxies = read_proxy() - self.timeout = Cfg().network.timeout.total_seconds() - if not use_scraper: - self.scraper = None - self.__get = requests.get - self.__post = requests.post - self.__head = requests.head - else: - self.scraper = cloudscraper.create_scraper() - self.__get = self._scraper_monitor(self.scraper.get) - self.__post = self._scraper_monitor(self.scraper.post) - self.__head = self._scraper_monitor(self.scraper.head) - - def _scraper_monitor(self, func): - """监控cloudscraper的工作状态,遇到不支持的Challenge时尝试退回常规的requests请求""" - def wrapper(*args, **kw): - try: - return func(*args, **kw) - except Exception as e: - logger.debug(f"无法通过CloudFlare检测: '{e}', 尝试退回常规的requests请求") - if func == self.scraper.get: - return requests.get(*args, **kw) - else: - return requests.post(*args, **kw) - return wrapper - - def get(self, url, delay_raise=False): - r = self.__get(url, - headers=self.headers, - proxies=self.proxies, - cookies=self.cookies, - timeout=self.timeout) - if not delay_raise: - r.raise_for_status() - return r - - def post(self, url, data, delay_raise=False): - r = self.__post(url, - data=data, - headers=self.headers, - proxies=self.proxies, - cookies=self.cookies, - timeout=self.timeout) - if not delay_raise: - r.raise_for_status() - return r - - def head(self, url, delay_raise=True): - r = self.__head(url, - headers=self.headers, - proxies=self.proxies, - cookies=self.cookies, - timeout=self.timeout) - if not delay_raise: - r.raise_for_status() - return r - - def get_html(self, url): - r = self.get(url) - html = resp2html(r) - return html - - -class DownloadProgressBar(tqdm): - def update_to(self, b=1, bsize=1, tsize=None): - if tsize is not None: - self.total = tsize - self.update(b * bsize - self.n) - - -def request_get(url, cookies={}, timeout=None, delay_raise=False): - """获取指定url的原始请求""" - if timeout is None: - timeout = Cfg().network.timeout.seconds - - r = requests.get(url, headers=headers, proxies=read_proxy(), cookies=cookies, timeout=timeout) - if not delay_raise: - if r.status_code == 403 and b'>Just a moment...<' in r.content: - raise SiteBlocked(f"403 Forbidden: 无法通过CloudFlare检测: {url}") - else: - r.raise_for_status() - return r - - -def request_post(url, data, cookies={}, timeout=None, delay_raise=False): - """向指定url发送post请求""" - if timeout is None: - timeout = Cfg().network.timeout.seconds - r = requests.post(url, data=data, headers=headers, proxies=read_proxy(), 
cookies=cookies, timeout=timeout) - if not delay_raise: - r.raise_for_status() - return r - - -def get_resp_text(resp: Response, encoding=None): - """提取Response的文本""" - if encoding: - resp.encoding = encoding - else: - resp.encoding = resp.apparent_encoding - return resp.text - - -def get_html(url, encoding='utf-8'): - """使用get方法访问指定网页并返回经lxml解析后的document""" - resp = request_get(url) - text = get_resp_text(resp, encoding=encoding) - html = lxml.html.fromstring(text) - html.make_links_absolute(url, resolve_base_href=True) - # 清理功能仅应在需要的时候用来调试网页(如prestige),否则可能反过来影响调试(如JavBus) - # html = cleaner.clean_html(html) - if hasattr(sys, 'javsp_debug_mode'): - lxml.html.open_in_browser(html, encoding=encoding) # for develop and debug - return html - - -def resp2html(resp, encoding='utf-8') -> lxml.html.HtmlComment: - """将request返回的response转换为经lxml解析后的document""" - text = get_resp_text(resp, encoding=encoding) - html = lxml.html.fromstring(text) - html.make_links_absolute(resp.url, resolve_base_href=True) - # html = cleaner.clean_html(html) - if hasattr(sys, 'javsp_debug_mode'): - lxml.html.open_in_browser(html, encoding=encoding) # for develop and debug - return html - - -def post_html(url, data, encoding='utf-8', cookies={}): - """使用post方法访问指定网页并返回经lxml解析后的document""" - resp = request_post(url, data, cookies=cookies) - text = get_resp_text(resp, encoding=encoding) - html = lxml.html.fromstring(text) - # jav321提供ed2k形式的资源链接,其中的非ASCII字符可能导致转换失败,因此要先进行处理 - ed2k_tags = html.xpath("//a[starts-with(@href,'ed2k://')]") - for tag in ed2k_tags: - tag.attrib['ed2k'], tag.attrib['href'] = tag.attrib['href'], '' - html.make_links_absolute(url, resolve_base_href=True) - for tag in ed2k_tags: - tag.attrib['href'] = tag.attrib['ed2k'] - tag.attrib.pop('ed2k') - # html = cleaner.clean_html(html) - # lxml.html.open_in_browser(html, encoding=encoding) # for develop and debug - return html - - -def dump_xpath_node(node, filename=None): - """将xpath节点dump到文件""" - if not filename: - filename = node.tag + '.html' - with open(filename, 'wt', encoding='utf-8') as f: - content = etree.tostring(node, pretty_print=True).decode('utf-8') - f.write(content) - - -def is_connectable(url, timeout=3): - """测试与指定url的连接""" - try: - r = requests.get(url, headers=headers, timeout=timeout) - return True - except requests.exceptions.RequestException as e: - logger.debug(f"Not connectable: {url}\n" + repr(e)) - return False - - -def urlretrieve(url, filename=None, reporthook=None, headers=None): - if "arzon" in url: - headers["Referer"] = "https://www.arzon.jp/" - """使用requests实现urlretrieve""" - # https://blog.csdn.net/qq_38282706/article/details/80253447 - with contextlib.closing(requests.get(url, headers=headers, - proxies=read_proxy(), stream=True)) as r: - header = r.headers - with open(filename, 'wb+') as fp: - bs = 1024 - size = -1 - blocknum = 0 - if "content-length" in header: - size = int(header["Content-Length"]) # 文件总大小(理论值) - if reporthook: # 写入前运行一次回调函数 - reporthook(blocknum, bs, size) - for chunk in r.iter_content(chunk_size=1024): - if chunk: - fp.write(chunk) - fp.flush() - blocknum += 1 - if reporthook: - reporthook(blocknum, bs, size) # 每写入一次运行一次回调函数 - - -def download(url, output_path, desc=None): - """下载指定url的资源""" - # 支持“下载”本地资源,以供fc2fan的本地镜像所使用 - if not url.startswith('http'): - start_time = time.time() - shutil.copyfile(url, output_path) - filesize = os.path.getsize(url) - elapsed = time.time() - start_time - info = {'total': filesize, 'elapsed': elapsed, 'rate': filesize/elapsed} - return info - if not desc: - desc = 
url.split('/')[-1] - referrer = headers.copy() - referrer['referer'] = url[:url.find('/', 8)+1] # 提取base_url部分 - with DownloadProgressBar(unit='B', unit_scale=True, - miniters=1, desc=desc, leave=False) as t: - urlretrieve(url, filename=output_path, reporthook=t.update_to, headers=referrer) - info = {k: t.format_dict[k] for k in ('total', 'elapsed', 'rate')} - return info - - -def open_in_chrome(url, new=0, autoraise=True): - """使用指定的Chrome Profile打开url,便于调试""" - import subprocess - chrome = R'C:\Program Files\Google\Chrome\Application\chrome.exe' - subprocess.run(f'"{chrome}" --profile-directory="Profile 2" {url}', shell=True) - -import webbrowser -webbrowser.open = open_in_chrome - - -if __name__ == "__main__": - import pretty_errors - pretty_errors.configure(display_link=True) - download('https://www.javbus.com/pics/cover/6n54_b.jpg', 'cover.jpg') diff --git a/javsp/web/dl_getchu.py b/javsp/web/dl_getchu.py deleted file mode 100644 index 15267f1f7..000000000 --- a/javsp/web/dl_getchu.py +++ /dev/null @@ -1,122 +0,0 @@ -"""从dl.getchu官网抓取数据""" -import re -import logging - -from javsp.web.base import resp2html, request_get -from javsp.web.exceptions import * -from javsp.datatype import MovieInfo - -logger = logging.getLogger(__name__) - -# https://dl.getchu.com/i/item4045373 -base_url = 'https://dl.getchu.com' -# dl.getchu用utf-8会乱码 -base_encode = 'euc-jp' - - -def get_movie_title(html): - container = html.xpath("//form[@action='https://dl.getchu.com/cart/']/div/table[2]") - if len(container) > 0: - container = container[0] - rows = container.xpath('.//tr') - title = '' - for row in rows: - for cell in row.xpath('.//td/div'): - # 获取单元格文本内容 - if cell.text: - title = str(cell.text).strip() - return title - - -def get_movie_img(html, getchu_id): - img_src = '' - container = html.xpath(f'//img[contains(@src, "{getchu_id}top.jpg")]') - if len(container) > 0: - container = container[0] - img_src = container.get('src') - return img_src - - -def get_movie_preview(html, getchu_id): - preview_pics = [] - container = html.xpath(f'//img[contains(@src, "{getchu_id}_")]') - if len(container) > 0: - for c in container: - preview_pics.append(c.get('src')) - return preview_pics - - -DURATION_PATTERN = re.compile(r'(?:動画)?(\d+)分') -def parse_data(movie: MovieInfo): - """解析指定番号的影片数据""" - # 去除番号中的'GETCHU'字样 - id_uc = movie.dvdid.upper() - if not id_uc.startswith('GETCHU-'): - raise ValueError('Invalid GETCHU number: ' + movie.dvdid) - getchu_id = id_uc.replace('GETCHU-', '') - # 抓取网页 - url = f'{base_url}/i/item{getchu_id}' - r = request_get(url, delay_raise=True) - if r.status_code == 404: - raise MovieNotFoundError(__name__, movie.dvdid) - html = resp2html(r, base_encode) - container = html.xpath("//form[@action='https://dl.getchu.com/cart/']/div/table[3]") - if len(container) > 0: - container = container[0] - # 将表格提取为键值对 - rows = container.xpath('.//table/tr') - kv_rows = [i for i in rows if len(i) == 2] - data = {} - for row in kv_rows: - # 获取单元格文本内容 - key = row.xpath("td[@class='bluetext']/text()")[0] - # 是否包含a标签: 有的属性是用表示的,不是text - a_tags = row.xpath("td[2]/a") - if a_tags: - value = [i.text for i in a_tags] - else: - # 获取第2个td标签的内容(下标从1开始计数) - value = row.xpath("td[2]/text()") - data[key] = value - - for key, value in data.items(): - if key == 'サークル': - movie.producer = value[0] - elif key == '作者': - # 暂时没有在getchu找到多个actress的片子 - movie.actress = [i.strip() for i in value] - elif key == '画像数&ページ数': - match = DURATION_PATTERN.search(' '.join(value)) - if match: - movie.duration = match.group(1) - elif key 
== '配信開始日': - movie.publish_date = value[0].replace('/', '-') - elif key == '趣向': - movie.genre = value - elif key == '作品内容': - idx = -1 - for i, line in enumerate(value): - if line.lstrip().startswith('※'): - idx = i - break - movie.plot = ''.join(value[:idx]) - - movie.title = get_movie_title(html) - movie.cover = get_movie_img(html, getchu_id) - movie.preview_pics = get_movie_preview(html, getchu_id) - movie.dvdid = id_uc - movie.url = url - - -if __name__ == "__main__": - import pretty_errors - - pretty_errors.configure(display_link=True) - logger.root.handlers[1].level = logging.DEBUG - - movie = MovieInfo('getchu-4041026') - try: - parse_data(movie) - print(movie) - except CrawlerError as e: - logger.error(e, exc_info=1) diff --git a/javsp/web/fanza.py b/javsp/web/fanza.py deleted file mode 100644 index e975c4c8f..000000000 --- a/javsp/web/fanza.py +++ /dev/null @@ -1,231 +0,0 @@ -"""从fanza抓取数据""" -import os -import re -import sys -import json -import logging -from typing import Dict, List, Tuple - - -from javsp.web.base import Request, resp2html -from javsp.web.exceptions import * -from javsp.config import Cfg -from javsp.datatype import MovieInfo - - -logger = logging.getLogger(__name__) -base_url = 'https://www.dmm.co.jp' -# 初始化Request实例(要求携带已通过R18认证的cookies,否则会被重定向到认证页面) -request = Request() -request.cookies = {'age_check_done': '1'} -request.headers['Accept-Language'] = 'ja,en-US;q=0.9' - - -_PRODUCT_PRIORITY = {'digital': 10, 'mono': 5, 'monthly': 2, 'rental': 1} -_TYPE_PRIORITY = {'videoa': 10, 'anime': 8, 'nikkatsu': 6, 'doujin': 4, 'dvd': 3, 'ppr': 2, 'paradisetv': 1} -def sort_search_result(result: List[Dict]): - """排序搜索结果""" - scores = {i['url']:(_PRODUCT_PRIORITY.get(i['product'], 0), _TYPE_PRIORITY.get(i['type'], 0)) for i in result} - sorted_result = sorted(result, key=lambda x:scores[x['url']], reverse=True) - return sorted_result - - -def get_urls_of_cid(cid: str) -> Tuple[str, str]: - """搜索cid可能的影片URL""" - r = request.get(f"https://www.dmm.co.jp/search/?redirect=1&enc=UTF-8&category=&searchstr={cid}&commit.x=0&commit.y=0") - if r.status_code == 404: - raise MovieNotFoundError(__name__, cid) - r.raise_for_status() - html = resp2html_wrapper(r) - result = html.xpath("//ul[@id='list']/li/div/p/a/@href") - parsed_result = {} - for url in result: - items = url.split('/') - type_, cid = None, None - for i, part in enumerate(items): - if part == '-': - product, type_ = items[i-2], items[i-1] - elif part.startswith('cid='): - cid = part[4:] - new_url = '/'.join(i for i in items if not i.startswith('?')) + '/' - parsed_result.setdefault(cid, []).append({'product': product, 'type': type_, 'url': new_url}) - break - if cid not in parsed_result: - if len(result) > 0: - logger.debug(f"Unknown URL in search result: " + ', '.join(result)) - raise MovieNotFoundError(__name__, cid) - sorted_result = sort_search_result(parsed_result[cid]) - return sorted_result - - -def resp2html_wrapper(resp): - html = resp2html(resp) - if 'not available in your region' in html.text_content(): - raise SiteBlocked('FANZA不允许从当前IP所在地区访问,请检查你的网络和代理服务器设置') - elif '/login/' in resp.url: - raise SiteBlocked('FANZA要求当前IP登录账号才可访问,请尝试更换为日本IP') - return html - - -def parse_data(movie: MovieInfo): - """解析指定番号的影片数据""" - default_url = f'{base_url}/digital/videoa/-/detail/=/cid={movie.cid}/' - r0 = request.get(default_url, delay_raise=True) - if r0.status_code == 404: - urls = get_urls_of_cid(movie.cid) - for d in urls: - func_name = f"parse_{d['type']}_page" - if func_name in globals(): - parse_func = 
globals()[func_name] - else: - logger.debug(f"不知道怎么解析 fanza {d['type']} 的页面: {d['url']}") - continue - r = request.get(d['url']) - html = resp2html_wrapper(r) - try: - parse_func(movie, html) - movie.url = d['url'] - break - except: - logger.debug(f"Fail to parse {d['url']}", exc_info=True) - if d is urls[-1]: - logger.warning(f"在fanza查找到的cid={movie.cid}的影片页面均解析失败") - raise - else: - html = resp2html_wrapper(r0) - parse_videoa_page(movie, html) - movie.url = default_url - - -def parse_videoa_page(movie: MovieInfo, html): - """解析AV影片的页面布局""" - title = html.xpath("//div[@class='hreview']/h1/text()")[0] - # 注意: 浏览器在渲染时会自动加上了'tbody'字段,但是原始html网页中并没有,因此xpath解析时还是要按原始网页的来 - container = html.xpath("//table[@class='mg-b12']/tr/td")[0] - cover = container.xpath("//div[@id='sample-video']/a/@href")[0] - # 采用'配信開始日'作为发布日期: https://www.zhihu.com/question/57513172/answer/153219083 - date_tag = container.xpath("//td[text()='配信開始日:']/following-sibling::td/text()") - if date_tag: - movie.publish_date = date_tag[0].strip().replace('/', '-') - duration_str = container.xpath("//td[text()='収録時間:']/following-sibling::td/text()")[0].strip() - match = re.search(r'\d+', duration_str) - if match: - movie.duration = match.group(0) - # 女优、导演、系列:字段不存在时,匹配将得到空列表。暂未发现有名字不显示在a标签中的情况 - actress = container.xpath("//span[@id='performer']/a/text()") - director_tag = container.xpath("//td[text()='監督:']/following-sibling::td/a/text()") - if director_tag: - movie.director = director_tag[0].strip() - serial_tag = container.xpath("//td[text()='シリーズ:']/following-sibling::td/a/text()") - if serial_tag: - movie.serial = serial_tag[0].strip() - producer_tag = container.xpath("//td[text()='メーカー:']/following-sibling::td/a/text()") - if producer_tag: - movie.producer = producer_tag[0].strip() - # label: 大意是某个系列策划用同样的番号,例如ABS打头的番号label是'ABSOLUTELY PERFECT',暂时用不到 - # label_tag = container.xpath("//td[text()='レーベル:']/following-sibling::td/a/text()") - # if label_tag: - # label = label_tag[0].strip() - # fanza会把促销信息也写进genre……因此要根据tag指向的链接类型进行筛选 - genre_tags = container.xpath("//td[text()='ジャンル:']/following-sibling::td/a[contains(@href,'?keyword=') or contains(@href,'article=keyword')]") - genre, genre_id = [], [] - for tag in genre_tags: - genre.append(tag.text.strip()) - genre_id.append(tag.get('href').split('=')[-1].strip('/')) - cid = container.xpath("//td[text()='品番:']/following-sibling::td/text()")[0].strip() - plot = container.xpath("//div[contains(@class, 'mg-b20 lh4')]/text()")[0].strip() - preview_pics = container.xpath("//a[@name='sample-image']/img/@src") - score_tag = container.xpath("//p[@class='d-review__average']/strong/text()") - if score_tag: - match = re.search(r'\d+', score_tag[0].strip()) - if match: - score = float(match.group()) * 2 - movie.score = f'{score:.2f}' - else: - score_img = container.xpath("//td[text()='平均評価:']/following-sibling::td/img/@src")[0] - movie.score = int(score_img.split('/')[-1].split('.')[0]) # 00, 05 ... 
50 - - if Cfg().crawler.hardworking: - # 预览视频是动态加载的,不在静态网页中 - video_url = f'{base_url}/service/digitalapi/-/html5_player/=/cid={movie.cid}' - html2 = request.get_html(video_url) - # 目前用到js脚本的地方不多,所以不使用专门的js求值模块,先用正则提取文本然后用json解析数据 - script = html2.xpath("//script[contains(text(),'getElementById(\"dmmplayer\")')]/text()")[0].strip() - match = re.search(r'\{.*\}', script) - # 主要是为了捕捉json.loads的异常,但是也借助try-except判断是否正则表达式是否匹配 - try: - data = json.loads(match.group()) - video_url = data.get('src') - if video_url and video_url.startswith('//'): - video_url = 'https:' + video_url - movie.preview_video = video_url - except Exception as e: - logger.debug('解析视频地址时异常: ' + repr(e)) - - movie.cid = cid - movie.title = title - movie.cover = cover - movie.actress = actress - movie.genre = genre - movie.genre_id = genre_id - movie.plot = plot - movie.preview_pics = preview_pics - movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片 - - -def parse_anime_page(movie: MovieInfo, html): - """解析动画影片的页面布局""" - title = html.xpath("//h1[@id='title']/text()")[0] - container = html.xpath("//table[@class='mg-b12']/tr/td")[0] - cover = container.xpath("//img[@name='package-image']/@src")[0] - date_str = container.xpath("//td[text()='発売日:']/following-sibling::td/text()")[0].strip() - publish_date = date_str.replace('/', '-') - duration_tag = container.xpath("//td[text()='収録時間:']/following-sibling::td/text()") - if duration_tag: - movie.duration = duration_tag[0].strip().replace('分', '') - serial_tag = container.xpath("//td[text()='シリーズ:']/following-sibling::td/a/text()") - if serial_tag: - movie.serial = serial_tag[0].strip() - producer_tag = container.xpath("//td[text()='メーカー:']/following-sibling::td/a/text()") - if producer_tag: - movie.producer = producer_tag[0].strip() - genre_tags = container.xpath("//td[text()='ジャンル:']/following-sibling::td/a[contains(@href,'article=keyword')]") - genre, genre_id = [], [] - for tag in genre_tags: - genre.append(tag.text.strip()) - genre_id.append(tag.get('href').split('=')[-1].strip('/')) - cid = container.xpath("//td[text()='品番:']/following-sibling::td/text()")[0].strip() - plot = container.xpath("//div[@class='mg-b20 lh4']/p")[0].text_content().strip() - preview_pics = container.xpath("//a[@name='sample-image']/img/@data-lazy") - score_img = container.xpath("//td[text()='平均評価:']/following-sibling::td/img/@src")[0] - score = int(score_img.split('/')[-1].split('.')[0]) # 00, 05 ... 
50 - - movie.cid = cid - movie.title = title - movie.cover = cover - movie.publish_date = publish_date - movie.genre = genre - movie.genre_id = genre_id - movie.plot = plot - movie.score = f'{score/5:.2f}' # 转换为10分制 - movie.preview_pics = preview_pics - movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片 - - -# parse_dvd_page = parse_videoa_page # 118wtktabf067 -parse_ppr_page = parse_videoa_page -parse_nikkatsu_page = parse_videoa_page -parse_doujin_page = parse_anime_page - - -if __name__ == "__main__": - import pretty_errors - pretty_errors.configure(display_link=True) - logger.root.handlers[1].level = logging.DEBUG - - movie = MovieInfo(cid='d_aisoft3356') - try: - parse_data(movie) - print(movie) - except CrawlerError as e: - logger.error(e, exc_info=1) diff --git a/javsp/web/fc2.py b/javsp/web/fc2.py deleted file mode 100644 index 66be7ae4e..000000000 --- a/javsp/web/fc2.py +++ /dev/null @@ -1,105 +0,0 @@ -"""从FC2官网抓取数据""" -import logging - - -from javsp.web.base import get_html, request_get, resp2html -from javsp.web.exceptions import * -from javsp.config import Cfg -from javsp.lib import strftime_to_minutes -from javsp.datatype import MovieInfo - - -logger = logging.getLogger(__name__) -base_url = 'https://adult.contents.fc2.com' - - -def get_movie_score(fc2_id): - """通过评论数据来计算FC2的影片评分(10分制),无法获得评分时返回None""" - html = get_html(f'{base_url}/article/{fc2_id}/review') - review_tags = html.xpath("//ul[@class='items_comment_headerReviewInArea']/li") - reviews = {} - for tag in review_tags: - score = int(tag.xpath("div/span/text()")[0]) - vote = int(tag.xpath("span")[0].text_content()) - reviews[score] = vote - total_votes = sum(reviews.values()) - if (total_votes >= 2): # 至少也该有两个人评价才有参考意义一点吧 - summary = sum([k*v for k, v in reviews.items()]) - final_score = summary / total_votes * 2 # 乘以2转换为10分制 - return final_score - - -def parse_data(movie: MovieInfo): - """解析指定番号的影片数据""" - # 去除番号中的'FC2'字样 - id_uc = movie.dvdid.upper() - if not id_uc.startswith('FC2-'): - raise ValueError('Invalid FC2 number: ' + movie.dvdid) - fc2_id = id_uc.replace('FC2-', '') - # 抓取网页 - url = f'{base_url}/article/{fc2_id}/' - resp = request_get(url) - if '/id.fc2.com/' in resp.url: - raise SiteBlocked('FC2要求当前IP登录账号才可访问,请尝试更换为日本IP') - html = resp2html(resp) - container = html.xpath("//div[@class='items_article_left']") - if len(container) > 0: - container = container[0] - else: - raise MovieNotFoundError(__name__, movie.dvdid) - # FC2 标题增加反爬乱码,使用数组合并标题 - title_arr = container.xpath("//div[@class='items_article_headerInfo']/h3/text()") - title = ''.join(title_arr) - thumb_tag = container.xpath("//div[@class='items_article_MainitemThumb']")[0] - thumb_pic = thumb_tag.xpath("span/img/@src")[0] - duration_str = thumb_tag.xpath("span/p[@class='items_article_info']/text()")[0] - # FC2没有制作商和发行商的区分,作为个人市场,影片页面的'by'更接近于制作商 - producer = container.xpath("//li[text()='by ']/a/text()")[0] - genre = container.xpath("//a[@class='tag tagTag']/text()") - date_str = container.xpath("//div[@class='items_article_Releasedate']/p/text()")[0] - publish_date = date_str[-10:].replace('/', '-') # '販売日 : 2017/11/30' - preview_pics = container.xpath("//ul[@data-feed='sample-images']/li/a/@href") - - if Cfg().crawler.hardworking: - # 通过评论数据来计算准确的评分 - score = get_movie_score(fc2_id) - if score: - movie.score = f'{score:.2f}' - # 预览视频是动态加载的,不在静态网页中 - desc_frame_url = container.xpath("//section[@class='items_article_Contents']/iframe/@src")[0] - key = desc_frame_url.split('=')[-1] # /widget/article/718323/description?ac=60fc08fa... 
- api_url = f'{base_url}/api/v2/videos/{fc2_id}/sample?key={key}' - r = request_get(api_url).json() - movie.preview_video = r['path'] - else: - # 获取影片评分。影片页面的评分只能粗略到星级,且没有分数,要通过类名来判断,如'items_article_Star5'表示5星 - score_tag_attr = container.xpath("//a[@class='items_article_Stars']/p/span/@class")[0] - score = int(score_tag_attr[-1]) * 2 - movie.score = f'{score:.2f}' - - movie.dvdid = id_uc - movie.url = url - movie.title = title - movie.genre = genre - movie.producer = producer - movie.duration = str(strftime_to_minutes(duration_str)) - movie.publish_date = publish_date - movie.preview_pics = preview_pics - # FC2的封面是220x220的,和正常封面尺寸、比例都差太多。如果有预览图片,则使用第一张预览图作为封面 - if movie.preview_pics: - movie.cover = preview_pics[0] - else: - movie.cover = thumb_pic - - -if __name__ == "__main__": - import pretty_errors - pretty_errors.configure(display_link=True) - logger.root.handlers[1].level = logging.DEBUG - - movie = MovieInfo('FC2-718323') - try: - parse_data(movie) - print(movie) - except CrawlerError as e: - logger.error(e, exc_info=1) diff --git a/javsp/web/fc2fan.py b/javsp/web/fc2fan.py deleted file mode 100644 index 229b3e3df..000000000 --- a/javsp/web/fc2fan.py +++ /dev/null @@ -1,80 +0,0 @@ -"""解析fc2fan本地镜像的数据""" -# FC2官网的影片下架就无法再抓取数据,如果用户有fc2fan的镜像,那可以尝试从镜像中解析影片数据 -import os -import re -import logging -import lxml.html -import requests - - -from javsp.web.base import resp2html -from javsp.web.exceptions import * -from javsp.config import Cfg -from javsp.datatype import MovieInfo - - -logger = logging.getLogger(__name__) -base_path = str(Cfg().crawler.fc2fan_local_path) -use_local_mirror = os.path.exists(base_path) - - -def parse_data(movie: MovieInfo): - """解析指定番号的影片数据""" - if use_local_mirror: - html_file = f'{base_path}/{movie.dvdid}.html' - if not os.path.exists(html_file): - raise MovieNotFoundError(__name__, movie.dvdid, html_file) - html = lxml.html.parse(html_file) - else: - url = f"https://fc2club.top/html/{movie.dvdid}.html" - r = requests.get(url) - if r.status_code == 404: - raise MovieNotFoundError(__name__, movie.dvdid) - elif r.text == '': - raise WebsiteError(f'fc2fan: 站点不可用 (HTTP {r.status_code}): {url}') - html = resp2html(r) - try: - container = html.xpath("//div[@class='col-sm-8']")[0] - except IndexError: - raise WebsiteError(f'fc2fan: 站点不可用') - title = container.xpath("h3/text()")[0] - score_str = container.xpath("h5/strong[text()='影片评分']")[0].tail.strip() - match = re.search(r'\d+', score_str) - if match: - score = int(match.group()) / 10 # fc2fan站长是按100分来打分的 - movie.score = f'{score:.1f}' - resource_info = container.xpath("h5/strong[text()='资源参数']")[0].tail - if '无码' in resource_info: - movie.uncensored = True - elif '有码' in resource_info: - movie.uncensored = False - # FC2没有制作商和发行商的区分,作为个人市场,卖家更接近于制作商 - producer = container.xpath("h5/strong[text()='卖家信息']")[0].getnext().text - if producer: - movie.producer = producer.strip() - genre = container.xpath("h5/strong[text()='影片标签']/../a/text()") - actress = container.xpath("h5/strong[text()='女优名字']/../a/text()") - preview_pics = container.xpath("//ul[@class='slides']/li/img/@src") - if use_local_mirror: - preview_pics = [os.path.normpath(os.path.join(base_path, i)) for i in preview_pics] - # big_preview = container.xpath("//img[@id='thumbpic']/../@href")[0] # 影片真实截图,目前暂时用不到 - - movie.title = title - movie.genre = genre - movie.actress = actress - if preview_pics: - movie.preview_pics = preview_pics - movie.cover = preview_pics[0] - - -if __name__ == "__main__": - import pretty_errors - 
pretty_errors.configure(display_link=True) - logger.root.handlers[1].level = logging.DEBUG - - movie = MovieInfo('FC2-1879420') - try: - parse_data(movie) - print(movie) - except CrawlerError as e: - logger.error(e, exc_info=1) diff --git a/javsp/web/fc2ppvdb.py b/javsp/web/fc2ppvdb.py deleted file mode 100644 index b0ad60892..000000000 --- a/javsp/web/fc2ppvdb.py +++ /dev/null @@ -1,76 +0,0 @@ -"""从FC2PPVDB抓取数据""" -import logging -from typing import List - - -from javsp.web.base import get_html -from javsp.web.exceptions import * -from javsp.lib import strftime_to_minutes -from javsp.datatype import MovieInfo - - -logger = logging.getLogger(__name__) -base_url = 'https://fc2ppvdb.com' - - -def parse_data(movie: MovieInfo): - """解析指定番号的影片数据""" - # 去除番号中的'FC2'字样 - id_uc = movie.dvdid.upper() - if not id_uc.startswith('FC2-'): - raise ValueError('Invalid FC2 number: ' + movie.dvdid) - fc2_id = id_uc.replace('FC2-', '') - # 抓取网页 - url = f'{base_url}/articles/{fc2_id}' - html = get_html(url) - container = html.xpath("//div[@class='container lg:px-5 px-2 py-12 mx-auto']/div[1]") - if len(container) > 0: - container = container[0] - else: - raise MovieNotFoundError(__name__, movie.dvdid) - - title = container.xpath("//h2/a/text()") - thumb_pic = container.xpath(f"//img[@alt='{fc2_id}']/@src") - duration_str = container.xpath("//div[starts-with(text(),'収録時間:')]/span/text()") - actress = container.xpath("//div[starts-with(text(),'女優:')]/span/a/text()") - genre = container.xpath("//div[starts-with(text(),'タグ:')]/span/a/text()") - publish_date = container.xpath("//div[starts-with(text(),'販売日:')]/span/text()") - publisher = container.xpath("//div[starts-with(text(),'販売者:')]/span/a/text()") - uncensored_str = container.xpath("//div[starts-with(text(),'モザイク:')]/span/text()") - uncensored_str_f = get_list_first(uncensored_str); - uncensored = True if uncensored_str_f == '無' else False if uncensored_str_f == '有' else None - preview_pics = None - preview_video = container.xpath("//a[starts-with(text(),'サンプル動画')]/@href") - - movie.dvdid = id_uc - movie.url = url - movie.title = get_list_first(title) - movie.genre = genre - movie.actress = actress - movie.duration = str(strftime_to_minutes(get_list_first(duration_str))) - movie.publish_date = get_list_first(publish_date) - movie.publisher = get_list_first(publisher) - movie.uncensored = uncensored - movie.preview_pics = preview_pics - movie.preview_video = get_list_first(preview_video) - - # FC2的封面是220x220的,和正常封面尺寸、比例都差太多。如果有预览图片,则使用第一张预览图作为封面 - if movie.preview_pics: - movie.cover = preview_pics[0] - else: - movie.cover = get_list_first(thumb_pic) - -def get_list_first(list:List): - return list[0] if list and len(list) > 0 else None - -if __name__ == "__main__": - import pretty_errors - pretty_errors.configure(display_link=True) - logger.root.handlers[1].level = logging.DEBUG - - movie = MovieInfo('FC2-4497837') - try: - parse_data(movie) - print(movie) - except CrawlerError as e: - logger.error(e, exc_info=1) diff --git a/javsp/web/gyutto.py b/javsp/web/gyutto.py deleted file mode 100644 index db7d6c795..000000000 --- a/javsp/web/gyutto.py +++ /dev/null @@ -1,87 +0,0 @@ -"""从https://gyutto.com/官网抓取数据""" -import logging -import time - -from javsp.web.base import resp2html, request_get -from javsp.web.exceptions import * -from javsp.datatype import MovieInfo - -logger = logging.getLogger(__name__) - -# https://dl.gyutto.com/i/item266923 -base_url = 'http://gyutto.com' -base_encode = 'euc-jp' - -def get_movie_title(html): - container = html.xpath("//h1") - 
if len(container) > 0: - container = container[0] - title = container.text - - return title - -def get_movie_img(html, index = 1): - images = [] - container = html.xpath("//a[@class='highslide']/img") - if len(container) > 0: - if index == 0: - return container[0].get('src') - - for row in container: - images.append(row.get('src')) - - return images - -def parse_data(movie: MovieInfo): - """解析指定番号的影片数据""" - # 去除番号中的'gyutto'字样 - id_uc = movie.dvdid.upper() - if not id_uc.startswith('GYUTTO-'): - raise ValueError('Invalid gyutto number: ' + movie.dvdid) - gyutto_id = id_uc.replace('GYUTTO-', '') - # 抓取网页 - url = f'{base_url}/i/item{gyutto_id}?select_uaflag=1' - r = request_get(url, delay_raise=True) - if r.status_code == 404: - raise MovieNotFoundError(__name__, movie.dvdid) - html = resp2html(r, base_encode) - container = html.xpath("//dl[@class='BasicInfo clearfix']") - - for row in container: - key = row.xpath(".//dt/text()") - if key[0] == "サークル": - producer = ''.join(row.xpath(".//dd/a/text()")) - elif key[0] == "ジャンル": - genre = row.xpath(".//dd/a/text()") - elif key[0] == "配信開始日": - date = row.xpath(".//dd/text()") - date_str = ''.join(date) - date_time = time.strptime(date_str, "%Y年%m月%d日") - publish_date = time.strftime("%Y-%m-%d", date_time) - - plot = html.xpath("//div[@class='unit_DetailLead']/p/text()")[0] - - movie.title = get_movie_title(html) - movie.cover = get_movie_img(html, 0) - movie.preview_pics = get_movie_img(html) - movie.dvdid = id_uc - movie.url = url - movie.producer = producer - # movie.actress = actress - # movie.duration = duration - movie.publish_date = publish_date - movie.genre = genre - movie.plot = plot - -if __name__ == "__main__": - import pretty_errors - - pretty_errors.configure(display_link=True) - logger.root.handlers[1].level = logging.DEBUG - movie = MovieInfo('gyutto-266923') - - try: - parse_data(movie) - print(movie) - except CrawlerError as e: - logger.error(e, exc_info=1) diff --git a/javsp/web/jav321.py b/javsp/web/jav321.py deleted file mode 100644 index 4e42617a5..000000000 --- a/javsp/web/jav321.py +++ /dev/null @@ -1,100 +0,0 @@ -"""从jav321抓取数据""" -import re -import logging - - -from javsp.web.base import post_html -from javsp.web.exceptions import * -from javsp.datatype import MovieInfo - - -logger = logging.getLogger(__name__) -base_url = 'https://www.jav321.com' - - -def parse_data(movie: MovieInfo): - """解析指定番号的影片数据""" - html = post_html(f'{base_url}/search', data={'sn': movie.dvdid}) - page_url = html.xpath("//ul[@class='dropdown-menu']/li/a/@href")[0] - #TODO: 注意cid是dmm的概念。如果影片来自MGSTAGE,这里的cid很可能是jav321自己添加的,例如 345SIMM-542 - cid = page_url.split('/')[-1] # /video/ipx00177 - # 如果从URL匹配到的cid是'search',说明还停留在搜索页面,找不到这部影片 - if cid == 'search': - raise MovieNotFoundError(__name__, movie.dvdid) - title = html.xpath("//div[@class='panel-heading']/h3/text()")[0] - info = html.xpath("//div[@class='col-md-9']")[0] - # jav321的不同信息字段间没有明显分隔,只能通过url来匹配目标标签 - company_tags = info.xpath("a[contains(@href,'/company/')]/text()") - if company_tags: - movie.producer = company_tags[0] - # actress, actress_pics - # jav321现在连女优信息都没有了,首页通过女优栏跳转过去也全是空白 - actress, actress_pics = [], {} - actress_tags = html.xpath("//div[@class='thumbnail']/a[contains(@href,'/star/')]/img") - for tag in actress_tags: - name = tag.tail.strip() - pic_url = tag.get('src') - actress.append(name) - # jav321的女优头像完全是应付了事:即使女优实际没有头像,也会有一个看起来像模像样的url, - # 因而无法通过url判断女优头像图片是否有效。有其他选择时最好不要使用jav321的女优头像数据 - actress_pics[name] = pic_url - # genre, genre_id - genre_tags = 
info.xpath("a[contains(@href,'/genre/')]") - genre, genre_id = [], [] - for tag in genre_tags: - genre.append(tag.text) - genre_id.append(tag.get('href').split('/')[-2]) # genre/4025/1 - dvdid = info.xpath("b[text()='品番']")[0].tail.replace(': ', '').upper() - publish_date = info.xpath("b[text()='配信開始日']")[0].tail.replace(': ', '') - duration_str = info.xpath("b[text()='収録時間']")[0].tail - match = re.search(r'\d+', duration_str) - if match: - movie.duration = match.group(0) - # 仅部分影片有评分且评分只能粗略到星级而没有分数,要通过星级的图片来判断,如'/img/35.gif'表示3.5星 - score_tag = info.xpath("//b[text()='平均評価']/following-sibling::img/@data-original") - if score_tag: - score = int(score_tag[0][5:7])/5 # /10*2 - movie.score = str(score) - serial_tag = info.xpath("a[contains(@href,'/series/')]/text()") - if serial_tag: - movie.serial = serial_tag[0] - preview_video_tag = info.xpath("//video/source/@src") - if preview_video_tag: - movie.preview_video = preview_video_tag[0] - plot_tag = info.xpath("//div[@class='panel-body']/div[@class='row']/div[@class='col-md-12']/text()") - if plot_tag: - movie.plot = plot_tag[0] - preview_pics = html.xpath("//div[@class='col-xs-12 col-md-12']/p/a/img[@class='img-responsive']/@src") - if len(preview_pics) == 0: - # 尝试搜索另一种布局下的封面,需要使用onerror过滤掉明明没有封面时网站往里面塞的默认URL - preview_pics = html.xpath("//div/div/div[@class='col-md-3']/img[@onerror and @class='img-responsive']/@src") - # 有的图片链接里有多个//,网站质量堪忧…… - preview_pics = [i[:8] + i[8:].replace('//', '/') for i in preview_pics] - # 磁力和ed2k链接是依赖js脚本加载的,无法通过静态网页来解析 - - movie.url = page_url - movie.cid = cid - movie.dvdid = dvdid - movie.title = title - movie.actress = actress - movie.actress_pics = actress_pics - movie.genre = genre - movie.genre_id = genre_id - movie.publish_date = publish_date - # preview_pics的第一张图始终是封面,剩下的才是预览图 - if len(preview_pics) > 0: - movie.cover = preview_pics[0] - movie.preview_pics = preview_pics[1:] - - -if __name__ == "__main__": - import pretty_errors - pretty_errors.configure(display_link=True) - logger.root.handlers[1].level = logging.DEBUG - - movie = MovieInfo('SCUTE-1177') - try: - parse_data(movie) - print(movie) - except CrawlerError as e: - logger.error(e, exc_info=1) diff --git a/javsp/web/javbus.py b/javsp/web/javbus.py deleted file mode 100644 index a98cd9974..000000000 --- a/javsp/web/javbus.py +++ /dev/null @@ -1,115 +0,0 @@ -"""从JavBus抓取数据""" -import logging - - -from javsp.web.base import * -from javsp.web.exceptions import * -from javsp.func import * -from javsp.config import Cfg, CrawlerID -from javsp.datatype import MovieInfo, GenreMap - - -logger = logging.getLogger(__name__) -genre_map = GenreMap('data/genre_javbus.csv') -permanent_url = 'https://www.javbus.com' -if Cfg().network.proxy_server is not None: - base_url = permanent_url -else: - base_url = str(Cfg().network.proxy_free[CrawlerID.javbus]) - - -def parse_data(movie: MovieInfo): - """从网页抓取并解析指定番号的数据 - Args: - movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内 - """ - url = f'{base_url}/{movie.dvdid}' - resp = request_get(url, delay_raise=True) - # 疑似JavBus检测到类似爬虫的行为时会要求登录,不过发现目前不需要登录也可以从重定向前的网页中提取信息 - if resp.history and resp.history[0].status_code == 302: - html = resp2html(resp.history[0]) - else: - html = resp2html(resp) - # 引入登录验证后状态码不再准确,因此还要额外通过检测标题来确认是否发生了404 - page_title = html.xpath('/html/head/title/text()') - if page_title and page_title[0].startswith('404 Page Not Found!'): - raise MovieNotFoundError(__name__, movie.dvdid) - - container = html.xpath("//div[@class='container']")[0] - title = container.xpath("h3/text()")[0] - cover = 
container.xpath("//a[@class='bigImage']/img/@src")[0] - preview_pics = container.xpath("//div[@id='sample-waterfall']/a/@href") - info = container.xpath("//div[@class='col-md-3 info']")[0] - dvdid = info.xpath("p/span[text()='識別碼:']")[0].getnext().text - publish_date = info.xpath("p/span[text()='發行日期:']")[0].tail.strip() - duration = info.xpath("p/span[text()='長度:']")[0].tail.replace('分鐘', '').strip() - director_tag = info.xpath("p/span[text()='導演:']") - if director_tag: # xpath没有匹配时将得到空列表 - movie.director = director_tag[0].getnext().text.strip() - producer_tag = info.xpath("p/span[text()='製作商:']") - if producer_tag: - text = producer_tag[0].getnext().text - if text: - movie.producer = text.strip() - publisher_tag = info.xpath("p/span[text()='發行商:']") - if publisher_tag: - movie.publisher = publisher_tag[0].getnext().text.strip() - serial_tag = info.xpath("p/span[text()='系列:']") - if serial_tag: - movie.serial = serial_tag[0].getnext().text - # genre, genre_id - genre_tags = info.xpath("//span[@class='genre']/label/a") - genre, genre_id = [], [] - for tag in genre_tags: - tag_url = tag.get('href') - pre_id = tag_url.split('/')[-1] - genre.append(tag.text) - if 'uncensored' in tag_url: - movie.uncensored = True - genre_id.append('uncensored-' + pre_id) - else: - movie.uncensored = False - genre_id.append(pre_id) - # JavBus的磁力链接是依赖js脚本加载的,无法通过静态网页来解析 - # actress, actress_pics - actress, actress_pics = [], {} - actress_tags = html.xpath("//a[@class='avatar-box']/div/img") - for tag in actress_tags: - name = tag.get('title') - pic_url = tag.get('src') - actress.append(name) - if not pic_url.endswith('nowprinting.gif'): # 略过默认的头像 - actress_pics[name] = pic_url - # 整理数据并更新movie的相应属性 - movie.url = f'{permanent_url}/{movie.dvdid}' - movie.dvdid = dvdid - movie.title = title.replace(dvdid, '').strip() - movie.cover = cover - movie.preview_pics = preview_pics - if publish_date != '0000-00-00': # 丢弃无效的发布日期 - movie.publish_date = publish_date - movie.duration = duration if int(duration) else None - movie.genre = genre - movie.genre_id = genre_id - movie.actress = actress - movie.actress_pics = actress_pics - - -def parse_clean_data(movie: MovieInfo): - """解析指定番号的影片数据并进行清洗""" - parse_data(movie) - movie.genre_norm = genre_map.map(movie.genre_id) - movie.genre_id = None # 没有别的地方需要再用到,清空genre id(暗示已经完成转换) - - -if __name__ == "__main__": - import pretty_errors - pretty_errors.configure(display_link=True) - logger.root.handlers[1].level = logging.DEBUG - - movie = MovieInfo('NANP-030') - try: - parse_clean_data(movie) - print(movie) - except CrawlerError as e: - logger.error(e, exc_info=1) diff --git a/javsp/web/javdb.py b/javsp/web/javdb.py deleted file mode 100644 index 5120aae76..000000000 --- a/javsp/web/javdb.py +++ /dev/null @@ -1,333 +0,0 @@ -"""从JavDB抓取数据""" -import os -import re -import logging - -from javsp.web.base import Request, resp2html -from javsp.web.exceptions import * -from javsp.func import * -from javsp.avid import guess_av_type -from javsp.config import Cfg, CrawlerID -from javsp.datatype import MovieInfo, GenreMap -from javsp.chromium import get_browsers_cookies - - -# 初始化Request实例。使用scraper绕过CloudFlare后,需要指定网页语言,否则可能会返回其他语言网页,影响解析 -request = Request(use_scraper=True) -request.headers['Accept-Language'] = 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en-US;q=0.7,en;q=0.6,ja;q=0.5' - -logger = logging.getLogger(__name__) -genre_map = GenreMap('data/genre_javdb.csv') -permanent_url = 'https://javdb.com' -if Cfg().network.proxy_server is not None: - base_url = permanent_url -else: - base_url = 
str(Cfg().network.proxy_free[CrawlerID.javdb]) - - -def get_html_wrapper(url): - """包装外发的request请求并负责转换为可xpath的html,同时处理Cookies无效等问题""" - global request, cookies_pool - r = request.get(url, delay_raise=True) - if r.status_code == 200: - # 发生重定向可能仅仅是域名重定向,因此还要检查url以判断是否被跳转到了登录页 - if r.history and '/login' in r.url: - # 仅在需要时去读取Cookies - if 'cookies_pool' not in globals(): - try: - cookies_pool = get_browsers_cookies() - except (PermissionError, OSError) as e: - logger.warning(f"无法从浏览器Cookies文件获取JavDB的登录凭据({e}),可能是安全软件在保护浏览器Cookies文件", exc_info=True) - cookies_pool = [] - except Exception as e: - logger.warning(f"获取JavDB的登录凭据时出错({e}),你可能使用的是国内定制版等非官方Chrome系浏览器", exc_info=True) - cookies_pool = [] - if len(cookies_pool) > 0: - item = cookies_pool.pop() - # 更换Cookies时需要创建新的request实例,否则cloudscraper会保留它内部第一次发起网络访问时获得的Cookies - request = Request(use_scraper=True) - request.cookies = item['cookies'] - cookies_source = (item['profile'], item['site']) - logger.debug(f'未携带有效Cookies而发生重定向,尝试更换Cookies为: {cookies_source}') - return get_html_wrapper(url) - else: - raise CredentialError('JavDB: 所有浏览器Cookies均已过期') - elif r.history and 'pay' in r.url.split('/')[-1]: - raise SitePermissionError(f"JavDB: 此资源被限制为仅VIP可见: '{r.history[0].url}'") - else: - html = resp2html(r) - return html - elif r.status_code in (403, 503): - html = resp2html(r) - code_tag = html.xpath("//span[@class='code-label']/span") - error_code = code_tag[0].text if code_tag else None - if error_code: - if error_code == '1020': - block_msg = f'JavDB: {r.status_code} 禁止访问: 站点屏蔽了来自日本地区的IP地址,请使用其他地区的代理服务器' - else: - block_msg = f'JavDB: {r.status_code} 禁止访问: {url} (Error code: {error_code})' - else: - block_msg = f'JavDB: {r.status_code} 禁止访问: {url}' - raise SiteBlocked(block_msg) - else: - raise WebsiteError(f'JavDB: {r.status_code} 非预期状态码: {url}') - - -def get_user_info(site, cookies): - """获取cookies对应的JavDB用户信息""" - try: - request.cookies = cookies - html = request.get_html(f'https://{site}/users/profile') - except Exception as e: - logger.info('JavDB: 获取用户信息时出错') - logger.debug(e, exc_info=1) - return - # 扫描浏览器得到的Cookies对应的临时域名可能会过期,因此需要先判断域名是否仍然指向JavDB的站点 - if 'JavDB' in html.text: - email = html.xpath("//div[@class='user-profile']/ul/li[1]/span/following-sibling::text()")[0].strip() - username = html.xpath("//div[@class='user-profile']/ul/li[2]/span/following-sibling::text()")[0].strip() - return email, username - else: - logger.debug('JavDB: 域名已过期: ' + site) - - -def get_valid_cookies(): - """扫描浏览器,获取一个可用的Cookies""" - # 经测试,Cookies所发往的域名不需要和登录时的域名保持一致,只要Cookies有效即可在多个域名间使用 - for d in cookies_pool: - info = get_user_info(d['site'], d['cookies']) - if info: - return d['cookies'] - else: - logger.debug(f"{d['profile']}, {d['site']}: Cookies无效") - - -def parse_data(movie: MovieInfo): - """从网页抓取并解析指定番号的数据 - Args: - movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内 - """ - # JavDB搜索番号时会有多个搜索结果,从中查找匹配番号的那个 - html = get_html_wrapper(f'{base_url}/search?q={movie.dvdid}') - ids = list(map(str.lower, html.xpath("//div[@class='video-title']/strong/text()"))) - movie_urls = html.xpath("//a[@class='box']/@href") - match_count = len([i for i in ids if i == movie.dvdid.lower()]) - if match_count == 0: - raise MovieNotFoundError(__name__, movie.dvdid, ids) - elif match_count == 1: - index = ids.index(movie.dvdid.lower()) - new_url = movie_urls[index] - try: - html2 = get_html_wrapper(new_url) - except (SitePermissionError, CredentialError): - # 不开VIP不让看,过分。决定榨出能获得的信息,毕竟有时候只有这里能找到标题和封面 - box = html.xpath("//a[@class='box']")[index] - movie.url = new_url - 
movie.title = box.get('title') - movie.cover = box.xpath("div/img/@src")[0] - score_str = box.xpath("div[@class='score']/span/span")[0].tail - score = re.search(r'([\d.]+)分', score_str).group(1) - movie.score = "{:.2f}".format(float(score)*2) - movie.publish_date = box.xpath("div[@class='meta']/text()")[0].strip() - return - else: - raise MovieDuplicateError(__name__, movie.dvdid, match_count) - - container = html2.xpath("/html/body/section/div/div[@class='video-detail']")[0] - info = container.xpath("//nav[@class='panel movie-panel-info']")[0] - title = container.xpath("h2/strong[@class='current-title']/text()")[0] - show_orig_title = container.xpath("//a[contains(@class, 'meta-link') and not(contains(@style, 'display: none'))]") - if show_orig_title: - movie.ori_title = container.xpath("h2/span[@class='origin-title']/text()")[0] - cover = container.xpath("//img[@class='video-cover']/@src")[0] - preview_pics = container.xpath("//a[@class='tile-item'][@data-fancybox='gallery']/@href") - preview_video_tag = container.xpath("//video[@id='preview-video']/source/@src") - if preview_video_tag: - preview_video = preview_video_tag[0] - if preview_video.startswith('//'): - preview_video = 'https:' + preview_video - movie.preview_video = preview_video - dvdid = info.xpath("div/span")[0].text_content() - publish_date = info.xpath("div/strong[text()='日期:']")[0].getnext().text - duration = info.xpath("div/strong[text()='時長:']")[0].getnext().text.replace('分鍾', '').strip() - director_tag = info.xpath("div/strong[text()='導演:']") - if director_tag: - movie.director = director_tag[0].getnext().text_content().strip() - av_type = guess_av_type(movie.dvdid) - if av_type != 'fc2': - producer_tag = info.xpath("div/strong[text()='片商:']") - else: - producer_tag = info.xpath("div/strong[text()='賣家:']") - if producer_tag: - movie.producer = producer_tag[0].getnext().text_content().strip() - publisher_tag = info.xpath("div/strong[text()='發行:']") - if publisher_tag: - movie.publisher = publisher_tag[0].getnext().text_content().strip() - serial_tag = info.xpath("div/strong[text()='系列:']") - if serial_tag: - movie.serial = serial_tag[0].getnext().text_content().strip() - score_tag = info.xpath("//span[@class='score-stars']") - if score_tag: - score_str = score_tag[0].tail - score = re.search(r'([\d.]+)分', score_str).group(1) - movie.score = "{:.2f}".format(float(score)*2) - genre_tags = info.xpath("//strong[text()='類別:']/../span/a") - genre, genre_id = [], [] - for tag in genre_tags: - pre_id = tag.get('href').split('/')[-1] - genre.append(tag.text) - genre_id.append(pre_id) - # 判定影片有码/无码 - subsite = pre_id.split('?')[0] - movie.uncensored = {'uncensored': True, 'tags':False}.get(subsite) - # JavDB目前同时提供男女优信息,根据用来标识性别的符号筛选出女优 - actors_tag = info.xpath("//strong[text()='演員:']/../span")[0] - all_actors = actors_tag.xpath("a/text()") - genders = actors_tag.xpath("strong/text()") - actress = [i for i in all_actors if genders[all_actors.index(i)] == '♀'] - magnet = container.xpath("//div[@class='magnet-name column is-four-fifths']/a/@href") - - movie.dvdid = dvdid - movie.url = new_url.replace(base_url, permanent_url) - movie.title = title.replace(dvdid, '').strip() - movie.cover = cover - movie.preview_pics = preview_pics - movie.publish_date = publish_date - movie.duration = duration - movie.genre = genre - movie.genre_id = genre_id - movie.actress = actress - movie.magnet = [i.replace('[javdb.com]','') for i in magnet] - - -def parse_clean_data(movie: MovieInfo): - """解析指定番号的影片数据并进行清洗""" - try: - parse_data(movie) - # 
检查封面URL是否真的存在对应图片 - if movie.cover is not None: - r = request.head(movie.cover) - if r.status_code != 200: - movie.cover = None - except SiteBlocked: - raise - logger.error('JavDB: 可能触发了反爬虫机制,请稍后再试') - if movie.genre_id and (not movie.genre_id[0].startswith('fc2?')): - movie.genre_norm = genre_map.map(movie.genre_id) - movie.genre_id = None # 没有别的地方需要再用到,清空genre id(表明已经完成转换) - - -def collect_actress_alias(type=0, use_original=True): - """ - 收集女优的别名 - type: 0-有码, 1-无码, 2-欧美 - use_original: 是否使用原名而非译名,True-田中レモン,False-田中檸檬 - """ - import json - import time - import random - - actressAliasMap = {} - - actressAliasFilePath = "data/actress_alias.json" - # 检查文件是否存在 - if not os.path.exists(actressAliasFilePath): - # 如果文件不存在,创建文件并写入空字典 - with open(actressAliasFilePath, "w", encoding="utf-8") as file: - json.dump({}, file) - - typeList = ["censored", "uncensored", "western"] - page_url = f"{base_url}/actors/{typeList[type]}" - while True: - try: - html = get_html_wrapper(page_url) - actors = html.xpath("//div[@class='box actor-box']/a") - - count = 0 - for actor in actors: - count += 1 - actor_name = actor.xpath("strong/text()")[0].strip() - actor_url = actor.xpath("@href")[0] - # actor_url = f"https://javdb.com{actor_url}" # 构造演员主页的完整URL - - # 进入演员主页,获取更多信息 - actor_html = get_html_wrapper(actor_url) - # 解析演员所有名字信息 - names_span = actor_html.xpath("//span[@class='actor-section-name']")[0] - aliases_span_list = actor_html.xpath("//span[@class='section-meta']") - aliases_span = aliases_span_list[0] - - names_list = [name.strip() for name in names_span.text.split(",")] - if len(aliases_span_list) > 1: - aliases_list = [ - alias.strip() for alias in aliases_span.text.split(",") - ] - else: - aliases_list = [] - - # 将信息添加到actressAliasMap中 - actressAliasMap[names_list[-1 if use_original else 0]] = ( - names_list + aliases_list - ) - print( - f"{count} --- {names_list[-1 if use_original else 0]}: {names_list + aliases_list}" - ) - - if count == 10: - # 将数据写回文件 - with open(actressAliasFilePath, "r", encoding="utf-8") as file: - existing_data = json.load(file) - - # 合并现有数据和新爬取的数据 - existing_data.update(actressAliasMap) - - # 将合并后的数据写回文件 - with open(actressAliasFilePath, "w", encoding="utf-8") as file: - json.dump(existing_data, file, ensure_ascii=False, indent=2) - - actressAliasMap = {} # 重置actressAliasMap - - print( - f"已爬取 {count} 个女优,数据已更新并写回文件:", - actressAliasFilePath, - ) - - # 重置计数器 - count = 0 - - time.sleep(max(1, 10 * random.random())) # 随机等待 1-10 秒 - - # 判断是否有下一页按钮 - next_page_link = html.xpath( - "//a[@rel='next' and @class='pagination-next']/@href" - ) - if not next_page_link: - break # 没有下一页,结束循环 - else: - next_page_url = f"{next_page_link[0]}" - page_url = next_page_url - - except SiteBlocked: - raise - - with open(actressAliasFilePath, "r", encoding="utf-8") as file: - existing_data = json.load(file) - - # 合并现有数据和新爬取的数据 - existing_data.update(actressAliasMap) - - # 将合并后的数据写回文件 - with open(actressAliasFilePath, "w", encoding="utf-8") as file: - json.dump(existing_data, file, ensure_ascii=False, indent=2) - - print(f"已爬取 {count} 个女优,数据已更新并写回文件:", actressAliasFilePath) - - -if __name__ == "__main__": - # collect_actress_alias() - movie = MovieInfo('FC2-2735981') - try: - parse_clean_data(movie) - print(movie) - except CrawlerError as e: - print(repr(e)) diff --git a/javsp/web/javlib.py b/javsp/web/javlib.py deleted file mode 100644 index 85f77b75f..000000000 --- a/javsp/web/javlib.py +++ /dev/null @@ -1,141 +0,0 @@ -"""从JavLibrary抓取数据""" -import logging -from urllib.parse import urlsplit - - 
-from javsp.web.base import Request, read_proxy, resp2html -from javsp.web.exceptions import * -from javsp.web.proxyfree import get_proxy_free_url -from javsp.config import Cfg, CrawlerID -from javsp.datatype import MovieInfo - - -# 初始化Request实例 -request = Request(use_scraper=True) - -logger = logging.getLogger(__name__) -permanent_url = 'https://www.javlibrary.com' -base_url = '' - - -def init_network_cfg(): - """设置合适的代理模式和base_url""" - request.timeout = 5 - proxy_free_url = get_proxy_free_url('javlib') - urls = [str(Cfg().network.proxy_free[CrawlerID.javlib]), permanent_url] - if proxy_free_url and proxy_free_url not in urls: - urls.insert(1, proxy_free_url) - # 使用代理容易触发IUAM保护,先尝试不使用代理访问 - proxy_cfgs = [{}, read_proxy()] if Cfg().network.proxy_server else [{}] - for proxies in proxy_cfgs: - request.proxies = proxies - for url in urls: - if proxies == {} and url == permanent_url: - continue - try: - resp = request.get(url, delay_raise=True) - if resp.status_code == 200: - request.timeout = Cfg().network.timeout.seconds - return url - except Exception as e: - logger.debug(f"Fail to connect to '{url}': {e}") - logger.warning('无法绕开JavLib的反爬机制') - request.timeout = Cfg().network.timeout.seconds - return permanent_url - - -# TODO: 发现JavLibrary支持使用cid搜索,会直接跳转到对应的影片页面,也许可以利用这个功能来做cid到dvdid的转换 -def parse_data(movie: MovieInfo): - """解析指定番号的影片数据""" - global base_url - if not base_url: - base_url = init_network_cfg() - logger.debug(f"JavLib网络配置: {base_url}, proxy={request.proxies}") - url = new_url = f'{base_url}/cn/vl_searchbyid.php?keyword={movie.dvdid}' - resp = request.get(url) - html = resp2html(resp) - if resp.history: - if urlsplit(resp.url).netloc == urlsplit(base_url).netloc: - # 出现301重定向通常且新老地址netloc相同时,说明搜索到了影片且只有一个结果 - new_url = resp.url - else: - # 重定向到了不同的netloc时,新地址并不是影片地址。这种情况下新地址中丢失了path字段, - # 为无效地址(应该是JavBus重定向配置有问题),需要使用新的base_url抓取数据 - base_url = 'https://' + urlsplit(resp.url).netloc - logger.warning(f"请将配置文件中的JavLib免代理地址更新为: {base_url}") - return parse_data(movie) - else: # 如果有多个搜索结果则不会自动跳转,此时需要程序介入选择搜索结果 - video_tags = html.xpath("//div[@class='video'][@id]/a") - # 通常第一部影片就是我们要找的,但是以免万一还是遍历所有搜索结果 - pre_choose = [] - for tag in video_tags: - tag_dvdid = tag.xpath("div[@class='id']/text()")[0] - if tag_dvdid.upper() == movie.dvdid.upper(): - pre_choose.append(tag) - pre_choose_urls = [i.get('href') for i in pre_choose] - match_count = len(pre_choose) - if match_count == 0: - raise MovieNotFoundError(__name__, movie.dvdid) - elif match_count == 1: - new_url = pre_choose_urls[0] - elif match_count == 2: - no_blueray = [] - for tag in pre_choose: - if 'ブルーレイディスク' not in tag.get('title'): # Blu-ray Disc - no_blueray.append(tag) - no_blueray_count = len(no_blueray) - if no_blueray_count == 1: - new_url = no_blueray[0].get('href') - logger.debug(f"'{movie.dvdid}': 存在{match_count}个同番号搜索结果,已自动选择封面比例正确的一个: {new_url}") - else: - # 两个结果中没有谁是蓝光影片,说明影片番号重复了 - raise MovieDuplicateError(__name__, movie.dvdid, match_count, pre_choose_urls) - else: - # 存在不同影片但是番号相同的情况,如MIDV-010 - raise MovieDuplicateError(__name__, movie.dvdid, match_count, pre_choose_urls) - # 重新抓取网页 - html = request.get_html(new_url) - container = html.xpath("/html/body/div/div[@id='rightcolumn']")[0] - title_tag = container.xpath("div/h3/a/text()") - title = title_tag[0] - cover = container.xpath("//img[@id='video_jacket_img']/@src")[0] - info = container.xpath("//div[@id='video_info']")[0] - dvdid = info.xpath("div[@id='video_id']//td[@class='text']/text()")[0] - publish_date = 
info.xpath("div[@id='video_date']//td[@class='text']/text()")[0] - duration = info.xpath("div[@id='video_length']//span[@class='text']/text()")[0] - director_tag = info.xpath("//span[@class='director']/a/text()") - if director_tag: - movie.director = director_tag[0] - producer = info.xpath("//span[@class='maker']/a/text()")[0] - publisher_tag = info.xpath("//span[@class='label']/a/text()") - if publisher_tag: - movie.publisher = publisher_tag[0] - score_tag = info.xpath("//span[@class='score']/text()") - if score_tag: - movie.score = score_tag[0].strip('()') - genre = info.xpath("//span[@class='genre']/a/text()") - actress = info.xpath("//span[@class='star']/a/text()") - - movie.dvdid = dvdid - movie.url = new_url.replace(base_url, permanent_url) - movie.title = title.replace(dvdid, '').strip() - if cover.startswith('//'): # 补全URL中缺少的协议段 - cover = 'https:' + cover - movie.cover = cover - movie.publish_date = publish_date - movie.duration = duration - movie.producer = producer - movie.genre = genre - movie.actress = actress - - -if __name__ == "__main__": - import pretty_errors - pretty_errors.configure(display_link=True) - base_url = permanent_url - movie = MovieInfo('IPX-177') - try: - parse_data(movie) - print(movie) - except CrawlerError as e: - print(e) diff --git a/javsp/web/javmenu.py b/javsp/web/javmenu.py deleted file mode 100644 index 5296a69cd..000000000 --- a/javsp/web/javmenu.py +++ /dev/null @@ -1,88 +0,0 @@ -"""从JavMenu抓取数据""" -import logging - -from javsp.web.base import Request, resp2html -from javsp.web.exceptions import * -from javsp.datatype import MovieInfo - - -request = Request() - -logger = logging.getLogger(__name__) -base_url = 'https://mrzyx.xyz' - - -def parse_data(movie: MovieInfo): - """从网页抓取并解析指定番号的数据 - Args: - movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内 - """ - # JavMenu网页做得很不走心,将就了 - url = f'{base_url}/{movie.dvdid}' - r = request.get(url) - if r.history: - # 被重定向到主页说明找不到影片资源 - raise MovieNotFoundError(__name__, movie.dvdid) - - html = resp2html(r) - container = html.xpath("//div[@class='col-md-9 px-0']")[0] - title = container.xpath("div[@class='col-12 mb-3']/h1/strong/text()")[0] - # 竟然还在标题里插广告,真的疯了。要不是我已经写了抓取器,才懒得维护这个破站 - title = title.replace(' | JAV目錄大全 | 每日更新', '') - title = title.replace(' 免費在線看', '').replace(' 免費AV在線看', '') - cover_tag = container.xpath("//div[@class='single-video']") - if len(cover_tag) > 0: - video_tag = cover_tag[0].find('video') - # URL首尾竟然也有空格…… - movie.cover = video_tag.get('data-poster').strip() - # 预览影片改为blob了,无法获取 - # movie.preview_video = video_tag.find('source').get('src').strip() - else: - cover_img_tag = container.xpath("//img[@class='lazy rounded']/@data-src") - if cover_img_tag: - movie.cover = cover_img_tag[0].strip() - info = container.xpath("//div[@class='card-body']")[0] - publish_date = info.xpath("div/span[contains(text(), '日期:')]")[0].getnext().text - duration = info.xpath("div/span[contains(text(), '時長:')]")[0].getnext().text.replace('分鐘', '') - producer = info.xpath("div/span[contains(text(), '製作:')]/following-sibling::a/span/text()") - if producer: - movie.producer = producer[0] - genre_tags = info.xpath("//a[@class='genre']") - genre, genre_id = [], [] - for tag in genre_tags: - items = tag.get('href').split('/') - pre_id = items[-3] + '/' + items[-1] - genre.append(tag.text.strip()) - genre_id.append(pre_id) - # genre的链接中含有censored字段,但是无法用来判断影片是否有码,因为完全不可靠…… - actress = info.xpath("div/span[contains(text(), '女優:')]/following-sibling::*/a/text()") or None - magnet_table = 
container.xpath("//table[contains(@class, 'magnet-table')]/tbody") - if magnet_table: - magnet_links = magnet_table[0].xpath("tr/td/a/@href") - # 它的FC2数据是从JavDB抓的,JavDB更换图片服务器后它也跟上了,似乎数据更新频率还可以 - movie.magnet = [i.replace('[javdb.com]','') for i in magnet_links] - preview_pics = container.xpath("//a[@data-fancybox='gallery']/@href") - - if (not movie.cover) and preview_pics: - movie.cover = preview_pics[0] - movie.url = url - movie.title = title.replace(movie.dvdid, '').strip() - movie.preview_pics = preview_pics - movie.publish_date = publish_date - movie.duration = duration - movie.genre = genre - movie.genre_id = genre_id - movie.actress = actress - - -if __name__ == "__main__": - import pretty_errors - pretty_errors.configure(display_link=True) - logger.root.handlers[1].level = logging.DEBUG - - movie = MovieInfo('FC2-718323') - try: - parse_data(movie) - print(movie) - except CrawlerError as e: - logger.error(e, exc_info=1) diff --git a/javsp/web/mgstage.py b/javsp/web/mgstage.py deleted file mode 100644 index 4904e51db..000000000 --- a/javsp/web/mgstage.py +++ /dev/null @@ -1,114 +0,0 @@ -"""从蚊香社-mgstage抓取数据""" -import re -import logging - - -from javsp.web.base import Request, resp2html -from javsp.web.exceptions import * -from javsp.config import Cfg -from javsp.datatype import MovieInfo - - -logger = logging.getLogger(__name__) -base_url = 'https://www.mgstage.com' -# 初始化Request实例(要求携带已通过R18认证的cookies,否则会被重定向到认证页面) -request = Request() -request.cookies = {'adc': '1'} - - -def parse_data(movie: MovieInfo): - """解析指定番号的影片数据""" - url = f'{base_url}/product/product_detail/{movie.dvdid}/' - resp = request.get(url, delay_raise=True) - if resp.status_code == 403: - raise SiteBlocked('mgstage不允许从当前IP所在地区访问,请尝试更换为日本地区代理') - # url不存在时会被重定向至主页。history非空时说明发生了重定向 - elif resp.history: - raise MovieNotFoundError(__name__, movie.dvdid) - - html = resp2html(resp) - # mgstage的文本中含有大量的空白字符('\n \t'),需要使用strip去除 - title = html.xpath("//div[@class='common_detail_cover']/h1/text()")[0].strip() - container = html.xpath("//div[@class='detail_left']")[0] - cover = container.xpath("//a[@id='EnlargeImage']/@href")[0] - # 有链接的女优和仅有文本的女优匹配方法不同,因此分别匹配以后合并列表 - actress_text = container.xpath("//th[text()='出演:']/following-sibling::td/text()") - actress_link = container.xpath("//th[text()='出演:']/following-sibling::td/a/text()") - actress = [i.strip() for i in actress_text + actress_link] - actress = [i for i in actress if i] # 移除空字符串 - producer = container.xpath("//th[text()='メーカー:']/following-sibling::td/a/text()")[0].strip() - duration_str = container.xpath("//th[text()='収録時間:']/following-sibling::td/text()")[0] - match = re.search(r'\d+', duration_str) - if match: - movie.duration = match.group(0) - dvdid = container.xpath("//th[text()='品番:']/following-sibling::td/text()")[0] - date_str = container.xpath("//th[text()='配信開始日:']/following-sibling::td/text()")[0] - publish_date = date_str.replace('/', '-') - serial_tag = container.xpath("//th[text()='シリーズ:']/following-sibling::td/a/text()") - if serial_tag: - movie.serial = serial_tag[0].strip() - # label: 大意是某个系列策划用同样的番号,例如ABS打头的番号label是'ABSOLUTELY PERFECT',暂时用不到 - # label = container.xpath("//th[text()='レーベル:']/following-sibling::td/text()")[0].strip() - genre_tags = container.xpath("//th[text()='ジャンル:']/following-sibling::td/a") - genre = [i.text.strip() for i in genre_tags] - score_str = container.xpath("//td[@class='review']/span")[0].tail.strip() - match = re.search(r'^[\.\d]+', score_str) - if match: - score = float(match.group()) * 2 - movie.score = 
f'{score:.2f}' - # plot可能含有嵌套格式,为了保留plot中的换行关系,手动处理plot中的各个标签 - plots = [] - plot_p_tags = container.xpath("//dl[@id='introduction']/dd/p[not(@class='more')]") - for p in plot_p_tags: - children = p.getchildren() - # 没有children时表明plot不含有格式,此时简单地提取文本就可以 - if not children: - plots.append(p.text_content()) - continue - for child in children: - if child.tag == 'br' and plots[-1] != '\n': - plots.append('\n') - else: - if child.text: - plots.append(child.text) - if child.tail: - plots.append(child.tail) - plot = ''.join(plots).strip() - preview_pics = container.xpath("//a[@class='sample_image']/@href") - - if Cfg().crawler.hardworking: - # 预览视频是点击按钮后再加载的,不在静态网页中 - btn_url = container.xpath("//a[@class='button_sample']/@href")[0] - video_pid = btn_url.split('/')[-1] - req_url = f'{base_url}/sampleplayer/sampleRespons.php?pid={video_pid}' - resp = request.get(req_url).json() - video_url = resp.get('url') - if video_url: - # /sample/shirouto/siro/3093/SIRO-3093_sample.ism/request?uid=XXX&pid=XXX - preview_video = video_url.split('.ism/')[0] + '.mp4' - movie.preview_video = preview_video - - movie.dvdid = dvdid - movie.url = url - movie.title = title - movie.cover = cover - movie.actress = actress - movie.producer = producer - movie.publish_date = publish_date - movie.genre = genre - movie.plot = plot - movie.preview_pics = preview_pics - movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片 - - -if __name__ == "__main__": - import pretty_errors - pretty_errors.configure(display_link=True) - logger.root.handlers[1].level = logging.DEBUG - - movie = MovieInfo('HRV-045') - try: - parse_data(movie) - print(movie) - except CrawlerError as e: - logger.error(e, exc_info=1) diff --git a/javsp/web/njav.py b/javsp/web/njav.py deleted file mode 100644 index f94e943f3..000000000 --- a/javsp/web/njav.py +++ /dev/null @@ -1,134 +0,0 @@ -"""从NJAV抓取数据""" -import re -import logging -from typing import List - - -from javsp.web.base import get_html -from javsp.web.exceptions import * -from javsp.lib import strftime_to_minutes -from javsp.datatype import MovieInfo - - -logger = logging.getLogger(__name__) -base_url = 'https://njav.tv/ja' - -def search_video(movie: MovieInfo): - id_uc = movie.dvdid - # 抓取网页 - url = f'{base_url}/search?keyword={id_uc}' - html = get_html(url) - list = html.xpath("//div[@class='box-item']/div[@class='detail']/a") - video_url = None - for item in list: - search_title = item.xpath("text()")[0] - if id_uc in search_title: - video_url = item.xpath("@href") - break - if id_uc.startswith("FC2-"): - fc2id = id_uc.replace('FC2-', '') - if "FC2" in search_title and fc2id in search_title: - video_url = item.xpath("@href") - break - - return get_list_first(video_url) - -def parse_data(movie: MovieInfo): - """解析指定番号的影片数据""" - # 抓取网页 - url = search_video(movie) - if not url: - raise MovieNotFoundError(__name__, movie.dvdid) - html = get_html(url) - container = html.xpath("//div[@class='container']/div/div[@class='col']") - if len(container) > 0: - container = container[0] - else: - raise MovieNotFoundError(__name__, movie.dvdid) - - title = container.xpath("//div[@class='d-flex justify-content-between align-items-start']/div/h1/text()")[0] - thumb_pic = container.xpath("//div[@id='player']/@data-poster") - plot = " ".join(container.xpath("//div[@class='description']/p/text()")) - magnet = container.xpath("//div[@class='magnet']/a/@href") - real_id = None - publish_date = None - duration_str = None - uncensored = None - preview_pics = None - preview_video = None - serial = None - publisher = None - 
producer = None - genre = [] - actress = [] - - detail_dic = {} - for item in container.xpath("//div[@class='detail-item']/div"): - item_title = item.xpath('span/text()')[0] - if "タグ:" in item_title: - genre += item.xpath("span")[1].xpath("a/text()") - elif "ジャンル:" in item_title: - genre += item.xpath("span")[1].xpath("a/text()") - elif "レーベル:" in item_title: - genre += item.xpath("span")[1].xpath("a/text()") - elif "女優:" in item_title: - actress = item.xpath("span")[1].xpath("a/text()") - elif "シリーズ:" in item_title: - serial = get_list_first(item.xpath("span")[1].xpath("a/text()")) - elif "メーカー:" in item_title: - producer = get_list_first(item.xpath("span")[1].xpath("a/text()")) - elif "コード:" in item_title: - real_id = get_list_first(item.xpath("span")[1].xpath("text()")) - elif "公開日:" in item_title: - publish_date = get_list_first(item.xpath("span")[1].xpath("text()")) - elif "再生時間:" in item_title: - duration_str = get_list_first(item.xpath("span")[1].xpath("text()")) - - # 清除标题里的番号字符 - keywords = [real_id, " "] - if movie.dvdid.startswith("FC2"): - keywords += ["FC2","PPV","-"] + [movie.dvdid.split("-")[-1]] - for keyword in keywords: - title = re.sub(re.escape(keyword), "", title, flags=re.I) - - # 判断是否无码 - uncensored_arr = magnet + [title] - for uncensored_str in uncensored_arr: - if 'uncensored' in uncensored_str.lower(): - uncensored = True - - movie.url = url - movie.title = title - movie.genre = genre - movie.actress = actress - movie.duration = str(strftime_to_minutes(duration_str)) - movie.publish_date = publish_date - movie.publisher = publisher - movie.producer = producer - movie.uncensored = uncensored - movie.preview_pics = preview_pics - movie.preview_video = preview_video - movie.plot = plot - movie.serial = serial - movie.magnet = magnet - - # FC2的封面是220x220的,和正常封面尺寸、比例都差太多。如果有预览图片,则使用第一张预览图作为封面 - if movie.preview_pics: - movie.cover = preview_pics[0] - else: - movie.cover = get_list_first(thumb_pic) - -def get_list_first(list:List): - return list[0] if list and len(list) > 0 else None - -if __name__ == "__main__": - import pretty_errors - pretty_errors.configure(display_link=True) - logger.root.handlers[1].level = logging.DEBUG - - movie = MovieInfo('012023_002') - try: - parse_data(movie) - print(movie) - except CrawlerError as e: - logger.error(e, exc_info=1) diff --git a/javsp/web/prestige.py b/javsp/web/prestige.py deleted file mode 100644 index f6884c658..000000000 --- a/javsp/web/prestige.py +++ /dev/null @@ -1,83 +0,0 @@ -"""从蚊香社-prestige抓取数据""" -import re -import logging - - -from javsp.web.base import * -from javsp.web.exceptions import * -from javsp.datatype import MovieInfo - - -logger = logging.getLogger(__name__) -base_url = 'https://www.prestige-av.com' -# prestige要求访问者携带已通过R18认证的cookies才能够获得完整数据,否则会被重定向到认证页面 -# (其他多数网站的R18认证只是在网页上遮了一层,完整数据已经传回,不影响爬虫爬取) -cookies = {'__age_auth__': 'true'} - - -def parse_data(movie: MovieInfo): - """从网页抓取并解析指定番号的数据 - Args: - movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内 - """ - url = f'{base_url}/goods/goods_detail.php?sku={movie.dvdid}' - resp = request_get(url, cookies=cookies, delay_raise=True) - if resp.status_code == 500: - # 500错误表明prestige没有这部影片的数据,不是网络问题,因此不再重试 - raise MovieNotFoundError(__name__, movie.dvdid) - elif resp.status_code == 403: - raise SiteBlocked('prestige不允许从当前IP所在地区访问,请尝试更换为日本地区代理') - resp.raise_for_status() - html = resp2html(resp) - container_tags = html.xpath("//section[@class='px-4 mb-4 md:px-8 md:mb-16']") - if not container_tags: - raise MovieNotFoundError(__name__, movie.dvdid) - - container = 
container_tags[0] - title = container.xpath("h1/span")[0].tail.strip() - cover = container.xpath("//div[@class='c-ratio-image mr-8']/picture/source/img/@src")[0] - cover = cover.split('?')[0] - actress = container.xpath("//p[text()='出演者:']/following-sibling::div/p/a/text()") - # 移除女优名中的空格,使女优名与其他网站保持一致 - actress = [i.strip().replace(' ', '') for i in actress] - duration_str = container.xpath("//p[text()='収録時間:']")[0].getnext().text_content() - match = re.search(r'\d+', duration_str) - if match: - movie.duration = match.group(0) - date_url = container.xpath("//p[text()='発売日:']/following-sibling::div/a/@href")[0] - publish_date = date_url.split('?date=')[-1] - producer = container.xpath("//p[text()='メーカー:']/following-sibling::div/a/text()")[0].strip() - dvdid = container.xpath("//p[text()='品番:']/following-sibling::div/p/text()")[0] - genre_tags = container.xpath("//p[text()='ジャンル:']/following-sibling::div/a") - genre = [tag.text.strip() for tag in genre_tags] - serial = container.xpath("//p[text()='レーベル:']/following-sibling::div/a/text()")[0].strip() - plot = container.xpath("//h2[text()='商品紹介']/following-sibling::p")[0].text.strip() - preview_pics = container.xpath("//h2[text()='サンプル画像']/following-sibling::div/div/picture/source/img/@src") - preview_pics = [i.split('?')[0] for i in preview_pics] - - # prestige改版后已经无法获取高清封面,此前已经获取的高清封面地址也已失效 - movie.url = url - movie.dvdid = dvdid - movie.title = title - movie.cover = cover - movie.actress = actress - movie.publish_date = publish_date - movie.producer = producer - movie.genre = genre - movie.serial = serial - movie.plot = plot - movie.preview_pics = preview_pics - movie.uncensored = False # prestige服务器在日本且面向日本国内公开发售,不会包含无码片 - - -if __name__ == "__main__": - import pretty_errors - pretty_errors.configure(display_link=True) - logger.root.handlers[1].level = logging.DEBUG - - movie = MovieInfo('ABP-647') - try: - parse_data(movie) - print(movie) - except CrawlerError as e: - logger.error(e, exc_info=1) diff --git a/javsp/web/proxyfree.py b/javsp/web/proxyfree.py deleted file mode 100644 index 89c1e63a4..000000000 --- a/javsp/web/proxyfree.py +++ /dev/null @@ -1,75 +0,0 @@ -"""获取各个网站的免代理地址""" -import re -import sys - -from javsp.web.base import is_connectable, get_html, get_resp_text, request_get - - -def get_proxy_free_url(site_name: str, prefer_url=None) -> str: - """获取指定网站的免代理地址 - Args: - site_name (str): 站点名称 - prefer_url (str, optional): 优先测试此url是否可用 - Returns: - str: 指定站点的免代理地址(失败时为空字符串) - """ - if prefer_url and is_connectable(prefer_url, timeout=5): - return prefer_url - # 当prefer_url不可用时,尝试自动获取指定网站的免代理地址 - site_name = site_name.lower() - func_name = f'_get_{site_name}_urls' - get_funcs = [i for i in dir(sys.modules[__name__]) if i.startswith('_get_')] - if func_name in get_funcs: - get_urls = getattr(sys.modules[__name__], func_name) - try: - urls = get_urls() - return _choose_one(urls) - except: - return '' - else: - raise Exception("Dont't know how to get proxy-free url for " + site_name) - - -def _choose_one(urls) -> str: - for url in urls: - if is_connectable(url, timeout=5): - return url - return '' - - -def _get_avsox_urls() -> list: - html = get_html('https://tellme.pw/avsox') - urls = html.xpath('//h4/strong/a/@href') - return urls - - -def _get_javbus_urls() -> list: - html = get_html('https://www.javbus.one/') - text = html.text_content() - urls = re.findall(r'防屏蔽地址:(https://(?:[\d\w][-\d\w]{1,61}[\d\w]\.){1,2}[a-z]{2,})', text, re.I | re.A) - return urls - - -def _get_javlib_urls() -> list: - html = 
get_html('https://github.com/javlibcom') - text = html.xpath("//div[@class='p-note user-profile-bio mb-3 js-user-profile-bio f4']")[0].text_content() - match = re.search(r'[\w\.]+', text, re.A) - if match: - domain = f'https://www.{match.group(0)}.com' - return [domain] - - -def _get_javdb_urls() -> list: - html = get_html('https://jav524.app') - js_links = html.xpath("//script[@src]/@src") - for link in js_links: - if '/js/index' in link: - text = get_resp_text(request_get(link)) - match = re.search(r'\$officialUrl\s*=\s*"(https://(?:[\d\w][-\d\w]{1,61}[\d\w]\.){1,2}[a-z]{2,})"', text, flags=re.I | re.A) - if match: - return [match.group(1)] - - -if __name__ == "__main__": - print('javdb:\t', _get_javdb_urls()) - print('javlib:\t', _get_javlib_urls()) diff --git a/poetry.lock b/poetry.lock index 1c92293a3..7e17536c5 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,5 +1,21 @@ # This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +[[package]] +name = "aiofiles" +version = "24.1.0" +description = "File support for asyncio." +optional = false +python-versions = ">=3.8" +files = [ + {file = "aiofiles-24.1.0-py3-none-any.whl", hash = "sha256:b4ec55f4195e3eb5d7abd1bf7e061763e864dd4954231fb8539a0ef8bb8260e5"}, + {file = "aiofiles-24.1.0.tar.gz", hash = "sha256:22a075c9e5a3810f0c2e48f3008c94d68c65d763b9b03857924c99e57355166c"}, +] + +[package.source] +type = "legacy" +url = "https://pypi.tuna.tsinghua.edu.cn/simple" +reference = "mirrors" + [[package]] name = "annotated-types" version = "0.7.0" @@ -16,6 +32,33 @@ type = "legacy" url = "https://pypi.tuna.tsinghua.edu.cn/simple" reference = "mirrors" +[[package]] +name = "anyio" +version = "4.6.0" +description = "High level compatibility layer for multiple asynchronous event loop implementations" +optional = false +python-versions = ">=3.9" +files = [ + {file = "anyio-4.6.0-py3-none-any.whl", hash = "sha256:c7d2e9d63e31599eeb636c8c5c03a7e108d73b345f064f1c19fdc87b79036a9a"}, + {file = "anyio-4.6.0.tar.gz", hash = "sha256:137b4559cbb034c477165047febb6ff83f390fc3b20bf181c1fc0a728cb8beeb"}, +] + +[package.dependencies] +exceptiongroup = {version = ">=1.0.2", markers = "python_version < \"3.11\""} +idna = ">=2.8" +sniffio = ">=1.1" +typing-extensions = {version = ">=4.1", markers = "python_version < \"3.11\""} + +[package.extras] +doc = ["Sphinx (>=7.4,<8.0)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"] +test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.21.0b1)"] +trio = ["trio (>=0.26.1)"] + +[package.source] +type = "legacy" +url = "https://pypi.tuna.tsinghua.edu.cn/simple" +reference = "mirrors" + [[package]] name = "certifi" version = "2024.8.30" @@ -570,6 +613,79 @@ type = "legacy" url = "https://pypi.tuna.tsinghua.edu.cn/simple" reference = "mirrors" +[[package]] +name = "h11" +version = "0.14.0" +description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" +optional = false +python-versions = ">=3.7" +files = [ + {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"}, + {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, +] + +[package.source] +type = "legacy" +url = "https://pypi.tuna.tsinghua.edu.cn/simple" +reference = "mirrors" + +[[package]] +name = "httpcore" +version = "1.0.5" +description = "A minimal 
low-level HTTP client." +optional = false +python-versions = ">=3.8" +files = [ + {file = "httpcore-1.0.5-py3-none-any.whl", hash = "sha256:421f18bac248b25d310f3cacd198d55b8e6125c107797b609ff9b7a6ba7991b5"}, + {file = "httpcore-1.0.5.tar.gz", hash = "sha256:34a38e2f9291467ee3b44e89dd52615370e152954ba21721378a87b2960f7a61"}, +] + +[package.dependencies] +certifi = "*" +h11 = ">=0.13,<0.15" + +[package.extras] +asyncio = ["anyio (>=4.0,<5.0)"] +http2 = ["h2 (>=3,<5)"] +socks = ["socksio (==1.*)"] +trio = ["trio (>=0.22.0,<0.26.0)"] + +[package.source] +type = "legacy" +url = "https://pypi.tuna.tsinghua.edu.cn/simple" +reference = "mirrors" + +[[package]] +name = "httpx" +version = "0.27.2" +description = "The next generation HTTP client." +optional = false +python-versions = ">=3.8" +files = [ + {file = "httpx-0.27.2-py3-none-any.whl", hash = "sha256:7bb2708e112d8fdd7829cd4243970f0c223274051cb35ee80c03301ee29a3df0"}, + {file = "httpx-0.27.2.tar.gz", hash = "sha256:f7c2be1d2f3c3c3160d441802406b206c2b76f5947b11115e6df10c6c65e66c2"}, +] + +[package.dependencies] +anyio = "*" +certifi = "*" +httpcore = "==1.*" +idna = "*" +sniffio = "*" +socksio = {version = "==1.*", optional = true, markers = "extra == \"socks\""} + +[package.extras] +brotli = ["brotli", "brotlicffi"] +cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"] +http2 = ["h2 (>=3,<5)"] +socks = ["socksio (==1.*)"] +zstd = ["zstandard (>=0.18.0)"] + +[package.source] +type = "legacy" +url = "https://pypi.tuna.tsinghua.edu.cn/simple" +reference = "mirrors" + [[package]] name = "idna" version = "3.10" @@ -1625,13 +1741,13 @@ reference = "mirrors" [[package]] name = "requests" -version = "2.31.0" +version = "2.32.3" description = "Python HTTP for Humans." optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"}, - {file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"}, + {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"}, + {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"}, ] [package.dependencies] @@ -1748,6 +1864,38 @@ type = "legacy" url = "https://pypi.tuna.tsinghua.edu.cn/simple" reference = "mirrors" +[[package]] +name = "sniffio" +version = "1.3.1" +description = "Sniff out which async library your code is running under" +optional = false +python-versions = ">=3.7" +files = [ + {file = "sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2"}, + {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"}, +] + +[package.source] +type = "legacy" +url = "https://pypi.tuna.tsinghua.edu.cn/simple" +reference = "mirrors" + +[[package]] +name = "socksio" +version = "1.0.0" +description = "Sans-I/O implementation of SOCKS4, SOCKS4A, and SOCKS5." 
+optional = false +python-versions = ">=3.6" +files = [ + {file = "socksio-1.0.0-py3-none-any.whl", hash = "sha256:95dc1f15f9b34e8d7b16f06d74b8ccf48f609af32ab33c608d08761c5dcbb1f3"}, + {file = "socksio-1.0.0.tar.gz", hash = "sha256:f88beb3da5b5c38b9890469de67d0cb0f9d494b78b106ca1845f96c10b91c4ac"}, +] + +[package.source] +type = "legacy" +url = "https://pypi.tuna.tsinghua.edu.cn/simple" +reference = "mirrors" + [[package]] name = "time-machine" version = "2.15.0" @@ -2041,4 +2189,4 @@ reference = "mirrors" [metadata] lock-version = "2.0" python-versions = "<3.13,>=3.10" -content-hash = "056b2f7a21b0286a04a5ecadb809f6472c636348fe07976ac42c9c47c620f04c" +content-hash = "6ac810d36d51220ff82224b4d5a814b2624a7ee8ac2091a7cb4b790f16adc578" diff --git a/pyproject.toml b/pyproject.toml index a5e1b4d10..aa123f5bb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,6 @@ cloudscraper = "1.2.71" colorama = "0.4.4" pillow = "10.2.0" pretty-errors = "1.2.19" -requests = "2.31.0" tqdm = "4.59.0" # https://stackoverflow.com/questions/446209/possible-values-from-sys-platform pywin32 = {version = "^306", markers = "sys_platform == 'win32'"} @@ -29,6 +28,8 @@ confz = "^2.0.1" pydantic-extra-types = "^2.9.0" pendulum = "^3.0.0" slimeface = "^2024.9.27" +httpx = {extras = ["socks"], version = "^0.27.2"} +aiofiles = "^24.1.0" [tool.poetry.scripts] javsp = "javsp.__main__:entry" diff --git a/tools/config_migration.py b/tools/config_migration.py index 95adc45d6..f08f9ed67 100644 --- a/tools/config_migration.py +++ b/tools/config_migration.py @@ -76,13 +76,16 @@ def fix_pat(p): # 设置代理服务器地址,支持 http, socks5/socks5h 代理,比如'http://127.0.0.1:1080' # null表示禁用代理 proxy_server: {'null' if proxy_disabled else f"'{cfg['Network']['proxy']}'"} - # 各个站点的免代理地址。地址失效时软件会自动尝试获取新地址,你也可以手动设置 - proxy_free: -{'\n'.join([f" {id}: '{url}'" for id, url in dict(cfg['ProxyFree']).items()])} # 网络问题导致抓取数据失败时的重试次数,通常3次就差不多了 - retry: {cfg['Network']['retry']} + retries: {cfg['Network']['retry']} # https://en.wikipedia.org/wiki/ISO_8601#Durations timeout: PT{cfg['Network']['timeout']}S + # 各个站点的免代理地址。地址失效时软件会自动尝试获取新地址,你也可以手动设置 + unproxied: [{ + ', '.join(dict(cfg['ProxyFree']).values()) +}] + fallback: +{'\n'.join([f" {id}: ['{url}']" for id, url in dict(cfg['ProxyFree']).items()])} ################################ crawler: @@ -100,8 +103,6 @@ def fix_pat(p): hardworking: {yes_to_true(cfg['Crawler']['hardworking_mode'])} # 使用网页番号作为最终番号(启用时会对番号大小写等进行更正) respect_site_avid: {yes_to_true(cfg['Crawler']['respect_site_avid'])} - # fc2fan已关站。如果你有镜像,请设置本地镜像文件夹的路径,此文件夹内要有类似'FC2-12345.html'的网页文件 - fc2fan_local_path: '{cfg['Crawler']['fc2fan_local_path']}' # 刮削一部电影后的等待时间(设置为0禁用此功能) # https://en.wikipedia.org/wiki/ISO_8601#Durations sleep_after_scraping: PT{cfg['Crawler']['sleep_after_scraping']}S From f72a3752557bfdc5c1748d2cc6ee4d7fc6b90468 Mon Sep 17 00:00:00 2001 From: glyh Date: Sun, 29 Sep 2024 01:10:57 +0800 Subject: [PATCH 02/10] drop cloudscraper --- poetry.lock | 213 +------------------------------------------------ pyproject.toml | 1 - 2 files changed, 1 insertion(+), 213 deletions(-) diff --git a/poetry.lock b/poetry.lock index 7e17536c5..f9b1b8d77 100644 --- a/poetry.lock +++ b/poetry.lock @@ -159,131 +159,6 @@ type = "legacy" url = "https://pypi.tuna.tsinghua.edu.cn/simple" reference = "mirrors" -[[package]] -name = "charset-normalizer" -version = "3.3.2" -description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." 
-optional = false -python-versions = ">=3.7.0" -files = [ - {file = "charset-normalizer-3.3.2.tar.gz", hash = "sha256:f30c3cb33b24454a82faecaf01b19c18562b1e89558fb6c56de4d9118a032fd5"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:25baf083bf6f6b341f4121c2f3c548875ee6f5339300e08be3f2b2ba1721cdd3"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:06435b539f889b1f6f4ac1758871aae42dc3a8c0e24ac9e60c2384973ad73027"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9063e24fdb1e498ab71cb7419e24622516c4a04476b17a2dab57e8baa30d6e03"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6897af51655e3691ff853668779c7bad41579facacf5fd7253b0133308cf000d"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1d3193f4a680c64b4b6a9115943538edb896edc190f0b222e73761716519268e"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cd70574b12bb8a4d2aaa0094515df2463cb429d8536cfb6c7ce983246983e5a6"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8465322196c8b4d7ab6d1e049e4c5cb460d0394da4a27d23cc242fbf0034b6b5"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a9a8e9031d613fd2009c182b69c7b2c1ef8239a0efb1df3f7c8da66d5dd3d537"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:beb58fe5cdb101e3a055192ac291b7a21e3b7ef4f67fa1d74e331a7f2124341c"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e06ed3eb3218bc64786f7db41917d4e686cc4856944f53d5bdf83a6884432e12"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:2e81c7b9c8979ce92ed306c249d46894776a909505d8f5a4ba55b14206e3222f"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:572c3763a264ba47b3cf708a44ce965d98555f618ca42c926a9c1616d8f34269"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-win32.whl", hash = "sha256:3d47fa203a7bd9c5b6cee4736ee84ca03b8ef23193c0d1ca99b5089f72645c73"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:10955842570876604d404661fbccbc9c7e684caf432c09c715ec38fbae45ae09"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:802fe99cca7457642125a8a88a084cef28ff0cf9407060f7b93dca5aa25480db"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:573f6eac48f4769d667c4442081b1794f52919e7edada77495aaed9236d13a96"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:549a3a73da901d5bc3ce8d24e0600d1fa85524c10287f6004fbab87672bf3e1e"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f27273b60488abe721a075bcca6d7f3964f9f6f067c8c4c605743023d7d3944f"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ceae2f17a9c33cb48e3263960dc5fc8005351ee19db217e9b1bb15d28c02574"}, - {file = 
"charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:65f6f63034100ead094b8744b3b97965785388f308a64cf8d7c34f2f2e5be0c4"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:753f10e867343b4511128c6ed8c82f7bec3bd026875576dfd88483c5c73b2fd8"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4a78b2b446bd7c934f5dcedc588903fb2f5eec172f3d29e52a9096a43722adfc"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e537484df0d8f426ce2afb2d0f8e1c3d0b114b83f8850e5f2fbea0e797bd82ae"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:eb6904c354526e758fda7167b33005998fb68c46fbc10e013ca97f21ca5c8887"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:deb6be0ac38ece9ba87dea880e438f25ca3eddfac8b002a2ec3d9183a454e8ae"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:4ab2fe47fae9e0f9dee8c04187ce5d09f48eabe611be8259444906793ab7cbce"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:80402cd6ee291dcb72644d6eac93785fe2c8b9cb30893c1af5b8fdd753b9d40f"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-win32.whl", hash = "sha256:7cd13a2e3ddeed6913a65e66e94b51d80a041145a026c27e6bb76c31a853c6ab"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:663946639d296df6a2bb2aa51b60a2454ca1cb29835324c640dafb5ff2131a77"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0b2b64d2bb6d3fb9112bafa732def486049e63de9618b5843bcdd081d8144cd8"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:ddbb2551d7e0102e7252db79ba445cdab71b26640817ab1e3e3648dad515003b"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:55086ee1064215781fff39a1af09518bc9255b50d6333f2e4c74ca09fac6a8f6"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f4a014bc36d3c57402e2977dada34f9c12300af536839dc38c0beab8878f38a"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a10af20b82360ab00827f916a6058451b723b4e65030c5a18577c8b2de5b3389"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8d756e44e94489e49571086ef83b2bb8ce311e730092d2c34ca8f7d925cb20aa"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90d558489962fd4918143277a773316e56c72da56ec7aa3dc3dbbe20fdfed15b"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ac7ffc7ad6d040517be39eb591cac5ff87416c2537df6ba3cba3bae290c0fed"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7ed9e526742851e8d5cc9e6cf41427dfc6068d4f5a3bb03659444b4cabf6bc26"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:8bdb58ff7ba23002a4c5808d608e4e6c687175724f54a5dade5fa8c67b604e4d"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:6b3251890fff30ee142c44144871185dbe13b11bab478a88887a639655be1068"}, - {file = 
"charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:b4a23f61ce87adf89be746c8a8974fe1c823c891d8f86eb218bb957c924bb143"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:efcb3f6676480691518c177e3b465bcddf57cea040302f9f4e6e191af91174d4"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-win32.whl", hash = "sha256:d965bba47ddeec8cd560687584e88cf699fd28f192ceb452d1d7ee807c5597b7"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:96b02a3dc4381e5494fad39be677abcb5e6634bf7b4fa83a6dd3112607547001"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:95f2a5796329323b8f0512e09dbb7a1860c46a39da62ecb2324f116fa8fdc85c"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c002b4ffc0be611f0d9da932eb0f704fe2602a9a949d1f738e4c34c75b0863d5"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a981a536974bbc7a512cf44ed14938cf01030a99e9b3a06dd59578882f06f985"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3287761bc4ee9e33561a7e058c72ac0938c4f57fe49a09eae428fd88aafe7bb6"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42cb296636fcc8b0644486d15c12376cb9fa75443e00fb25de0b8602e64c1714"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a55554a2fa0d408816b3b5cedf0045f4b8e1a6065aec45849de2d6f3f8e9786"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:c083af607d2515612056a31f0a8d9e0fcb5876b7bfc0abad3ecd275bc4ebc2d5"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:87d1351268731db79e0f8e745d92493ee2841c974128ef629dc518b937d9194c"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:bd8f7df7d12c2db9fab40bdd87a7c09b1530128315d047a086fa3ae3435cb3a8"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:c180f51afb394e165eafe4ac2936a14bee3eb10debc9d9e4db8958fe36afe711"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:8c622a5fe39a48f78944a87d4fb8a53ee07344641b0562c540d840748571b811"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-win32.whl", hash = "sha256:db364eca23f876da6f9e16c9da0df51aa4f104a972735574842618b8c6d999d4"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-win_amd64.whl", hash = "sha256:86216b5cee4b06df986d214f664305142d9c76df9b6512be2738aa72a2048f99"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:6463effa3186ea09411d50efc7d85360b38d5f09b870c48e4600f63af490e56a"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6c4caeef8fa63d06bd437cd4bdcf3ffefe6738fb1b25951440d80dc7df8c03ac"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:37e55c8e51c236f95b033f6fb391d7d7970ba5fe7ff453dad675e88cf303377a"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb69256e180cb6c8a894fee62b3afebae785babc1ee98b81cdf68bbca1987f33"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:ae5f4161f18c61806f411a13b0310bea87f987c7d2ecdbdaad0e94eb2e404238"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b2b0a0c0517616b6869869f8c581d4eb2dd83a4d79e0ebcb7d373ef9956aeb0a"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:45485e01ff4d3630ec0d9617310448a8702f70e9c01906b0d0118bdf9d124cf2"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eb00ed941194665c332bf8e078baf037d6c35d7c4f3102ea2d4f16ca94a26dc8"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:2127566c664442652f024c837091890cb1942c30937add288223dc895793f898"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:a50aebfa173e157099939b17f18600f72f84eed3049e743b68ad15bd69b6bf99"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:4d0d1650369165a14e14e1e47b372cfcb31d6ab44e6e33cb2d4e57265290044d"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:923c0c831b7cfcb071580d3f46c4baf50f174be571576556269530f4bbd79d04"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:06a81e93cd441c56a9b65d8e1d043daeb97a3d0856d177d5c90ba85acb3db087"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-win32.whl", hash = "sha256:6ef1d82a3af9d3eecdba2321dc1b3c238245d890843e040e41e470ffa64c3e25"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-win_amd64.whl", hash = "sha256:eb8821e09e916165e160797a6c17edda0679379a4be5c716c260e836e122f54b"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c235ebd9baae02f1b77bcea61bce332cb4331dc3617d254df3323aa01ab47bd4"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5b4c145409bef602a690e7cfad0a15a55c13320ff7a3ad7ca59c13bb8ba4d45d"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:68d1f8a9e9e37c1223b656399be5d6b448dea850bed7d0f87a8311f1ff3dabb0"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22afcb9f253dac0696b5a4be4a1c0f8762f8239e21b99680099abd9b2b1b2269"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e27ad930a842b4c5eb8ac0016b0a54f5aebbe679340c26101df33424142c143c"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1f79682fbe303db92bc2b1136016a38a42e835d932bab5b3b1bfcfbf0640e519"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b261ccdec7821281dade748d088bb6e9b69e6d15b30652b74cbbac25e280b796"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:122c7fa62b130ed55f8f285bfd56d5f4b4a5b503609d181f9ad85e55c89f4185"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:d0eccceffcb53201b5bfebb52600a5fb483a20b61da9dbc885f8b103cbe7598c"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9f96df6923e21816da7e0ad3fd47dd8f94b2a5ce594e00677c0013018b813458"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = 
"sha256:7f04c839ed0b6b98b1a7501a002144b76c18fb1c1850c8b98d458ac269e26ed2"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:34d1c8da1e78d2e001f363791c98a272bb734000fcef47a491c1e3b0505657a8"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-win32.whl", hash = "sha256:aed38f6e4fb3f5d6bf81bfa990a07806be9d83cf7bacef998ab1a9bd660a581f"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-win_amd64.whl", hash = "sha256:b01b88d45a6fcb69667cd6d2f7a9aeb4bf53760d7fc536bf679ec94fe9f3ff3d"}, - {file = "charset_normalizer-3.3.2-py3-none-any.whl", hash = "sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc"}, -] - -[package.source] -type = "legacy" -url = "https://pypi.tuna.tsinghua.edu.cn/simple" -reference = "mirrors" - -[[package]] -name = "cloudscraper" -version = "1.2.71" -description = "A Python module to bypass Cloudflare's anti-bot page." -optional = false -python-versions = "*" -files = [ - {file = "cloudscraper-1.2.71-py2.py3-none-any.whl", hash = "sha256:76f50ca529ed2279e220837befdec892626f9511708e200d48d5bb76ded679b0"}, - {file = "cloudscraper-1.2.71.tar.gz", hash = "sha256:429c6e8aa6916d5bad5c8a5eac50f3ea53c9ac22616f6cb21b18dcc71517d0d3"}, -] - -[package.dependencies] -pyparsing = ">=2.4.7" -requests = ">=2.9.2" -requests-toolbelt = ">=0.9.1" - -[package.source] -type = "legacy" -url = "https://pypi.tuna.tsinghua.edu.cn/simple" -reference = "mirrors" - [[package]] name = "colorama" version = "0.4.4" @@ -1544,25 +1419,6 @@ type = "legacy" url = "https://pypi.tuna.tsinghua.edu.cn/simple" reference = "mirrors" -[[package]] -name = "pyparsing" -version = "3.1.4" -description = "pyparsing module - Classes and methods to define and execute parsing grammars" -optional = false -python-versions = ">=3.6.8" -files = [ - {file = "pyparsing-3.1.4-py3-none-any.whl", hash = "sha256:a6a7ee4235a3f944aa1fa2249307708f893fe5717dc603503c6c7969c070fb7c"}, - {file = "pyparsing-3.1.4.tar.gz", hash = "sha256:f86ec8d1a83f11977c9a6ea7598e8c27fc5cddfa5b07ea2241edbbde1d7bc032"}, -] - -[package.extras] -diagrams = ["jinja2", "railroad-diagrams"] - -[package.source] -type = "legacy" -url = "https://pypi.tuna.tsinghua.edu.cn/simple" -reference = "mirrors" - [[package]] name = "pytest" version = "8.3.3" @@ -1739,51 +1595,6 @@ type = "legacy" url = "https://pypi.tuna.tsinghua.edu.cn/simple" reference = "mirrors" -[[package]] -name = "requests" -version = "2.32.3" -description = "Python HTTP for Humans." 
-optional = false -python-versions = ">=3.8" -files = [ - {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"}, - {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"}, -] - -[package.dependencies] -certifi = ">=2017.4.17" -charset-normalizer = ">=2,<4" -idna = ">=2.5,<4" -urllib3 = ">=1.21.1,<3" - -[package.extras] -socks = ["PySocks (>=1.5.6,!=1.5.7)"] -use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] - -[package.source] -type = "legacy" -url = "https://pypi.tuna.tsinghua.edu.cn/simple" -reference = "mirrors" - -[[package]] -name = "requests-toolbelt" -version = "1.0.0" -description = "A utility belt for advanced users of python-requests" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -files = [ - {file = "requests-toolbelt-1.0.0.tar.gz", hash = "sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6"}, - {file = "requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06"}, -] - -[package.dependencies] -requests = ">=2.0.1,<3.0.0" - -[package.source] -type = "legacy" -url = "https://pypi.tuna.tsinghua.edu.cn/simple" -reference = "mirrors" - [[package]] name = "setuptools" version = "75.1.0" @@ -2140,28 +1951,6 @@ type = "legacy" url = "https://pypi.tuna.tsinghua.edu.cn/simple" reference = "mirrors" -[[package]] -name = "urllib3" -version = "2.2.3" -description = "HTTP library with thread-safe connection pooling, file post, and more." -optional = false -python-versions = ">=3.8" -files = [ - {file = "urllib3-2.2.3-py3-none-any.whl", hash = "sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac"}, - {file = "urllib3-2.2.3.tar.gz", hash = "sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9"}, -] - -[package.extras] -brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] -h2 = ["h2 (>=4,<5)"] -socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] -zstd = ["zstandard (>=0.18.0)"] - -[package.source] -type = "legacy" -url = "https://pypi.tuna.tsinghua.edu.cn/simple" -reference = "mirrors" - [[package]] name = "zipp" version = "3.20.2" @@ -2189,4 +1978,4 @@ reference = "mirrors" [metadata] lock-version = "2.0" python-versions = "<3.13,>=3.10" -content-hash = "6ac810d36d51220ff82224b4d5a814b2624a7ee8ac2091a7cb4b790f16adc578" +content-hash = "3c98b4c2562b1cc5d88474d6962ab34e60be1be488d840c691c0d0e1095d7285" diff --git a/pyproject.toml b/pyproject.toml index aa123f5bb..a74d2bc1b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,6 @@ format = "v{base}.{distance}" [tool.poetry.dependencies] python = "<3.13,>=3.10" -cloudscraper = "1.2.71" colorama = "0.4.4" pillow = "10.2.0" pretty-errors = "1.2.19" From 264c7b7cf09256ea018e1a70003b74303c1857aa Mon Sep 17 00:00:00 2001 From: glyh Date: Sun, 29 Sep 2024 02:46:18 +0800 Subject: [PATCH 03/10] fix asyncio proxyfree test, use uvloop for main loop --- javsp/__main__.py | 9 +++---- javsp/crawlers/proxyfree.py | 28 +++++++++----------- javsp/network/client.py | 6 +++-- javsp/network/utils.py | 47 +++++++++++++++++++++------------- poetry.lock | 51 ++++++++++++++++++++++++++++++++++++- pyproject.toml | 1 + unittest/test_proxyfree.py | 27 ++++++++++++-------- 7 files changed, 118 insertions(+), 51 deletions(-) diff --git a/javsp/__main__.py b/javsp/__main__.py index cf73ffd09..5d2604ec6 100644 --- a/javsp/__main__.py +++ b/javsp/__main__.py @@ -2,17 +2,16 @@ import re 
import sys import json -import asyncio import time +import asyncio import logging +import uvloop from PIL import Image from lxml.etree import Comment from pydantic import ValidationError from pydantic_core import Url from pydantic_extra_types.pendulum_dt import Duration -import threading from typing import Any, Coroutine, Dict, List -from javsp.crawlers.interface import Crawler from javsp.crawlers.all import crawlers sys.stdout.reconfigure(encoding='utf-8') @@ -102,7 +101,7 @@ async def wrapper(id: CrawlerID, movie: MovieInfo) -> None: co_pool.append(wrapper(crawler_id, info)) # 等待所有协程结束 - asyncio.gather(*co_pool) + await asyncio.gather(*co_pool) # 根据抓取结果更新影片类型判定 if movie.data_src == 'cid' and movie.dvdid: @@ -555,7 +554,7 @@ async def aentry(): sys.exit(0) def entry(): - asyncio.run(aentry(), debug=True) + uvloop.run(aentry(), debug=True) if __name__ == "__main__": entry() diff --git a/javsp/crawlers/proxyfree.py b/javsp/crawlers/proxyfree.py index cafcfe062..cf4010d33 100644 --- a/javsp/crawlers/proxyfree.py +++ b/javsp/crawlers/proxyfree.py @@ -1,7 +1,6 @@ """获取各个网站的免代理地址""" from collections.abc import Callable, Coroutine import re -import sys from typing import Any, Dict from pydantic_core import Url @@ -9,11 +8,11 @@ from lxml import html from javsp.config import CrawlerID -from javsp.network.utils import test_connect +from javsp.network.utils import test_connect, choose_one_connectable from javsp.network.client import get_client -async def _get_avsox_urls() -> list: +async def _get_avsox_urls() -> list[str]: link = 'https://tellme.pw/avsox' client = get_client(Url(link)) resp = await client.get(link) @@ -22,7 +21,7 @@ async def _get_avsox_urls() -> list: return urls -async def _get_javbus_urls() -> list: +async def _get_javbus_urls() -> list[str]: link = 'https://www.javbus.one/' client = get_client(Url(link)) resp = await client.get(link) @@ -31,7 +30,7 @@ async def _get_javbus_urls() -> list: return urls -async def _get_javlib_urls() -> list: +async def _get_javlib_urls() -> list[str]: link = 'https://github.com/javlibcom' client = get_client(Url(link)) resp = await client.get(link) @@ -41,9 +40,10 @@ async def _get_javlib_urls() -> list: if match: domain = f'https://www.{match.group(0)}.com' return [domain] + return [] -async def _get_javdb_urls() -> list: +async def _get_javdb_urls() -> list[str]: root_link = 'https://jav524.app' client = get_client(Url(root_link)) resp = await client.get(root_link) @@ -57,6 +57,7 @@ async def _get_javdb_urls() -> list: match = re.search(r'\$officialUrl\s*=\s*"(https://(?:[\d\w][-\d\w]{1,61}[\d\w]\.){1,2}[a-z]{2,})"', text, flags=re.I | re.A) if match: return [match.group(1)] + return [] proxy_free_fns: Dict[CrawlerID, Callable[[], Coroutine[Any, Any, list[str]]]]= { CrawlerID.avsox: _get_avsox_urls, @@ -65,13 +66,7 @@ async def _get_javdb_urls() -> list: CrawlerID.javlib: _get_javlib_urls, } -def _choose_one(urls: list[str]) -> str: - for url in urls: - if test_connect(url, Duration(seconds=5)): - return url - return '' - -async def get_proxy_free_url(site_name: CrawlerID, prefer_url: str | None=None) -> str: +async def get_proxy_free_url(site_name: CrawlerID, prefer_url: str | None = None) -> str | None: """获取指定网站的免代理地址 Args: site_name (str): 站点名称 @@ -79,15 +74,16 @@ async def get_proxy_free_url(site_name: CrawlerID, prefer_url: str | None=None) Returns: str: 指定站点的免代理地址(失败时为空字符串) """ - if prefer_url and test_connect(prefer_url, Duration(seconds=5)): + if prefer_url and await test_connect(prefer_url, Duration(seconds=5)): return prefer_url if 
site_name in proxy_free_fns: try: urls = await proxy_free_fns[site_name]() - return _choose_one(urls) + print(f"I got {urls}") + return await choose_one_connectable(urls) except: - return '' + return None else: raise Exception("Dont't know how to get proxy-free url for " + site_name) diff --git a/javsp/network/client.py b/javsp/network/client.py index 813167233..33232b677 100644 --- a/javsp/network/client.py +++ b/javsp/network/client.py @@ -7,7 +7,9 @@ from javsp.config import Cfg -headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'} +default_headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36' +} def get_proxy(unproxied: bool): if Cfg().network.proxy_server is None or unproxied: @@ -33,7 +35,7 @@ def get_client(url: Url) -> AsyncClient: client = AsyncClient( transport=transport, # 必须使用copy(),否则各个模块对headers的修改都将会指向本模块中定义的headers变量,导致只有最后一个对headers的修改生效 - headers=headers.copy(), + headers=default_headers.copy(), timeout=Cfg().network.timeout.total_seconds(), follow_redirects=True, ) diff --git a/javsp/network/utils.py b/javsp/network/utils.py index 6f73338e4..6e3836423 100644 --- a/javsp/network/utils.py +++ b/javsp/network/utils.py @@ -12,6 +12,8 @@ from javsp.config import Cfg, CrawlerID from javsp.network.client import get_client +import asyncio + class DownloadInfo(NamedTuple): size: ByteSize elapsed: timedelta @@ -53,18 +55,10 @@ async def url_download(url: Url, target_path: str, desc: str | None = None) -> D return DownloadInfo(ByteSize(response.num_bytes_downloaded), response.elapsed) -# def resp2html(resp: Response) -> lxml.html.HtmlElement: -# -# """将request返回的response转换为经lxml解析后的document""" -# -# html = lxml.html.fromstring(resp.text) -# html.make_links_absolute(str(resp.url), resolve_base_href=True) -# return html -# async def test_connect(url_str: str, timeout: Duration) -> bool: """测试与指定url的连接,不使用映射,但使用代理""" try: - + print(f"Attemping to connect {url_str}") client = get_client(Url(url_str)) response = \ await client.get( @@ -76,16 +70,35 @@ async def test_connect(url_str: str, timeout: Duration) -> bool: except: return False +async def choose_one_connectable(urls: list[str]) -> str | None: + print(urls) + co_connectables: list[Coroutine[Any, Any, bool]] = [] + for url in urls: + co_connectables.append(test_connect(url, Duration(seconds=5))) + + connectables = await asyncio.gather(*co_connectables) + for i, connectable in enumerate(connectables): + if connectable: + return urls[i] + return None + async def resolve_site_fallback(cr_id: CrawlerID, default: str) -> Url: if cr_id not in Cfg().network.fallback: return Url(default) - - tasks: list[tuple[str, Coroutine[Any, Any, bool]]] = [] - for fallback in Cfg().network.fallback[cr_id]: - tasks.append((fallback, test_connect(fallback, Duration(seconds=3)))) - for (fallback, task) in tasks: - if await task: - return Url(fallback) + fallbacks = Cfg().network.fallback[cr_id] + chosen = await choose_one_connectable(fallbacks) + if chosen is None: + return Url(default) + else: + return Url(chosen) + + +if __name__ == '__main__': + # async def aentry(): + # print(await choose_one_connectable(['http://iandown.what', 'http://www.baidu.com'])) + + async def aentry(): + print(await test_connect("https://www.y78k.com/", timeout=3)) - return Url(default) + asyncio.run(aentry()) diff --git a/poetry.lock b/poetry.lock index f9b1b8d77..1c3b638cb 100644 --- a/poetry.lock +++ 
b/poetry.lock @@ -1951,6 +1951,55 @@ type = "legacy" url = "https://pypi.tuna.tsinghua.edu.cn/simple" reference = "mirrors" +[[package]] +name = "uvloop" +version = "0.20.0" +description = "Fast implementation of asyncio event loop on top of libuv" +optional = false +python-versions = ">=3.8.0" +files = [ + {file = "uvloop-0.20.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:9ebafa0b96c62881d5cafa02d9da2e44c23f9f0cd829f3a32a6aff771449c996"}, + {file = "uvloop-0.20.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:35968fc697b0527a06e134999eef859b4034b37aebca537daeb598b9d45a137b"}, + {file = "uvloop-0.20.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b16696f10e59d7580979b420eedf6650010a4a9c3bd8113f24a103dfdb770b10"}, + {file = "uvloop-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9b04d96188d365151d1af41fa2d23257b674e7ead68cfd61c725a422764062ae"}, + {file = "uvloop-0.20.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:94707205efbe809dfa3a0d09c08bef1352f5d3d6612a506f10a319933757c006"}, + {file = "uvloop-0.20.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:89e8d33bb88d7263f74dc57d69f0063e06b5a5ce50bb9a6b32f5fcbe655f9e73"}, + {file = "uvloop-0.20.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:e50289c101495e0d1bb0bfcb4a60adde56e32f4449a67216a1ab2750aa84f037"}, + {file = "uvloop-0.20.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:e237f9c1e8a00e7d9ddaa288e535dc337a39bcbf679f290aee9d26df9e72bce9"}, + {file = "uvloop-0.20.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:746242cd703dc2b37f9d8b9f173749c15e9a918ddb021575a0205ec29a38d31e"}, + {file = "uvloop-0.20.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:82edbfd3df39fb3d108fc079ebc461330f7c2e33dbd002d146bf7c445ba6e756"}, + {file = "uvloop-0.20.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:80dc1b139516be2077b3e57ce1cb65bfed09149e1d175e0478e7a987863b68f0"}, + {file = "uvloop-0.20.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:4f44af67bf39af25db4c1ac27e82e9665717f9c26af2369c404be865c8818dcf"}, + {file = "uvloop-0.20.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:4b75f2950ddb6feed85336412b9a0c310a2edbcf4cf931aa5cfe29034829676d"}, + {file = "uvloop-0.20.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:77fbc69c287596880ecec2d4c7a62346bef08b6209749bf6ce8c22bbaca0239e"}, + {file = "uvloop-0.20.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6462c95f48e2d8d4c993a2950cd3d31ab061864d1c226bbf0ee2f1a8f36674b9"}, + {file = "uvloop-0.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:649c33034979273fa71aa25d0fe120ad1777c551d8c4cd2c0c9851d88fcb13ab"}, + {file = "uvloop-0.20.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:3a609780e942d43a275a617c0839d85f95c334bad29c4c0918252085113285b5"}, + {file = "uvloop-0.20.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:aea15c78e0d9ad6555ed201344ae36db5c63d428818b4b2a42842b3870127c00"}, + {file = "uvloop-0.20.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:f0e94b221295b5e69de57a1bd4aeb0b3a29f61be6e1b478bb8a69a73377db7ba"}, + {file = "uvloop-0.20.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:fee6044b64c965c425b65a4e17719953b96e065c5b7e09b599ff332bb2744bdf"}, + {file = "uvloop-0.20.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:265a99a2ff41a0fd56c19c3838b29bf54d1d177964c300dad388b27e84fd7847"}, + 
{file = "uvloop-0.20.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b10c2956efcecb981bf9cfb8184d27d5d64b9033f917115a960b83f11bfa0d6b"}, + {file = "uvloop-0.20.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:e7d61fe8e8d9335fac1bf8d5d82820b4808dd7a43020c149b63a1ada953d48a6"}, + {file = "uvloop-0.20.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:2beee18efd33fa6fdb0976e18475a4042cd31c7433c866e8a09ab604c7c22ff2"}, + {file = "uvloop-0.20.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:d8c36fdf3e02cec92aed2d44f63565ad1522a499c654f07935c8f9d04db69e95"}, + {file = "uvloop-0.20.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a0fac7be202596c7126146660725157d4813aa29a4cc990fe51346f75ff8fde7"}, + {file = "uvloop-0.20.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9d0fba61846f294bce41eb44d60d58136090ea2b5b99efd21cbdf4e21927c56a"}, + {file = "uvloop-0.20.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95720bae002ac357202e0d866128eb1ac82545bcf0b549b9abe91b5178d9b541"}, + {file = "uvloop-0.20.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:36c530d8fa03bfa7085af54a48f2ca16ab74df3ec7108a46ba82fd8b411a2315"}, + {file = "uvloop-0.20.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:e97152983442b499d7a71e44f29baa75b3b02e65d9c44ba53b10338e98dedb66"}, + {file = "uvloop-0.20.0.tar.gz", hash = "sha256:4603ca714a754fc8d9b197e325db25b2ea045385e8a3ad05d3463de725fdf469"}, +] + +[package.extras] +docs = ["Sphinx (>=4.1.2,<4.2.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"] +test = ["Cython (>=0.29.36,<0.30.0)", "aiohttp (==3.9.0b0)", "aiohttp (>=3.8.1)", "flake8 (>=5.0,<6.0)", "mypy (>=0.800)", "psutil", "pyOpenSSL (>=23.0.0,<23.1.0)", "pycodestyle (>=2.9.0,<2.10.0)"] + +[package.source] +type = "legacy" +url = "https://pypi.tuna.tsinghua.edu.cn/simple" +reference = "mirrors" + [[package]] name = "zipp" version = "3.20.2" @@ -1978,4 +2027,4 @@ reference = "mirrors" [metadata] lock-version = "2.0" python-versions = "<3.13,>=3.10" -content-hash = "3c98b4c2562b1cc5d88474d6962ab34e60be1be488d840c691c0d0e1095d7285" +content-hash = "6ad14727b4a6c9a4e9b948fd278a29c28074f9732a0f490d9d6ab4596903e58c" diff --git a/pyproject.toml b/pyproject.toml index a74d2bc1b..dba6fd9e6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,6 +29,7 @@ pendulum = "^3.0.0" slimeface = "^2024.9.27" httpx = {extras = ["socks"], version = "^0.27.2"} aiofiles = "^24.1.0" +uvloop = "^0.20.0" [tool.poetry.scripts] javsp = "javsp.__main__:entry" diff --git a/unittest/test_proxyfree.py b/unittest/test_proxyfree.py index 1537d93ad..6e1d65b60 100644 --- a/unittest/test_proxyfree.py +++ b/unittest/test_proxyfree.py @@ -1,18 +1,25 @@ -import os -import sys - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) -from javsp.web.proxyfree import * +import uvloop +import tracemalloc +from javsp.crawlers.proxyfree import get_proxy_free_url +from javsp.config import CrawlerID def test_get_url(): - assert get_proxy_free_url('javlib') != '' - assert get_proxy_free_url('javdb') != '' + async def wrap(): + assert await get_proxy_free_url(CrawlerID.javlib) != None + assert await get_proxy_free_url(CrawlerID.javdb) != None + uvloop.run(wrap()) def test_get_url_with_prefer(): - prefer_url = 'https://www.baidu.com' - assert prefer_url == get_proxy_free_url('javlib', prefer_url) + async def wrap(): + prefer_url = 'https://www.baidu.com' + assert prefer_url == await 
get_proxy_free_url(CrawlerID.javlib, prefer_url) + uvloop.run(wrap()) if __name__ == "__main__": - print(get_proxy_free_url('javlib')) + async def aentry(): + print(await get_proxy_free_url(CrawlerID.javlib)) + + tracemalloc.start() + uvloop.run(aentry(), debug=True) From 629f14c402b4b910d8b60576d5483d2a45257b2e Mon Sep 17 00:00:00 2001 From: glyh Date: Sun, 29 Sep 2024 02:53:23 +0800 Subject: [PATCH 04/10] remove uvloop as poetry can't build it on old python --- javsp/__main__.py | 4 +--- poetry.lock | 51 +---------------------------------------------- pyproject.toml | 1 - 3 files changed, 2 insertions(+), 54 deletions(-) diff --git a/javsp/__main__.py b/javsp/__main__.py index 5d2604ec6..456bbebf8 100644 --- a/javsp/__main__.py +++ b/javsp/__main__.py @@ -5,9 +5,7 @@ import time import asyncio import logging -import uvloop from PIL import Image -from lxml.etree import Comment from pydantic import ValidationError from pydantic_core import Url from pydantic_extra_types.pendulum_dt import Duration @@ -554,7 +552,7 @@ async def aentry(): sys.exit(0) def entry(): - uvloop.run(aentry(), debug=True) + asyncio.run(aentry(), debug=True) if __name__ == "__main__": entry() diff --git a/poetry.lock b/poetry.lock index 1c3b638cb..f9b1b8d77 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1951,55 +1951,6 @@ type = "legacy" url = "https://pypi.tuna.tsinghua.edu.cn/simple" reference = "mirrors" -[[package]] -name = "uvloop" -version = "0.20.0" -description = "Fast implementation of asyncio event loop on top of libuv" -optional = false -python-versions = ">=3.8.0" -files = [ - {file = "uvloop-0.20.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:9ebafa0b96c62881d5cafa02d9da2e44c23f9f0cd829f3a32a6aff771449c996"}, - {file = "uvloop-0.20.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:35968fc697b0527a06e134999eef859b4034b37aebca537daeb598b9d45a137b"}, - {file = "uvloop-0.20.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b16696f10e59d7580979b420eedf6650010a4a9c3bd8113f24a103dfdb770b10"}, - {file = "uvloop-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9b04d96188d365151d1af41fa2d23257b674e7ead68cfd61c725a422764062ae"}, - {file = "uvloop-0.20.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:94707205efbe809dfa3a0d09c08bef1352f5d3d6612a506f10a319933757c006"}, - {file = "uvloop-0.20.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:89e8d33bb88d7263f74dc57d69f0063e06b5a5ce50bb9a6b32f5fcbe655f9e73"}, - {file = "uvloop-0.20.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:e50289c101495e0d1bb0bfcb4a60adde56e32f4449a67216a1ab2750aa84f037"}, - {file = "uvloop-0.20.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:e237f9c1e8a00e7d9ddaa288e535dc337a39bcbf679f290aee9d26df9e72bce9"}, - {file = "uvloop-0.20.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:746242cd703dc2b37f9d8b9f173749c15e9a918ddb021575a0205ec29a38d31e"}, - {file = "uvloop-0.20.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:82edbfd3df39fb3d108fc079ebc461330f7c2e33dbd002d146bf7c445ba6e756"}, - {file = "uvloop-0.20.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:80dc1b139516be2077b3e57ce1cb65bfed09149e1d175e0478e7a987863b68f0"}, - {file = "uvloop-0.20.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:4f44af67bf39af25db4c1ac27e82e9665717f9c26af2369c404be865c8818dcf"}, - {file = "uvloop-0.20.0-cp312-cp312-macosx_10_9_universal2.whl", hash = 
"sha256:4b75f2950ddb6feed85336412b9a0c310a2edbcf4cf931aa5cfe29034829676d"}, - {file = "uvloop-0.20.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:77fbc69c287596880ecec2d4c7a62346bef08b6209749bf6ce8c22bbaca0239e"}, - {file = "uvloop-0.20.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6462c95f48e2d8d4c993a2950cd3d31ab061864d1c226bbf0ee2f1a8f36674b9"}, - {file = "uvloop-0.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:649c33034979273fa71aa25d0fe120ad1777c551d8c4cd2c0c9851d88fcb13ab"}, - {file = "uvloop-0.20.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:3a609780e942d43a275a617c0839d85f95c334bad29c4c0918252085113285b5"}, - {file = "uvloop-0.20.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:aea15c78e0d9ad6555ed201344ae36db5c63d428818b4b2a42842b3870127c00"}, - {file = "uvloop-0.20.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:f0e94b221295b5e69de57a1bd4aeb0b3a29f61be6e1b478bb8a69a73377db7ba"}, - {file = "uvloop-0.20.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:fee6044b64c965c425b65a4e17719953b96e065c5b7e09b599ff332bb2744bdf"}, - {file = "uvloop-0.20.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:265a99a2ff41a0fd56c19c3838b29bf54d1d177964c300dad388b27e84fd7847"}, - {file = "uvloop-0.20.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b10c2956efcecb981bf9cfb8184d27d5d64b9033f917115a960b83f11bfa0d6b"}, - {file = "uvloop-0.20.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:e7d61fe8e8d9335fac1bf8d5d82820b4808dd7a43020c149b63a1ada953d48a6"}, - {file = "uvloop-0.20.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:2beee18efd33fa6fdb0976e18475a4042cd31c7433c866e8a09ab604c7c22ff2"}, - {file = "uvloop-0.20.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:d8c36fdf3e02cec92aed2d44f63565ad1522a499c654f07935c8f9d04db69e95"}, - {file = "uvloop-0.20.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a0fac7be202596c7126146660725157d4813aa29a4cc990fe51346f75ff8fde7"}, - {file = "uvloop-0.20.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9d0fba61846f294bce41eb44d60d58136090ea2b5b99efd21cbdf4e21927c56a"}, - {file = "uvloop-0.20.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95720bae002ac357202e0d866128eb1ac82545bcf0b549b9abe91b5178d9b541"}, - {file = "uvloop-0.20.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:36c530d8fa03bfa7085af54a48f2ca16ab74df3ec7108a46ba82fd8b411a2315"}, - {file = "uvloop-0.20.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:e97152983442b499d7a71e44f29baa75b3b02e65d9c44ba53b10338e98dedb66"}, - {file = "uvloop-0.20.0.tar.gz", hash = "sha256:4603ca714a754fc8d9b197e325db25b2ea045385e8a3ad05d3463de725fdf469"}, -] - -[package.extras] -docs = ["Sphinx (>=4.1.2,<4.2.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"] -test = ["Cython (>=0.29.36,<0.30.0)", "aiohttp (==3.9.0b0)", "aiohttp (>=3.8.1)", "flake8 (>=5.0,<6.0)", "mypy (>=0.800)", "psutil", "pyOpenSSL (>=23.0.0,<23.1.0)", "pycodestyle (>=2.9.0,<2.10.0)"] - -[package.source] -type = "legacy" -url = "https://pypi.tuna.tsinghua.edu.cn/simple" -reference = "mirrors" - [[package]] name = "zipp" version = "3.20.2" @@ -2027,4 +1978,4 @@ reference = "mirrors" [metadata] lock-version = "2.0" python-versions = "<3.13,>=3.10" -content-hash = "6ad14727b4a6c9a4e9b948fd278a29c28074f9732a0f490d9d6ab4596903e58c" +content-hash = 
"3c98b4c2562b1cc5d88474d6962ab34e60be1be488d840c691c0d0e1095d7285" diff --git a/pyproject.toml b/pyproject.toml index dba6fd9e6..a74d2bc1b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,6 @@ pendulum = "^3.0.0" slimeface = "^2024.9.27" httpx = {extras = ["socks"], version = "^0.27.2"} aiofiles = "^24.1.0" -uvloop = "^0.20.0" [tool.poetry.scripts] javsp = "javsp.__main__:entry" From dd0d83bbdd43a84e948dfada92fd199a1346d0a6 Mon Sep 17 00:00:00 2001 From: glyh Date: Sun, 29 Sep 2024 03:02:31 +0800 Subject: [PATCH 05/10] don't use uvloop for unittest --- unittest/test_proxyfree.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unittest/test_proxyfree.py b/unittest/test_proxyfree.py index 6e1d65b60..65151a9d4 100644 --- a/unittest/test_proxyfree.py +++ b/unittest/test_proxyfree.py @@ -1,4 +1,4 @@ -import uvloop +import asyncio import tracemalloc from javsp.crawlers.proxyfree import get_proxy_free_url @@ -8,18 +8,18 @@ def test_get_url(): async def wrap(): assert await get_proxy_free_url(CrawlerID.javlib) != None assert await get_proxy_free_url(CrawlerID.javdb) != None - uvloop.run(wrap()) + asyncio.run(wrap()) def test_get_url_with_prefer(): async def wrap(): prefer_url = 'https://www.baidu.com' assert prefer_url == await get_proxy_free_url(CrawlerID.javlib, prefer_url) - uvloop.run(wrap()) + asyncio.run(wrap()) if __name__ == "__main__": async def aentry(): print(await get_proxy_free_url(CrawlerID.javlib)) tracemalloc.start() - uvloop.run(aentry(), debug=True) + asyncio.run(aentry(), debug=True) From e2a43c7715cf94941b2ae115406e0b532cb1f18a Mon Sep 17 00:00:00 2001 From: glyh Date: Sun, 29 Sep 2024 03:30:25 +0800 Subject: [PATCH 06/10] remove prints --- javsp/crawlers/proxyfree.py | 1 - javsp/network/utils.py | 10 ++++------ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/javsp/crawlers/proxyfree.py b/javsp/crawlers/proxyfree.py index cf4010d33..381eeb7af 100644 --- a/javsp/crawlers/proxyfree.py +++ b/javsp/crawlers/proxyfree.py @@ -80,7 +80,6 @@ async def get_proxy_free_url(site_name: CrawlerID, prefer_url: str | None = None if site_name in proxy_free_fns: try: urls = await proxy_free_fns[site_name]() - print(f"I got {urls}") return await choose_one_connectable(urls) except: return None diff --git a/javsp/network/utils.py b/javsp/network/utils.py index 6e3836423..ff2046efc 100644 --- a/javsp/network/utils.py +++ b/javsp/network/utils.py @@ -58,7 +58,6 @@ async def url_download(url: Url, target_path: str, desc: str | None = None) -> D async def test_connect(url_str: str, timeout: Duration) -> bool: """测试与指定url的连接,不使用映射,但使用代理""" try: - print(f"Attemping to connect {url_str}") client = get_client(Url(url_str)) response = \ await client.get( @@ -71,7 +70,6 @@ async def test_connect(url_str: str, timeout: Duration) -> bool: return False async def choose_one_connectable(urls: list[str]) -> str | None: - print(urls) co_connectables: list[Coroutine[Any, Any, bool]] = [] for url in urls: co_connectables.append(test_connect(url, Duration(seconds=5))) @@ -95,10 +93,10 @@ async def resolve_site_fallback(cr_id: CrawlerID, default: str) -> Url: if __name__ == '__main__': - # async def aentry(): - # print(await choose_one_connectable(['http://iandown.what', 'http://www.baidu.com'])) - async def aentry(): - print(await test_connect("https://www.y78k.com/", timeout=3)) + print(await choose_one_connectable(['http://iandown.what', 'http://www.baidu.com'])) + + # async def aentry(): + # print(await test_connect("https://www.y78k.com/", Duration(seconds=3))) 
asyncio.run(aentry()) From 5ed7f36f9db2daef10ab58c86f853781e8f10145 Mon Sep 17 00:00:00 2001 From: glyh Date: Sun, 29 Sep 2024 03:39:13 +0800 Subject: [PATCH 07/10] attempt to fix proxyfree fail --- javsp/network/utils.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/javsp/network/utils.py b/javsp/network/utils.py index ff2046efc..34caf68da 100644 --- a/javsp/network/utils.py +++ b/javsp/network/utils.py @@ -1,4 +1,5 @@ from datetime import timedelta +import logging import time from tqdm.asyncio import tqdm from typing import Any, Coroutine, NamedTuple @@ -14,6 +15,8 @@ import asyncio +logger = logging.getLogger(__name__) + class DownloadInfo(NamedTuple): size: ByteSize elapsed: timedelta @@ -63,16 +66,16 @@ async def test_connect(url_str: str, timeout: Duration) -> bool: await client.get( url_str, timeout=timeout.total_seconds(), - follow_redirects=True, ) return response.status_code == 200 - except: + except Exception as e: + logger.debug(f"Not connectable: {url_str}\n" + repr(e)) return False async def choose_one_connectable(urls: list[str]) -> str | None: co_connectables: list[Coroutine[Any, Any, bool]] = [] for url in urls: - co_connectables.append(test_connect(url, Duration(seconds=5))) + co_connectables.append(test_connect(url, Duration(seconds=3))) connectables = await asyncio.gather(*co_connectables) for i, connectable in enumerate(connectables): From 9729542b355658423f2e586f8b69cd782337e03e Mon Sep 17 00:00:00 2001 From: glyh Date: Sun, 29 Sep 2024 11:44:35 +0800 Subject: [PATCH 08/10] refactor network with aiohttp --- javsp/__main__.py | 3 + javsp/chromium.py | 2 + javsp/crawlers/interface.py | 4 +- javsp/crawlers/proxyfree.py | 30 +- javsp/crawlers/sites/airav.py | 27 +- javsp/crawlers/sites/arzon.py | 12 +- javsp/crawlers/sites/arzon_iv.py | 10 +- javsp/crawlers/sites/avsox.py | 10 +- javsp/crawlers/sites/avwiki.py | 8 +- javsp/crawlers/sites/dl_getchu.py | 8 +- javsp/crawlers/sites/fanza.py | 68 ++-- javsp/crawlers/sites/fc2.py | 10 +- javsp/crawlers/sites/fc2ppvdb.py | 7 +- javsp/crawlers/sites/gyutto.py | 8 +- javsp/crawlers/sites/jav321.py | 6 +- javsp/crawlers/sites/javbus.py | 8 +- javsp/crawlers/sites/javdb.py | 46 +-- javsp/crawlers/sites/javlib.py | 13 +- javsp/crawlers/sites/javmenu.py | 6 +- javsp/crawlers/sites/mgstage.py | 12 +- javsp/crawlers/sites/njav.py | 8 +- javsp/crawlers/sites/prestige.py | 12 +- javsp/func.py | 6 +- javsp/network/client.py | 54 ++- javsp/network/utils.py | 48 +-- javsp/translate.py | 74 ++-- poetry.lock | 635 ++++++++++++++++++++++++------ pyproject.toml | 3 +- unittest/test_proxyfree.py | 4 + 29 files changed, 794 insertions(+), 348 deletions(-) diff --git a/javsp/__main__.py b/javsp/__main__.py index 456bbebf8..a7f407f99 100644 --- a/javsp/__main__.py +++ b/javsp/__main__.py @@ -11,6 +11,7 @@ from pydantic_extra_types.pendulum_dt import Duration from typing import Any, Coroutine, Dict, List from javsp.crawlers.all import crawlers +from javsp.network.client import clear_clients sys.stdout.reconfigure(encoding='utf-8') @@ -549,6 +550,8 @@ async def aentry(): logger.info(f'扫描影片文件:共找到 {movie_count} 部影片') await RunNormalMode(recognized + recognize_fail) + await clear_clients() + sys.exit(0) def entry(): diff --git a/javsp/chromium.py b/javsp/chromium.py index db315293e..1f8d01964 100644 --- a/javsp/chromium.py +++ b/javsp/chromium.py @@ -32,6 +32,8 @@ def decrypt(self, encrypted_value): def get_browsers_cookies(): """获取系统上的所有Chromium系浏览器的JavDB的Cookies""" + if not sys.platform.startswith('win32'): # 
不支持windows以外的系统 + return [] # 不予支持: Opera, 360安全&极速, 搜狗使用非标的用户目录或数据格式; QQ浏览器屏蔽站点 user_data_dirs = { 'Chrome': '/Google/Chrome/User Data', diff --git a/javsp/crawlers/interface.py b/javsp/crawlers/interface.py index a641b0a27..c82085554 100644 --- a/javsp/crawlers/interface.py +++ b/javsp/crawlers/interface.py @@ -1,13 +1,13 @@ -from httpx import AsyncClient from javsp.config import CrawlerID from javsp.datatype import MovieInfo from abc import ABC, abstractmethod from typing import Self +from aiohttp import ClientSession class Crawler(ABC): base_url: str - client: AsyncClient + client: ClientSession id: CrawlerID diff --git a/javsp/crawlers/proxyfree.py b/javsp/crawlers/proxyfree.py index 381eeb7af..45da59b94 100644 --- a/javsp/crawlers/proxyfree.py +++ b/javsp/crawlers/proxyfree.py @@ -9,32 +9,32 @@ from javsp.config import CrawlerID from javsp.network.utils import test_connect, choose_one_connectable -from javsp.network.client import get_client +from javsp.network.client import get_session async def _get_avsox_urls() -> list[str]: link = 'https://tellme.pw/avsox' - client = get_client(Url(link)) - resp = await client.get(link) - tree = html.fromstring(resp.text) + s = get_session(Url(link)) + resp = await s.get(link) + tree = html.fromstring(await resp.text()) urls = tree.xpath('//h4/strong/a/@href') return urls async def _get_javbus_urls() -> list[str]: link = 'https://www.javbus.one/' - client = get_client(Url(link)) - resp = await client.get(link) - text = resp.text + s = get_session(Url(link)) + resp = await s.get(link) + text = await resp.text() urls = re.findall(r'防屏蔽地址:(https://(?:[\d\w][-\d\w]{1,61}[\d\w]\.){1,2}[a-z]{2,})', text, re.I | re.A) return urls async def _get_javlib_urls() -> list[str]: link = 'https://github.com/javlibcom' - client = get_client(Url(link)) - resp = await client.get(link) - tree = html.fromstring(resp.text) + s = get_session(Url(link)) + resp = await s.get(link) + tree = html.fromstring(await resp.text()) text = tree.xpath("//div[@class='p-note user-profile-bio mb-3 js-user-profile-bio f4']")[0].text_content() match = re.search(r'[\w\.]+', text, re.A) if match: @@ -45,15 +45,15 @@ async def _get_javlib_urls() -> list[str]: async def _get_javdb_urls() -> list[str]: root_link = 'https://jav524.app' - client = get_client(Url(root_link)) - resp = await client.get(root_link) - tree = html.fromstring(resp.text) + s = get_session(Url(root_link)) + resp = await s.get(root_link) + tree = html.fromstring(await resp.text()) js_links = tree.xpath("//script[@src]/@src") for link in js_links: if '/js/index' in link: link = root_link + link - resp = await client.get(link) - text = resp.text + resp = await s.get(link) + text = await resp.text() match = re.search(r'\$officialUrl\s*=\s*"(https://(?:[\d\w][-\d\w]{1,61}[\d\w]\.){1,2}[a-z]{2,})"', text, flags=re.I | re.A) if match: return [match.group(1)] diff --git a/javsp/crawlers/sites/airav.py b/javsp/crawlers/sites/airav.py index 5afd46998..8bc4fa6e6 100644 --- a/javsp/crawlers/sites/airav.py +++ b/javsp/crawlers/sites/airav.py @@ -1,9 +1,10 @@ """从airav抓取数据""" import re from html import unescape +from typing import Dict from javsp.crawlers.exceptions import MovieNotFoundError -from javsp.network.client import get_client +from javsp.network.client import get_session from javsp.network.utils import resolve_site_fallback from javsp.config import Cfg, CrawlerID from javsp.datatype import MovieInfo @@ -13,13 +14,15 @@ class AiravCrawler(Crawler): id = CrawlerID.airav + headers: Dict[str, str] + @classmethod async def 
create(cls): self = cls() url = await resolve_site_fallback(self.id, 'https://www.airav.wiki') self.base_url = str(url) - self.client = get_client(url) - self.client.headers['Accept-Language'] = 'zh-TW,zh;q=0.9' + self.client = get_session(url) + self.headers = {'Accept-Language': 'zh-TW,zh;q=0.9'} return self async def search_movie(self, dvdid: str): @@ -30,8 +33,8 @@ async def search_movie(self, dvdid: str): result = [] while len(result) < count: url = f'{self.base_url}/api/video/list?lang=zh-TW&lng=zh-TW&search={dvdid}&page={page}' - response = await self.client.get(url) - resp = response.json() + response = await self.client.get(url, headers=self.headers) + resp = await response.json() # {"offset": 2460, "count": 12345, "result": [...], "status": "ok"} if resp['result']: result.extend(resp['result']) @@ -59,15 +62,15 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None: """解析指定番号的影片数据""" # airav也提供简体,但是为了尽量保持女优名等与其他站点一致,抓取繁体的数据 url = f'{self.base_url}/api/video/barcode/{movie.dvdid}?lng=zh-TW' - response = await self.client.get(url) - resp_json = response.json() + response = await self.client.get(url, headers=self.headers) + resp_json = await response.json() # 只在番号是纯数字时,尝试进行搜索,否则可能导致搜索到错误的影片信息 if resp_json['count'] == 0 and re.match(r'\d{6}[-_]\d{2,3}', movie.dvdid): barcode = await self.search_movie(movie.dvdid) if barcode: url = f'{self.base_url}/api/video/barcode/{barcode}?lng=zh-TW' - response = await self.client.get(url) - resp_json = response.json() + response = await self.client.get(url, headers=self.headers) + resp_json = await response.json() if resp_json['count'] == 0: raise MovieNotFoundError(__name__, movie.dvdid, resp_json) @@ -93,8 +96,8 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None: if Cfg().crawler.hardworking: # 注意这里用的是获取的dvdid,而不是传入的movie.dvdid(如'1pondo_012717_472'与'012717_472') video_url = f"{self.base_url}/api/video/getVideoMedia?barcode={dvdid}&vid={data['vid']}" - response = await self.client.get(video_url) - resp = response.json() + response = await self.client.get(video_url, headers=self.headers) + resp = await response.json() # 如果失败,结果如 {'msg': 'fail', 'status': 'fail'} if 'data' in resp: # 除url外还有url_cdn, url_hlx, url_hls_cdn字段,后两者为m3u8格式。目前将url作为预览视频的地址 @@ -113,12 +116,14 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None: break if __name__ == "__main__": + from javsp.network.client import clear_clients async def test_main(): crawler = await AiravCrawler.create() movie = MovieInfo("DSAD-938") await crawler.crawl_and_fill(movie) print(movie) + await clear_clients() import asyncio asyncio.run(test_main()) diff --git a/javsp/crawlers/sites/arzon.py b/javsp/crawlers/sites/arzon.py index f4887f4d7..f325984d0 100644 --- a/javsp/crawlers/sites/arzon.py +++ b/javsp/crawlers/sites/arzon.py @@ -2,7 +2,7 @@ import re from javsp.network.utils import resolve_site_fallback -from javsp.network.client import get_client +from javsp.network.client import get_session from javsp.crawlers.interface import Crawler from javsp.config import CrawlerID from javsp.crawlers.exceptions import * @@ -17,7 +17,7 @@ async def create(cls): self = cls() url = await resolve_site_fallback(self.id, "https://www.arzon.jp") self.base_url = str(url) - self.client = get_client(url) + self.client = get_session(url) # https://www.arzon.jp/index.php?action=adult_customer_agecheck&agecheck=1&redirect=https%3A%2F%2Fwww.arzon.jp%2F skip_verify_url = f"{self.base_url}/index.php?action=adult_customer_agecheck&agecheck=1" await self.client.get(skip_verify_url) @@ -30,10 +30,10 @@ 
async def crawl_and_fill(self, movie: MovieInfo) -> None: # url = f'{base_url}/imagelist.html?q={full_id}' r = await self.client.get(url) - if r.status_code == 404: + if r.status == 404: raise MovieNotFoundError(__name__, movie.dvdid) # https://stackoverflow.com/questions/15830421/xml-unicode-strings-with-encoding-declaration-are-not-supported - data = html.fromstring(r.content) + data = html.fromstring(await r.read()) urls = data.xpath("//h2/a/@href") if len(urls) == 0: @@ -41,7 +41,7 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None: item_url = self.base_url + urls[0] e = await self.client.get(item_url) - item = html.fromstring(e.content) + item = html.fromstring(await e.read()) title = item.xpath("//div[@class='detail_title_new2']//h1/text()")[0] cover = item.xpath("//td[@align='center']//a/img/@src")[0] @@ -91,6 +91,7 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None: movie.preview_pics = preview_pics if __name__ == "__main__": + from javsp.network.client import clear_clients async def test_main(): crawler = await ArzonCrawler.create() @@ -98,6 +99,7 @@ async def test_main(): try: await crawler.crawl_and_fill(movie) print(movie) + await clear_clients() except Exception as e: print(repr(e)) diff --git a/javsp/crawlers/sites/arzon_iv.py b/javsp/crawlers/sites/arzon_iv.py index a84c97aea..65c9b1367 100644 --- a/javsp/crawlers/sites/arzon_iv.py +++ b/javsp/crawlers/sites/arzon_iv.py @@ -3,7 +3,7 @@ from javsp.network.utils import resolve_site_fallback -from javsp.network.client import get_client +from javsp.network.client import get_session from javsp.crawlers.interface import Crawler from javsp.config import CrawlerID from javsp.crawlers.exceptions import * @@ -18,7 +18,7 @@ async def create(cls): self = cls() url = await resolve_site_fallback(self.id, "https://www.arzon.jp") self.base_url = str(url) - self.client = get_client(url) + self.client = get_session(url) # https://www.arzon.jp/index.php?action=adult_customer_agecheck&agecheck=1&redirect=https%3A%2F%2Fwww.arzon.jp%2F skip_verify_url = f"{self.base_url}/index.php?action=adult_customer_agecheck&agecheck=1" await self.client.get(skip_verify_url) @@ -31,10 +31,10 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None: # url = f'{base_url}/imagelist.html?q={full_id}' r = await self.client.get(url) - if r.status_code == 404: + if r.status == 404: raise MovieNotFoundError(__name__, movie.dvdid) # https://stackoverflow.com/questions/15830421/xml-unicode-strings-with-encoding-declaration-are-not-supported - data = html.fromstring(r.content) + data = html.fromstring(await r.read()) urls = data.xpath("//h2/a/@href") if len(urls) == 0: @@ -42,7 +42,7 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None: item_url = self.base_url + urls[0] e = await self.client.get(item_url) - item = html.fromstring(e.content) + item = html.fromstring(await e.read()) title = item.xpath("//div[@class='detail_title_new']//h1/text()")[0] cover = item.xpath("//td[@align='center']//a/img/@src")[0] diff --git a/javsp/crawlers/sites/avsox.py b/javsp/crawlers/sites/avsox.py index 47b0ea32d..75dcd67c2 100644 --- a/javsp/crawlers/sites/avsox.py +++ b/javsp/crawlers/sites/avsox.py @@ -3,7 +3,7 @@ from javsp.crawlers.exceptions import MovieNotFoundError from javsp.datatype import MovieInfo from javsp.network.utils import resolve_site_fallback -from javsp.network.client import get_client +from javsp.network.client import get_session from javsp.crawlers.interface import Crawler from javsp.config import CrawlerID from lxml import html @@ -16,7 
+16,7 @@ async def create(cls): self = cls() url = await resolve_site_fallback(self.id, "https://avsox.click/") self.base_url = str(url) - self.client = get_client(url) + self.client = get_session(url) return self async def crawl_and_fill(self, movie: MovieInfo) -> None: @@ -24,7 +24,7 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None: if full_id.startswith('FC2-'): full_id = full_id.replace('FC2-', 'FC2-PPV-') resp = await self.client.get(f'{self.base_url}tw/search/{full_id}') - tree = html.fromstring(resp.text) + tree = html.fromstring(await resp.text()) tree.make_links_absolute(str(resp.url), resolve_base_href=True) ids = tree.xpath("//div[@class='photo-info']/span/date[1]/text()") urls = tree.xpath("//a[contains(@class, 'movie-box')]/@href") @@ -37,9 +37,7 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None: # 提取影片信息 resp = await self.client.get(url) - # with open('file.html', 'wb') as f: - # f.write(resp.content) - tree = html.fromstring(resp.text) + tree = html.fromstring(await resp.text()) container = tree.xpath("/html/body/div[@class='container']")[0] title = container.xpath("h3/text()")[0] cover = container.xpath("//a[@class='bigImage']/@href")[0] diff --git a/javsp/crawlers/sites/avwiki.py b/javsp/crawlers/sites/avwiki.py index 7bc2041e5..6a75dd345 100644 --- a/javsp/crawlers/sites/avwiki.py +++ b/javsp/crawlers/sites/avwiki.py @@ -4,7 +4,7 @@ from javsp.datatype import MovieInfo from javsp.crawlers.interface import Crawler from javsp.network.utils import resolve_site_fallback -from javsp.network.client import get_client +from javsp.network.client import get_session from javsp.config import CrawlerID from lxml import html @@ -16,7 +16,7 @@ async def create(cls): self = cls() url = await resolve_site_fallback(self.id, 'https://av-wiki.net') self.base_url = str(url) - self.client = get_client(url) + self.client = get_session(url) return self async def crawl_and_fill(self, movie: MovieInfo) -> None: @@ -27,9 +27,9 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None: movie.url = url = f'{self.base_url}/{movie.dvdid}' resp = await self.client.get(url) - if resp.status_code == 404: + if resp.status == 404: raise MovieNotFoundError(__name__, movie.dvdid) - tree = html.fromstring(resp.content) + tree = html.fromstring(await resp.text()) cover_tag = tree.xpath("//header/div/a[@class='image-link-border']/img") if cover_tag: diff --git a/javsp/crawlers/sites/dl_getchu.py b/javsp/crawlers/sites/dl_getchu.py index c2ab0814f..a635515d4 100644 --- a/javsp/crawlers/sites/dl_getchu.py +++ b/javsp/crawlers/sites/dl_getchu.py @@ -5,7 +5,7 @@ from javsp.config import CrawlerID from javsp.crawlers.exceptions import MovieNotFoundError from javsp.crawlers.interface import Crawler -from javsp.network.client import get_client +from javsp.network.client import get_session from javsp.network.utils import resolve_site_fallback from javsp.crawlers.exceptions import * from javsp.datatype import MovieInfo @@ -55,7 +55,7 @@ async def create(cls): self = cls() url = await resolve_site_fallback(self.id, 'https://dl.getchu.com') self.base_url = str(url) - self.client = get_client(url) + self.client = get_session(url) return self async def crawl_and_fill(self, movie: MovieInfo) -> None: @@ -68,9 +68,9 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None: # 抓取网页 url = f'{self.base_url}/i/item{getchu_id}' r = await self.client.get(url) - if r.status_code == 404: + if r.status == 404: raise MovieNotFoundError(__name__, movie.dvdid) - tree = html.fromstring(r.text) + tree = 
html.fromstring((await r.read()).decode(encoding='euc_jp', errors='ignore')) container = tree.xpath("//form[@action='https://dl.getchu.com/cart/']/div/table[3]") if len(container) > 0: container = container[0] diff --git a/javsp/crawlers/sites/fanza.py b/javsp/crawlers/sites/fanza.py index 66b895df5..b81ac93ae 100644 --- a/javsp/crawlers/sites/fanza.py +++ b/javsp/crawlers/sites/fanza.py @@ -5,19 +5,18 @@ import logging from typing import Dict, List, Tuple -from httpx import Response - from javsp.crawlers.exceptions import MovieNotFoundError, SiteBlocked from javsp.crawlers.interface import Crawler from javsp.config import CrawlerID from javsp.network.utils import resolve_site_fallback -from javsp.network.client import get_client +from javsp.network.client import get_session from javsp.config import Cfg from javsp.datatype import MovieInfo from lxml import html from lxml.html import HtmlElement +from aiohttp import ClientResponse logger = logging.getLogger(__name__) @@ -31,8 +30,8 @@ def sort_search_result(result: List[Dict]): return sorted_result -def resp2html_wrapper(resp: Response) -> HtmlElement: - tree = html.fromstring(resp.text) +async def resp2html_wrapper(resp: ClientResponse) -> HtmlElement: + tree = html.fromstring(await resp.text()) if 'not available in your region' in tree.text_content(): raise SiteBlocked('FANZA不允许从当前IP所在地区访问,请检查你的网络和代理服务器设置') elif '/login/' in str(resp.url): @@ -88,14 +87,29 @@ def parse_anime_page(movie: MovieInfo, tree: HtmlElement): class FanzaCrawler(Crawler): id = CrawlerID.fanza + headers: Dict[str, str] + + + @classmethod + async def create(cls): + self = cls() + url = await resolve_site_fallback(self.id, 'https://www.dmm.co.jp') + self.base_url = str(url) + self.client = get_session(url) + + # 初始化Request实例(要求携带已通过R18认证的cookies,否则会被重定向到认证页面) + self.client.cookie_jar.update_cookies({'age_check_done': '1'}) + self.headers = {'Accept-Language': 'ja,en-US;q=0.9'} + return self async def get_urls_of_cid(self, cid: str) -> Tuple[str, str]: """搜索cid可能的影片URL""" - r = await self.client.get(f"{self.base_url}search/?redirect=1&enc=UTF-8&category=&searchstr={cid}&commit.x=0&commit.y=0") - if r.status_code == 404: + r = await self.client.get(f"{self.base_url}search/?redirect=1&enc=UTF-8&category=&searchstr={cid}&commit.x=0&commit.y=0", headers=self.headers) + if r.status == 404: raise MovieNotFoundError(__name__, cid) r.raise_for_status() - tree = resp2html_wrapper(r) + + tree = await resp2html_wrapper(r) result = tree.xpath("//ul[@id='list']/li/div/p/a/@href") parsed_result = {} for url in result: @@ -116,36 +130,25 @@ async def get_urls_of_cid(self, cid: str) -> Tuple[str, str]: sorted_result = sort_search_result(parsed_result[cid]) return sorted_result - @classmethod - async def create(cls): - self = cls() - url = await resolve_site_fallback(self.id, 'https://www.dmm.co.jp') - self.base_url = str(url) - self.client = get_client(url) - - # 初始化Request实例(要求携带已通过R18认证的cookies,否则会被重定向到认证页面) - self.client.cookies = {'age_check_done': '1'} - self.client.headers['Accept-Language'] = 'ja,en-US;q=0.9' - return self + async def dispatch(self, type: str, movie: MovieInfo, tree: HtmlElement): + match type: + case 'videoa' | 'dvd' | 'ppr' | 'nikkatsu': + await self.parse_videoa_page(movie, tree) + case 'anime' | 'doujin': + parse_anime_page(movie, tree) async def crawl_and_fill(self, movie: MovieInfo) -> None: """解析指定番号的影片数据""" default_url = f'{self.base_url}digital/videoa/-/detail/=/cid={movie.cid}/' - r0 = await self.client.get(default_url) - if r0.status_code == 404: + 
r0 = await self.client.get(default_url, headers=self.headers) + if r0.status == 404: urls = await self.get_urls_of_cid(movie.cid) for d in urls: - func_name = f"parse_{d['type']}_page" - if func_name in globals(): - parse_func = globals()[func_name] - else: - logger.debug(f"不知道怎么解析 fanza {d['type']} 的页面: {d['url']}") - continue - r = await self.client.get(d['url']) - tree = resp2html_wrapper(r) try: - parse_func(movie, tree) + r = await self.client.get(d['url'], headers=self.headers) + tree = await resp2html_wrapper(r) + await self.dispatch(d['type'], movie, tree) movie.url = d['url'] break except: @@ -209,8 +212,8 @@ async def parse_videoa_page(self, movie: MovieInfo, tree: HtmlElement): if Cfg().crawler.hardworking: # 预览视频是动态加载的,不在静态网页中 video_url = f'{self.base_url}service/digitalapi/-/html5_player/=/cid={movie.cid}' - resp = await self.client.get(video_url) - tree2 = html.fromstring(resp.text) + resp = await self.client.get(video_url, headers=self.headers) + tree2 = html.fromstring(await resp.text()) # 目前用到js脚本的地方不多,所以不使用专门的js求值模块,先用正则提取文本然后用json解析数据 script = tree2.xpath("//script[contains(text(),'getElementById(\"dmmplayer\")')]/text()")[0].strip() match = re.search(r'\{.*\}', script) @@ -244,3 +247,4 @@ async def test_main(): import asyncio asyncio.run(test_main()) + diff --git a/javsp/crawlers/sites/fc2.py b/javsp/crawlers/sites/fc2.py index 0ce072b90..4ef981ff1 100644 --- a/javsp/crawlers/sites/fc2.py +++ b/javsp/crawlers/sites/fc2.py @@ -10,7 +10,7 @@ from javsp.datatype import MovieInfo from javsp.crawlers.interface import Crawler from javsp.network.utils import resolve_site_fallback -from javsp.network.client import get_client +from javsp.network.client import get_session from javsp.config import CrawlerID @@ -24,13 +24,13 @@ async def create(cls): self = cls() url = await resolve_site_fallback(self.id, 'https://adult.contents.fc2.com') self.base_url = str(url) - self.client = get_client(url) + self.client = get_session(url) return self async def get_movie_score(self, fc2_id: str) -> float | None: """通过评论数据来计算FC2的影片评分(10分制),无法获得评分时返回None""" resp = await self.client.get(f'{self.base_url}/article/{fc2_id}/review') - tree = html.fromstring(resp.text) + tree = html.fromstring(await resp.text()) review_tags = tree.xpath("//ul[@class='items_comment_headerReviewInArea']/li") reviews = {} for tag in review_tags: @@ -56,7 +56,7 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None: resp = await self.client.get(url) if '/id.fc2.com/' in str(resp.url): raise SiteBlocked('FC2要求当前IP登录账号才可访问,请尝试更换为日本IP') - tree = html.fromstring(resp.text) + tree = html.fromstring(await resp.text()) container = tree.xpath("//div[@class='items_article_left']") if len(container) > 0: container = container[0] @@ -85,7 +85,7 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None: key = desc_frame_url.split('=')[-1] # /widget/article/718323/description?ac=60fc08fa... 
api_url = f'{self.base_url}/api/v2/videos/{fc2_id}/sample?key={key}' resp = await self.client.get(api_url) - j = resp.json() + j = await resp.json() movie.preview_video = j['path'] else: # 获取影片评分。影片页面的评分只能粗略到星级,且没有分数,要通过类名来判断,如'items_article_Star5'表示5星 diff --git a/javsp/crawlers/sites/fc2ppvdb.py b/javsp/crawlers/sites/fc2ppvdb.py index fbba590c2..8ae6d7415 100644 --- a/javsp/crawlers/sites/fc2ppvdb.py +++ b/javsp/crawlers/sites/fc2ppvdb.py @@ -2,6 +2,7 @@ # BUG: This crawler doesn't work, seemed due to cloudflare +from ssl import ALERT_DESCRIPTION_HANDSHAKE_FAILURE from typing import List @@ -9,7 +10,7 @@ from javsp.lib import strftime_to_minutes from javsp.datatype import MovieInfo from javsp.network.utils import resolve_site_fallback -from javsp.network.client import get_client +from javsp.network.client import get_session from javsp.crawlers.interface import Crawler from javsp.config import CrawlerID from lxml import html @@ -23,7 +24,7 @@ async def create(cls): self = cls() url = await resolve_site_fallback(self.id, 'https://fc2ppvdb.com') self.base_url = str(url) - self.client = get_client(url) + self.client = get_session(url) return self async def crawl_and_fill(self, movie: MovieInfo) -> None: @@ -40,7 +41,7 @@ def get_list_first(list: List): # 抓取网页 url = f'{self.base_url}/articles/{fc2_id}' resp = await self.client.get(url) - tree = html.fromstring(resp.content) + tree = html.fromstring(await resp.text()) # html = get_html(url) container = tree.xpath("//div[@class='container lg:px-5 px-2 py-12 mx-auto']/div[1]") if len(container) > 0: diff --git a/javsp/crawlers/sites/gyutto.py b/javsp/crawlers/sites/gyutto.py index b30200284..632fb9123 100644 --- a/javsp/crawlers/sites/gyutto.py +++ b/javsp/crawlers/sites/gyutto.py @@ -5,7 +5,7 @@ from javsp.crawlers.exceptions import MovieNotFoundError from javsp.datatype import MovieInfo from javsp.network.utils import resolve_site_fallback -from javsp.network.client import get_client +from javsp.network.client import get_session from javsp.crawlers.interface import Crawler from javsp.config import CrawlerID from lxml import html @@ -41,7 +41,7 @@ async def create(cls): self = cls() url = await resolve_site_fallback(self.id, 'http://gyutto.com') self.base_url = str(url) - self.client = get_client(url) + self.client = get_session(url) return self async def crawl_and_fill(self, movie: MovieInfo) -> None: @@ -54,9 +54,9 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None: # 抓取网页 url = f'{self.base_url}/i/item{gyutto_id}?select_uaflag=1' r = await self.client.get(url) - if r.status_code == 404: + if r.status == 404: raise MovieNotFoundError(__name__, movie.dvdid) - tree = html.fromstring(r.text) + tree = html.fromstring(await r.text()) container = tree.xpath("//dl[@class='BasicInfo clearfix']") producer = None diff --git a/javsp/crawlers/sites/jav321.py b/javsp/crawlers/sites/jav321.py index 6a20a98ec..61f609bfd 100644 --- a/javsp/crawlers/sites/jav321.py +++ b/javsp/crawlers/sites/jav321.py @@ -6,7 +6,7 @@ from javsp.crawlers.exceptions import MovieNotFoundError from javsp.datatype import MovieInfo from javsp.network.utils import resolve_site_fallback -from javsp.network.client import get_client +from javsp.network.client import get_session from javsp.crawlers.interface import Crawler from javsp.config import CrawlerID from lxml import html @@ -22,14 +22,14 @@ async def create(cls): self = cls() url = await resolve_site_fallback(self.id, 'https://www.jav321.com') self.base_url = str(url) - self.client = get_client(url) + self.client = 
get_session(url) return self async def crawl_and_fill(self, movie: MovieInfo) -> None: """解析指定番号的影片数据""" resp = await self.client.post(f'{self.base_url}/search', data={'sn': movie.dvdid}) - tree = html.fromstring(resp.text) + tree = html.fromstring(await resp.text()) page_url = tree.xpath("//ul[@class='dropdown-menu']/li/a/@href")[0] #TODO: 注意cid是dmm的概念。如果影片来自MGSTAGE,这里的cid很可能是jav321自己添加的,例如 345SIMM-542 cid = page_url.split('/')[-1] # /video/ipx00177 diff --git a/javsp/crawlers/sites/javbus.py b/javsp/crawlers/sites/javbus.py index b3efaa8dd..3038579cd 100644 --- a/javsp/crawlers/sites/javbus.py +++ b/javsp/crawlers/sites/javbus.py @@ -9,7 +9,7 @@ from javsp.crawlers.exceptions import MovieNotFoundError from javsp.network.utils import resolve_site_fallback -from javsp.network.client import get_client +from javsp.network.client import get_session from javsp.crawlers.interface import Crawler from lxml import html @@ -26,8 +26,8 @@ async def create(cls): self = cls() url = await resolve_site_fallback(self.id, 'https://www.javbus.com') self.base_url = str(url) - self.client = get_client(url) - self.client.cookies = {'age': 'verified', 'dv': '1'} + self.client = get_session(url) + self.client.cookie_jar.update_cookies({'age': 'verified', 'dv': '1'}) self.genre_map = GenreMap('data/genre_javbus.csv') return self @@ -40,7 +40,7 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None: url = f'{self.base_url}/{movie.dvdid}' resp = await self.client.get(url) - tree = html.fromstring(resp.content) + tree = html.fromstring(await resp.text()) # 疑似JavBus检测到类似爬虫的行为时会要求登录,不过发现目前不需要登录也可以从重定向前的网页中提取信息 # 引入登录验证后状态码不再准确,因此还要额外通过检测标题来确认是否发生了404 page_title = tree.xpath('/html/head/title/text()') diff --git a/javsp/crawlers/sites/javdb.py b/javsp/crawlers/sites/javdb.py index ab23e18bd..d101e2c4e 100644 --- a/javsp/crawlers/sites/javdb.py +++ b/javsp/crawlers/sites/javdb.py @@ -2,8 +2,7 @@ import os import re import logging - -from httpx import Cookies +from typing import Dict from javsp.func import * from javsp.avid import guess_av_type @@ -13,7 +12,7 @@ from javsp.crawlers.exceptions import CredentialError, MovieDuplicateError, MovieNotFoundError, SiteBlocked, SitePermissionError, WebsiteError from javsp.network.utils import resolve_site_fallback -from javsp.network.client import get_client +from javsp.network.client import get_session from javsp.crawlers.interface import Crawler from lxml import html @@ -23,23 +22,25 @@ class JavDbCrawler(Crawler): id = CrawlerID.javdb genre_map: GenreMap - cookies_pool: list[Cookies] + cookies_pool: list + headers: Dict[str, str] @classmethod async def create(cls): self = cls() url = await resolve_site_fallback(self.id, 'https://www.javdb.com') self.base_url = str(url) - self.client = get_client(url) - self.client.headers['Accept-Language'] = 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en-US;q=0.7,en;q=0.6,ja;q=0.5' + self.client = get_session(url) + self.headers = {'Accept-Language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en-US;q=0.7,en;q=0.6,ja;q=0.5'} self.genre_map = GenreMap('data/genre_javdb.csv') self.cookies_pool = [] return self async def get_html_wrapper(self, url: str): """包装外发的request请求并负责转换为可xpath的html,同时处理Cookies无效等问题""" + r = await self.client.get(url) - if r.status_code == 200: + if r.status == 200: # 发生重定向可能仅仅是域名重定向,因此还要检查url以判断是否被跳转到了登录页 if r.history and '/login' in str(r.url): # 仅在需要时去读取Cookies @@ -48,14 +49,14 @@ async def get_html_wrapper(self, url: str): self.cookies_pool = get_browsers_cookies() except (PermissionError, OSError) as e: 
logger.warning(f"无法从浏览器Cookies文件获取JavDB的登录凭据({e}),可能是安全软件在保护浏览器Cookies文件", exc_info=True) - cookies_pool = [] + self.cookies_pool = [] except Exception as e: logger.warning(f"获取JavDB的登录凭据时出错({e}),你可能使用的是国内定制版等非官方Chrome系浏览器", exc_info=True) - cookies_pool = [] + self.cookies_pool = [] if len(self.cookies_pool) > 0: item = self.cookies_pool.pop() # 更换Cookies时需要创建新的request实例,否则cloudscraper会保留它内部第一次发起网络访问时获得的Cookies - self.client.cookies = item['cookies'] + self.client.cookie_jar.update_cookies = item['cookies'] cookies_source = (item['profile'], item['site']) logger.debug(f'未携带有效Cookies而发生重定向,尝试更换Cookies为: {cookies_source}') return self.get_html_wrapper(url) @@ -65,30 +66,30 @@ async def get_html_wrapper(self, url: str): raise SitePermissionError(f"JavDB: 此资源被限制为仅VIP可见: '{r.history[0].url}'") else: - return html.fromstring(r.text) - elif r.status_code in (403, 503): - tree = html.fromstring(r.text) + return html.fromstring(await r.text()) + elif r.status in (403, 503): + tree = html.fromstring(await r.text()) code_tag = tree.xpath("//span[@class='code-label']/span") error_code = code_tag[0].text if code_tag else None if error_code: if error_code == '1020': - block_msg = f'JavDB: {r.status_code} 禁止访问: 站点屏蔽了来自日本地区的IP地址,请使用其他地区的代理服务器' + block_msg = f'JavDB: {r.status} 禁止访问: 站点屏蔽了来自日本地区的IP地址,请使用其他地区的代理服务器' else: - block_msg = f'JavDB: {r.status_code} 禁止访问: {url} (Error code: {error_code})' + block_msg = f'JavDB: {r.status} 禁止访问: {url} (Error code: {error_code})' else: - block_msg = f'JavDB: {r.status_code} 禁止访问: {url}' + block_msg = f'JavDB: {r.status} 禁止访问: {url}' raise SiteBlocked(block_msg) else: - raise WebsiteError(f'JavDB: {r.status_code} 非预期状态码: {url}') + raise WebsiteError(f'JavDB: {r.status} 非预期状态码: {url}') - async def get_user_info(self, site: str, cookies: Cookies): + async def get_user_info(self, site: str, cookies): """获取cookies对应的JavDB用户信息""" try: self.client.cookies = cookies resp = await self.client.get(f'https://{site}/users/profile') - html_str = resp.text + html_str = await resp.text() tree = html.fromstring(html_str) except Exception as e: logger.info('JavDB: 获取用户信息时出错') @@ -130,7 +131,7 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None: index = ids.index(movie.dvdid.lower()) new_url = movie_urls[index] try: - html2 = await self.get_html_wrapper(new_url) + html2 = await self.get_html_wrapper(self.base_url + new_url) except (SitePermissionError, CredentialError): # 不开VIP不让看,过分。决定榨出能获得的信息,毕竟有时候只有这里能找到标题和封面 box = tree.xpath("//a[@class='box']")[index] @@ -219,7 +220,7 @@ async def crawl_and_fill_cleaned(self, movie: MovieInfo): # 检查封面URL是否真的存在对应图片 if movie.cover is not None: r = await self.client.head(movie.cover) - if r.status_code != 200: + if r.status != 200: movie.cover = None except SiteBlocked: raise @@ -260,7 +261,7 @@ async def collect_actress_alias(self, type=0, use_original=True): count += 1 actor_name = actor.xpath("strong/text()")[0].strip() actor_url = actor.xpath("@href")[0] - # actor_url = f"https://javdb.com{actor_url}" # 构造演员主页的完整URL + actor_url = self.base_url + actor_url # 构造演员主页的完整URL # 进入演员主页,获取更多信息 actor_html = await self.get_html_wrapper(actor_url) @@ -338,6 +339,7 @@ async def collect_actress_alias(self, type=0, use_original=True): if __name__ == "__main__": async def test_main(): + # breakpoint() crawler = await JavDbCrawler.create() movie = MovieInfo('FC2-2735981') try: diff --git a/javsp/crawlers/sites/javlib.py b/javsp/crawlers/sites/javlib.py index c71a5f336..3832acbea 100644 --- a/javsp/crawlers/sites/javlib.py +++ 
b/javsp/crawlers/sites/javlib.py @@ -1,4 +1,7 @@ """从JavLibrary抓取数据""" + +# BUG: This crawler doesn't work, seemed due to cloudflare + import logging from urllib.parse import urlsplit @@ -7,7 +10,7 @@ from javsp.crawlers.exceptions import MovieDuplicateError, MovieNotFoundError from javsp.datatype import MovieInfo from javsp.network.utils import resolve_site_fallback -from javsp.network.client import get_client +from javsp.network.client import get_session from javsp.crawlers.interface import Crawler from javsp.config import CrawlerID from lxml import html @@ -15,14 +18,14 @@ logger = logging.getLogger(__name__) class JavLibCrawler(Crawler): - id = CrawlerID.jav321 + id = CrawlerID.javlib @classmethod async def create(cls): self = cls() url = await resolve_site_fallback(self.id, 'https://www.javlibrary.com') self.base_url = str(url) - self.client = get_client(url) + self.client = get_session(url) return self # TODO: 发现JavLibrary支持使用cid搜索,会直接跳转到对应的影片页面,也许可以利用这个功能来做cid到dvdid的转换 @@ -30,7 +33,7 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None: """解析指定番号的影片数据""" url = new_url = f'{self.base_url}/cn/vl_searchbyid.php?keyword={movie.dvdid}' resp = await self.client.get(url) - tree = html.fromstring(resp.text) + tree = html.fromstring(await resp.text()) if resp.history and urlsplit(str(resp.url)).netloc == urlsplit(self.base_url).netloc: # 出现301重定向通常且新老地址netloc相同时,说明搜索到了影片且只有一个结果 new_url = resp.url @@ -65,7 +68,7 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None: raise MovieDuplicateError(__name__, movie.dvdid, match_count, pre_choose_urls) # 重新抓取网页 resp = await self.client.get(new_url) - tree = html.fromstring(resp.text) + tree = html.fromstring(await resp.text()) container = tree.xpath("/html/body/div/div[@id='rightcolumn']")[0] title_tag = container.xpath("div/h3/a/text()") title = title_tag[0] diff --git a/javsp/crawlers/sites/javmenu.py b/javsp/crawlers/sites/javmenu.py index 6553d86a1..15ea78c0c 100644 --- a/javsp/crawlers/sites/javmenu.py +++ b/javsp/crawlers/sites/javmenu.py @@ -4,7 +4,7 @@ from javsp.crawlers.exceptions import MovieNotFoundError from javsp.datatype import MovieInfo from javsp.network.utils import resolve_site_fallback -from javsp.network.client import get_client +from javsp.network.client import get_session from javsp.crawlers.interface import Crawler from javsp.config import CrawlerID from lxml import html @@ -19,7 +19,7 @@ async def create(cls): self = cls() url = await resolve_site_fallback(self.id, 'https://www.javmenu.com') self.base_url = str(url) - self.client = get_client(url) + self.client = get_session(url) return self async def crawl_and_fill(self, movie: MovieInfo) -> None: @@ -34,7 +34,7 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None: # 被重定向到主页说明找不到影片资源 raise MovieNotFoundError(__name__, movie.dvdid) - tree = html.fromstring(r.text) + tree = html.fromstring(await r.text()) container = tree.xpath("//div[@class='col-md-9 px-0']")[0] title = container.xpath("div[@class='col-12 mb-3']/h1/strong/text()")[0] # 竟然还在标题里插广告,真的疯了。要不是我已经写了抓取器,才懒得维护这个破站 diff --git a/javsp/crawlers/sites/mgstage.py b/javsp/crawlers/sites/mgstage.py index bd9d76840..a352470bf 100644 --- a/javsp/crawlers/sites/mgstage.py +++ b/javsp/crawlers/sites/mgstage.py @@ -6,7 +6,7 @@ from javsp.crawlers.exceptions import MovieNotFoundError, SiteBlocked from javsp.datatype import MovieInfo from javsp.network.utils import resolve_site_fallback -from javsp.network.client import get_client +from javsp.network.client import get_session from javsp.crawlers.interface import 
Crawler from javsp.config import Cfg, CrawlerID from lxml import html @@ -22,22 +22,22 @@ async def create(cls): self = cls() url = await resolve_site_fallback(self.id, 'https://www.mgstage.com') self.base_url = str(url) - self.client = get_client(url) + self.client = get_session(url) # 初始化Request实例(要求携带已通过R18认证的cookies,否则会被重定向到认证页面) - self.client.cookies = {'adc': '1'} + self.client.cookie_jar.update_cookies({'adc': '1'}) return self async def crawl_and_fill(self, movie: MovieInfo) -> None: """解析指定番号的影片数据""" url = f'{self.base_url}/product/product_detail/{movie.dvdid}/' resp = await self.client.get(url) - if resp.status_code == 403: + if resp.status == 403: raise SiteBlocked('mgstage不允许从当前IP所在地区访问,请尝试更换为日本地区代理') # url不存在时会被重定向至主页。history非空时说明发生了重定向 elif resp.history: raise MovieNotFoundError(__name__, movie.dvdid) - tree = html.fromstring(resp.text) + tree = html.fromstring(await resp.text()) # mgstage的文本中含有大量的空白字符('\n \t'),需要使用strip去除 title = tree.xpath("//div[@class='common_detail_cover']/h1/text()")[0].strip() container = tree.xpath("//div[@class='detail_left']")[0] @@ -93,7 +93,7 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None: video_pid = btn_url.split('/')[-1] req_url = f'{self.base_url}/sampleplayer/sampleRespons.php?pid={video_pid}' resp = await self.client.get(req_url) - j = resp.json() + j = await resp.json() video_url = j.get('url') if video_url: # /sample/shirouto/siro/3093/SIRO-3093_sample.ism/request?uid=XXX&pid=XXX diff --git a/javsp/crawlers/sites/njav.py b/javsp/crawlers/sites/njav.py index 5787397c9..72826db02 100644 --- a/javsp/crawlers/sites/njav.py +++ b/javsp/crawlers/sites/njav.py @@ -6,7 +6,7 @@ from javsp.crawlers.exceptions import MovieNotFoundError from javsp.datatype import MovieInfo from javsp.network.utils import resolve_site_fallback -from javsp.network.client import get_client +from javsp.network.client import get_session from javsp.crawlers.interface import Crawler from javsp.config import CrawlerID from javsp.lib import strftime_to_minutes @@ -26,7 +26,7 @@ async def create(cls): self = cls() url = await resolve_site_fallback(self.id, 'https://www.njav.tv/') self.base_url = str(url) - self.client = get_client(url) + self.client = get_session(url) return self async def search_video(self, movie: MovieInfo) -> str: @@ -34,7 +34,7 @@ async def search_video(self, movie: MovieInfo) -> str: # 抓取网页 url = f'{self.base_url}ja/search?keyword={id_uc}' resp = await self.client.get(url) - tree = html.fromstring(resp.text) + tree = html.fromstring(await resp.text()) list = tree.xpath("//div[@class='box-item']/div[@class='detail']/a") video_url = None for item in list: @@ -57,7 +57,7 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None: if not url: raise MovieNotFoundError(__name__, movie.dvdid) resp = await self.client.get(url) - tree = html.fromstring(resp.text) + tree = html.fromstring(await resp.text()) container = tree.xpath("//div[@class='container']/div/div[@class='col']") if len(container) > 0: container = container[0] diff --git a/javsp/crawlers/sites/prestige.py b/javsp/crawlers/sites/prestige.py index bc0734554..5d0d4c9bb 100644 --- a/javsp/crawlers/sites/prestige.py +++ b/javsp/crawlers/sites/prestige.py @@ -7,7 +7,7 @@ from javsp.crawlers.exceptions import MovieNotFoundError, SiteBlocked from javsp.datatype import MovieInfo from javsp.network.utils import resolve_site_fallback -from javsp.network.client import get_client +from javsp.network.client import get_session from javsp.crawlers.interface import Crawler from javsp.config import 
CrawlerID from lxml import html @@ -24,10 +24,10 @@ async def create(cls): self = cls() url = await resolve_site_fallback(self.id, 'https://www.prestige-av.com') self.base_url = str(url) - self.client = get_client(url) + self.client = get_session(url) # prestige要求访问者携带已通过R18认证的cookies才能够获得完整数据,否则会被重定向到认证页面 # (其他多数网站的R18认证只是在网页上遮了一层,完整数据已经传回,不影响爬虫爬取) - self.client.cookies = {'__age_auth__': 'true'} + self.client.cookie_jar.update_cookies({'__age_auth__': 'true'}) return self async def crawl_and_fill(self, movie: MovieInfo) -> None: @@ -37,13 +37,13 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None: """ url = f'{self.base_url}/goods/goods_detail.php?sku={movie.dvdid}' resp = await self.client.get(url) - if resp.status_code == 500: + if resp.status == 500: # 500错误表明prestige没有这部影片的数据,不是网络问题,因此不再重试 raise MovieNotFoundError(__name__, movie.dvdid) - elif resp.status_code == 403: + elif resp.status == 403: raise SiteBlocked('prestige不允许从当前IP所在地区访问,请尝试更换为日本地区代理') resp.raise_for_status() - tree = html.fromstring(resp.text) + tree = html.fromstring(await resp.text()) container_tags = tree.xpath("//section[@class='px-4 mb-4 md:px-8 md:mb-16']") if not container_tags: raise MovieNotFoundError(__name__, movie.dvdid) diff --git a/javsp/func.py b/javsp/func.py index 6232747fd..d10ba8222 100644 --- a/javsp/func.py +++ b/javsp/func.py @@ -25,7 +25,7 @@ except ImportError: USE_GUI = False -from javsp.network.utils import get_client, url_download +from javsp.network.utils import get_session, url_download from javsp.lib import re_escape, resource_path @@ -183,9 +183,9 @@ def print_header(title, info=[]): release_url = 'https://github.com/Yuukiy/JavSP/releases/latest' print('正在检查更新...', end='') try: - client = get_client(Url(api_url)) + client = get_session(Url(api_url)) resp = await client.get(api_url) - data = resp.json() + data = await resp.json() latest_version = data['tag_name'] release_time = utc2local(data['published_at']) release_date = release_time.isoformat().split('T')[0] diff --git a/javsp/network/client.py b/javsp/network/client.py index 33232b677..981afeee4 100644 --- a/javsp/network/client.py +++ b/javsp/network/client.py @@ -1,11 +1,12 @@ """网络请求的统一接口""" -from typing import Dict +from typing import Any, Coroutine, Dict from pydantic_core import Url -from httpx import AsyncClient, AsyncHTTPTransport - from javsp.config import Cfg +from aiohttp import BaseConnector, ClientSession, TCPConnector +from aiohttp_socks import ProxyConnector +import asyncio default_headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36' @@ -17,29 +18,44 @@ def get_proxy(unproxied: bool): else: return str(Cfg().network.proxy_server) -client_dictionary: Dict[str, AsyncClient] = {} -def get_client(url: Url) -> AsyncClient: +session_dictionary: Dict[str, ClientSession] = {} +proxy_connector: BaseConnector | None = None +def get_session(url: Url) -> ClientSession: if url.host is None: raise Exception(f"Unknown url {url}") else: index = url.host - if index in client_dictionary: - return client_dictionary[index] + if index in session_dictionary: + return session_dictionary[index] else: - unproxied = url.host in Cfg().network.unproxied + proxy = get_proxy(url.host in Cfg().network.unproxied) + - transport = AsyncHTTPTransport( - proxy=get_proxy(unproxied), - retries=Cfg().network.retries) + connector: BaseConnector + if proxy is None: + connector = TCPConnector() + else: + global proxy_connector + if proxy_connector is None: + 
proxy_connector = ProxyConnector.from_url(proxy) + connector = proxy_connector - client = AsyncClient( - transport=transport, + session = ClientSession( + connector=connector, # 必须使用copy(),否则各个模块对headers的修改都将会指向本模块中定义的headers变量,导致只有最后一个对headers的修改生效 - headers=default_headers.copy(), - timeout=Cfg().network.timeout.total_seconds(), - follow_redirects=True, - ) + headers=default_headers.copy()) + + + session_dictionary[index] = session + + return session + +async def clear_clients(): + close_tasks: list[Coroutine[Any, Any, None]] = [] + for client in session_dictionary.values(): + close_tasks.append(client.close()) - client_dictionary[index] = client + await asyncio.gather(*close_tasks) - return client + if proxy_connector is not None: + await proxy_connector.close() diff --git a/javsp/network/utils.py b/javsp/network/utils.py index 34caf68da..00e379098 100644 --- a/javsp/network/utils.py +++ b/javsp/network/utils.py @@ -1,17 +1,17 @@ from datetime import timedelta import logging import time +from aiohttp import ClientTimeout from tqdm.asyncio import tqdm from typing import Any, Coroutine, NamedTuple import aiofiles -from pretty_errors import os from pydantic.types import ByteSize from pydantic_core import Url from pydantic_extra_types.pendulum_dt import Duration from javsp.config import Cfg, CrawlerID -from javsp.network.client import get_client +from javsp.network.client import get_session, clear_clients import asyncio @@ -28,46 +28,37 @@ def get_rate(self) -> float: async def url_download(url: Url, target_path: str, desc: str | None = None) -> DownloadInfo: url_str = str(url) - if url.scheme == 'file': - path: str = url.path - start_time: float = time.time() - async with aiofiles.open(path, "rb") as src: - async with aiofiles.open(target_path, "wb") as dest: - await dest.write(await src.read()) - filesize = os.path.getsize(path) - elapsed = time.time() - start_time - return DownloadInfo(ByteSize(filesize), Duration(seconds=elapsed)) - if not desc: desc = url_str.split('/')[-1] - client = get_client(url) + s = get_session(url) # REF: https://www.python-httpx.org/advanced/clients/#monitoring-download-progress async with aiofiles.open(target_path, 'wb') as download_file: # NOTE: Create a client for each request for now, need further refactor - async with client.stream("GET", url_str) as response: - total = int(response.headers["Content-Length"]) + + start = time.monotonic() + async with s.get(url_str) as response: + total = response.content_length with tqdm(total=total, unit_scale=True, unit_divisor=1024, unit="B") as progress: - num_bytes_downloaded = response.num_bytes_downloaded - for chunk in response.iter_bytes(): + async for chunk in response.content.iter_any(): await download_file.write(chunk) - progress.update(response.num_bytes_downloaded - num_bytes_downloaded) - num_bytes_downloaded = response.num_bytes_downloaded + progress.update(len(chunk)) - return DownloadInfo(ByteSize(response.num_bytes_downloaded), response.elapsed) + response_time = time.monotonic() - start + return DownloadInfo(ByteSize(total), timedelta(seconds=response_time)) async def test_connect(url_str: str, timeout: Duration) -> bool: """测试与指定url的连接,不使用映射,但使用代理""" try: - client = get_client(Url(url_str)) + s = get_session(Url(url_str)) response = \ - await client.get( + await s.get( url_str, - timeout=timeout.total_seconds(), + timeout=ClientTimeout(total=timeout.total_seconds()), ) - return response.status_code == 200 + return response.status == 200 except Exception as e: logger.debug(f"Not connectable: 
{url_str}\n" + repr(e)) return False @@ -98,8 +89,17 @@ async def resolve_site_fallback(cr_id: CrawlerID, default: str) -> Url: if __name__ == '__main__': async def aentry(): print(await choose_one_connectable(['http://iandown.what', 'http://www.baidu.com'])) + from javsp.network.client import clear_clients + await clear_clients() # async def aentry(): # print(await test_connect("https://www.y78k.com/", Duration(seconds=3))) + # async def aentry(): + # await asyncio.gather( + # url_download(Url('https://www.google.com/images/branding/googlelogo/2x/googlelogo_light_color_272x92dp.png'), 'gogle_logo.png'), + # url_download(Url('https://ei.phncdn.com/www-static/images/pornhub_logo_straight.svg?cache=2024092501'), 'pornhub_logo.svg'), + # ) + # await clear_clients() + asyncio.run(aentry()) diff --git a/javsp/translate.py b/javsp/translate.py index 1f202209a..66b8cb161 100644 --- a/javsp/translate.py +++ b/javsp/translate.py @@ -1,21 +1,19 @@ """网页翻译接口""" # 由于翻译服务不走代理,而且需要自己的错误处理机制,因此不通过base.py来管理网络请求 import time -from typing import Union import uuid import random import logging from pydantic_core import Url -import httpx from hashlib import md5 __all__ = ['translate', 'translate_movie_info'] -from javsp.config import BaiduTranslateEngine, BingTranslateEngine, Cfg, ClaudeTranslateEngine, GoogleTranslateEngine, OpenAITranslateEngine, TranslateEngine +from javsp.config import Cfg, TranslateEngine from javsp.datatype import MovieInfo -from javsp.network.client import get_proxy +from javsp.network.client import get_session logger = logging.getLogger(__name__) @@ -126,7 +124,7 @@ def translate(texts, engine: TranslateEngine, actress=[]): else: return {'trans': texts} -def baidu_translate(texts, app_id, api_key, to='zh'): +async def baidu_translate(texts, app_id, api_key, to='zh'): """使用百度翻译文本(默认翻译为简体中文)""" api_url = "https://api.fanyi.baidu.com/api/trans/vip/translate" headers = {'Content-Type': 'application/x-www-form-urlencoded'} @@ -140,13 +138,14 @@ def baidu_translate(texts, app_id, api_key, to='zh'): wait = 1.0 - (now - last_access) if wait > 0: time.sleep(wait) - r = httpx.post(api_url, params=payload, headers=headers) - result = r.json() + s = get_session(Url(api_url)) + r = await s.post(api_url, params=payload, headers=headers) + result = await r.json() baidu_translate._last_access = time.perf_counter() return result -def bing_translate(texts, api_key, to='zh-Hans'): +async def bing_translate(texts, api_key, to='zh-Hans'): """使用Bing翻译文本(默认翻译为简体中文)""" api_url = "https://api.cognitive.microsofttranslator.com/translate" params = {'api-version': '3.0', 'to': to, 'includeSentenceLength': True} @@ -157,34 +156,36 @@ def bing_translate(texts, api_key, to='zh-Hans'): 'X-ClientTraceId': str(uuid.uuid4()) } body = [{'text': texts}] - r = httpx.post(api_url, params=params, headers=headers, json=body) - result = r.json() + s = get_session(Url(api_url)) + r = await s.post(api_url, params=params, headers=headers, json=body) + result = await r.json() return result _google_trans_wait = 60 -def google_trans(texts, to='zh_CN'): +async def google_trans(texts, to='zh_CN'): """使用Google翻译文本(默认翻译为简体中文)""" # API: https://www.jianshu.com/p/ce35d89c25c3 # client参数的选择: https://github.com/lmk123/crx-selection-translate/issues/223#issue-184432017 global _google_trans_wait url = f"https://translate.google.com.hk/translate_a/single?client=gtx&dt=t&dj=1&ie=UTF-8&sl=auto&tl={to}&q={texts}" - proxies = get_proxy(False) - r = httpx.get(url, proxies=proxies) - while r.status_code == 429: - logger.warning(f"HTTP 
{r.status_code}: {r.reason}: Google翻译请求超限,将等待{_google_trans_wait}秒后重试") + s = get_session(Url(url)) + r = await s.get(url) + # TODO: retry已经集成到client里了,这里考虑删除 + while r.status == 429: + logger.warning(f"HTTP {r.status}: {r.reason}: Google翻译请求超限,将等待{_google_trans_wait}秒后重试") time.sleep(_google_trans_wait) - r = httpx.get(url, proxies=proxies) - if r.status_code == 429: + r = await s.get(url) + if r.status == 429: _google_trans_wait += random.randint(60, 90) - if r.status_code == 200: - result = r.json() + if r.status == 200: + result = await r.json() else: - result = {'error_code': r.status_code, 'error_msg': r.reason} + result = {'error_code': r.status, 'error_msg': r.reason} time.sleep(4) # Google翻译的API有QPS限制,因此需要等待一段时间 return result -def claude_translate(texts, api_key, to="zh_CN"): +async def claude_translate(texts, api_key, to="zh_CN"): """使用Claude翻译文本(默认翻译为简体中文)""" api_url = "https://api.anthropic.com/v1/messages" headers = { @@ -198,17 +199,20 @@ "max_tokens": 1024, "messages": [{"role": "user", "content": texts}], } - r = httpx.post(api_url, headers=headers, json=data) - if r.status_code == 200: - result = r.json().get("content", [{}])[0].get("text", "").strip() + + s = get_session(Url(api_url)) + r = await s.post(api_url, headers=headers, json=data) + j = await r.json() + if r.status == 200: + result = j.get("content", [{}])[0].get("text", "").strip() else: result = { - "error_code": r.status_code, - "error_msg": r.json().get("error", {}).get("message", r.reason), + "error_code": r.status, + "error_msg": j.get("error", {}).get("message", r.reason), } return result -def openai_translate(texts, url: Url, api_key: str, model: str, to="zh_CN"): +async def openai_translate(texts, url: Url, api_key: str, model: str, to="zh_CN"): """使用 OpenAI 翻译文本(默认翻译为简体中文)""" api_url = str(url) headers = { @@ -230,18 +234,20 @@ "temperature": 0, "max_tokens": 1024, } - r = httpx.post(api_url, headers=headers, json=data) - if r.status_code == 200: - if 'error' in r.json(): + s = get_session(Url(api_url)) + r = await s.post(api_url, headers=headers, json=data) + if r.status == 200: + j = await r.json() + if 'error' in j: result = { - "error_code": r.status_code, - "error_msg": r.json().get("error", {}).get("message", ""), + "error_code": r.status, + "error_msg": j.get("error", {}).get("message", ""), } else: - result = r.json().get("choices", [{}])[0].get("message", {}).get("content", "").strip() + result = j.get("choices", [{}])[0].get("message", {}).get("content", "").strip() else: result = { - "error_code": r.status_code, + "error_code": r.status, "error_msg": r.reason, } return result diff --git a/poetry.lock b/poetry.lock index f9b1b8d77..5d679f751 100644 --- a/poetry.lock +++ b/poetry.lock @@ -16,6 +16,178 @@ type = "legacy" url = "https://pypi.tuna.tsinghua.edu.cn/simple" reference = "mirrors" +[[package]] +name = "aiohappyeyeballs" +version = "2.4.2" +description = "Happy Eyeballs for asyncio" +optional = false +python-versions = ">=3.8" +files = [ + {file = "aiohappyeyeballs-2.4.2-py3-none-any.whl", hash = "sha256:8522691d9a154ba1145b157d6d5c15e5c692527ce6a53c5e5f9876977f6dab2f"}, + {file = "aiohappyeyeballs-2.4.2.tar.gz", hash = "sha256:4ca893e6c5c1f5bf3888b04cb5a3bee24995398efef6e0b9f747b5e89d84fd74"}, +] + +[package.source] +type = "legacy" +url = "https://pypi.tuna.tsinghua.edu.cn/simple" +reference = "mirrors" + +[[package]] +name = "aiohttp" +version = "3.10.8"
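The lockfile entries being added here pin aiohttp and aiohttp-socks (with httpx and its anyio/httpcore/h11 stack removed further down), which is the stack the new get_session()/cookie_jar code throughout this patch is built on. A rough, self-contained sketch of that aiohttp usage pattern follows; fetch_text, the sample cookie and the httpbin URL are illustrative only and not part of the patch:

    import asyncio

    from aiohttp import ClientSession, TCPConnector
    from aiohttp_socks import ProxyConnector


    async def fetch_text(url: str, proxy_url: str | None = None) -> str:
        # Pick a connector: plain TCP, or a proxy connector built from a URL
        # such as 'socks5://127.0.0.1:1080' or 'http://127.0.0.1:1080'.
        connector = ProxyConnector.from_url(proxy_url) if proxy_url else TCPConnector()
        async with ClientSession(connector=connector) as session:
            # aiohttp has no assignable `.cookies`; cookies go through the cookie jar,
            # hence the cookie_jar.update_cookies(...) calls in the crawlers above.
            session.cookie_jar.update_cookies({'age': 'verified'})
            async with session.get(url) as resp:
                # httpx's status_code / .text become status / await text() here.
                resp.raise_for_status()
                return await resp.text()


    if __name__ == '__main__':
        print(asyncio.run(fetch_text('https://httpbin.org/get'))[:200])

Unlike httpx's AsyncClient, a ClientSession owns its connector by default and has to be closed explicitly, which is why javsp/network/client.py caches one session per host and tears them all down in clear_clients().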
+description = "Async http client/server framework (asyncio)" +optional = false +python-versions = ">=3.8" +files = [ + {file = "aiohttp-3.10.8-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a1ba7bc139592339ddeb62c06486d0fa0f4ca61216e14137a40d626c81faf10c"}, + {file = "aiohttp-3.10.8-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:85e4d7bd05d18e4b348441e7584c681eff646e3bf38f68b2626807f3add21aa2"}, + {file = "aiohttp-3.10.8-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:69de056022e7abf69cb9fec795515973cc3eeaff51e3ea8d72a77aa933a91c52"}, + {file = "aiohttp-3.10.8-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ee3587506898d4a404b33bd19689286ccf226c3d44d7a73670c8498cd688e42c"}, + {file = "aiohttp-3.10.8-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fe285a697c851734285369614443451462ce78aac2b77db23567507484b1dc6f"}, + {file = "aiohttp-3.10.8-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:10c7932337285a6bfa3a5fe1fd4da90b66ebfd9d0cbd1544402e1202eb9a8c3e"}, + {file = "aiohttp-3.10.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cd9716ef0224fe0d0336997eb242f40619f9f8c5c57e66b525a1ebf9f1d8cebe"}, + {file = "aiohttp-3.10.8-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ceacea31f8a55cdba02bc72c93eb2e1b77160e91f8abd605969c168502fd71eb"}, + {file = "aiohttp-3.10.8-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:9721554bfa9e15f6e462da304374c2f1baede3cb06008c36c47fa37ea32f1dc4"}, + {file = "aiohttp-3.10.8-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:22cdeb684d8552490dd2697a5138c4ecb46f844892df437aaf94f7eea99af879"}, + {file = "aiohttp-3.10.8-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:e56bb7e31c4bc79956b866163170bc89fd619e0581ce813330d4ea46921a4881"}, + {file = "aiohttp-3.10.8-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:3a95d2686bc4794d66bd8de654e41b5339fab542b2bca9238aa63ed5f4f2ce82"}, + {file = "aiohttp-3.10.8-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:d82404a0e7b10e0d7f022cf44031b78af8a4f99bd01561ac68f7c24772fed021"}, + {file = "aiohttp-3.10.8-cp310-cp310-win32.whl", hash = "sha256:4e10b04542d27e21538e670156e88766543692a0a883f243ba8fad9ddea82e53"}, + {file = "aiohttp-3.10.8-cp310-cp310-win_amd64.whl", hash = "sha256:680dbcff5adc7f696ccf8bf671d38366a1f620b5616a1d333d0cb33956065395"}, + {file = "aiohttp-3.10.8-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:33a68011a38020ed4ff41ae0dbf4a96a202562ecf2024bdd8f65385f1d07f6ef"}, + {file = "aiohttp-3.10.8-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6c7efa6616a95e3bd73b8a69691012d2ef1f95f9ea0189e42f338fae080c2fc6"}, + {file = "aiohttp-3.10.8-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ddb9b9764cfb4459acf01c02d2a59d3e5066b06a846a364fd1749aa168efa2be"}, + {file = "aiohttp-3.10.8-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3c7f270f4ca92760f98a42c45a58674fff488e23b144ec80b1cc6fa2effed377"}, + {file = "aiohttp-3.10.8-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6984dda9d79064361ab58d03f6c1e793ea845c6cfa89ffe1a7b9bb400dfd56bd"}, + {file = "aiohttp-3.10.8-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3f6d47e392c27206701565c8df4cac6ebed28fdf6dcaea5b1eea7a4631d8e6db"}, + {file = "aiohttp-3.10.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:a72f89aea712c619b2ca32c6f4335c77125ede27530ad9705f4f349357833695"}, + {file = "aiohttp-3.10.8-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c36074b26f3263879ba8e4dbd33db2b79874a3392f403a70b772701363148b9f"}, + {file = "aiohttp-3.10.8-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:e32148b4a745e70a255a1d44b5664de1f2e24fcefb98a75b60c83b9e260ddb5b"}, + {file = "aiohttp-3.10.8-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:5aa1a073514cf59c81ad49a4ed9b5d72b2433638cd53160fd2f3a9cfa94718db"}, + {file = "aiohttp-3.10.8-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:d3a79200a9d5e621c4623081ddb25380b713c8cf5233cd11c1aabad990bb9381"}, + {file = "aiohttp-3.10.8-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:e45fdfcb2d5bcad83373e4808825b7512953146d147488114575780640665027"}, + {file = "aiohttp-3.10.8-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f78e2a78432c537ae876a93013b7bc0027ba5b93ad7b3463624c4b6906489332"}, + {file = "aiohttp-3.10.8-cp311-cp311-win32.whl", hash = "sha256:f8179855a4e4f3b931cb1764ec87673d3fbdcca2af496c8d30567d7b034a13db"}, + {file = "aiohttp-3.10.8-cp311-cp311-win_amd64.whl", hash = "sha256:ef9b484604af05ca745b6108ca1aaa22ae1919037ae4f93aaf9a37ba42e0b835"}, + {file = "aiohttp-3.10.8-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:ab2d6523575fc98896c80f49ac99e849c0b0e69cc80bf864eed6af2ae728a52b"}, + {file = "aiohttp-3.10.8-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:f5d5d5401744dda50b943d8764508d0e60cc2d3305ac1e6420935861a9d544bc"}, + {file = "aiohttp-3.10.8-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:de23085cf90911600ace512e909114385026b16324fa203cc74c81f21fd3276a"}, + {file = "aiohttp-3.10.8-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4618f0d2bf523043866a9ff8458900d8eb0a6d4018f251dae98e5f1fb699f3a8"}, + {file = "aiohttp-3.10.8-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:21c1925541ca84f7b5e0df361c0a813a7d6a56d3b0030ebd4b220b8d232015f9"}, + {file = "aiohttp-3.10.8-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:497a7d20caea8855c5429db3cdb829385467217d7feb86952a6107e033e031b9"}, + {file = "aiohttp-3.10.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c887019dbcb4af58a091a45ccf376fffe800b5531b45c1efccda4bedf87747ea"}, + {file = "aiohttp-3.10.8-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:40d2d719c3c36a7a65ed26400e2b45b2d9ed7edf498f4df38b2ae130f25a0d01"}, + {file = "aiohttp-3.10.8-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:57359785f27394a8bcab0da6dcd46706d087dfebf59a8d0ad2e64a4bc2f6f94f"}, + {file = "aiohttp-3.10.8-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:a961ee6f2cdd1a2be4735333ab284691180d40bad48f97bb598841bfcbfb94ec"}, + {file = "aiohttp-3.10.8-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:fe3d79d6af839ffa46fdc5d2cf34295390894471e9875050eafa584cb781508d"}, + {file = "aiohttp-3.10.8-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:9a281cba03bdaa341c70b7551b2256a88d45eead149f48b75a96d41128c240b3"}, + {file = "aiohttp-3.10.8-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c6769d71bfb1ed60321363a9bc05e94dcf05e38295ef41d46ac08919e5b00d19"}, + {file = "aiohttp-3.10.8-cp312-cp312-win32.whl", hash = "sha256:a3081246bab4d419697ee45e555cef5cd1def7ac193dff6f50be761d2e44f194"}, + {file = "aiohttp-3.10.8-cp312-cp312-win_amd64.whl", hash = 
"sha256:ab1546fc8e00676febc81c548a876c7bde32f881b8334b77f84719ab2c7d28dc"}, + {file = "aiohttp-3.10.8-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:b1a012677b8e0a39e181e218de47d6741c5922202e3b0b65e412e2ce47c39337"}, + {file = "aiohttp-3.10.8-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:2df786c96c57cd6b87156ba4c5f166af7b88f3fc05f9d592252fdc83d8615a3c"}, + {file = "aiohttp-3.10.8-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:8885ca09d3a9317219c0831276bfe26984b17b2c37b7bf70dd478d17092a4772"}, + {file = "aiohttp-3.10.8-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4dbf252ac19860e0ab56cd480d2805498f47c5a2d04f5995d8d8a6effd04b48c"}, + {file = "aiohttp-3.10.8-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3b2036479b6b94afaaca7d07b8a68dc0e67b0caf5f6293bb6a5a1825f5923000"}, + {file = "aiohttp-3.10.8-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:365783e1b7c40b59ed4ce2b5a7491bae48f41cd2c30d52647a5b1ee8604c68ad"}, + {file = "aiohttp-3.10.8-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:270e653b5a4b557476a1ed40e6b6ce82f331aab669620d7c95c658ef976c9c5e"}, + {file = "aiohttp-3.10.8-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8960fabc20bfe4fafb941067cda8e23c8c17c98c121aa31c7bf0cdab11b07842"}, + {file = "aiohttp-3.10.8-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f21e8f2abed9a44afc3d15bba22e0dfc71e5fa859bea916e42354c16102b036f"}, + {file = "aiohttp-3.10.8-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:fecd55e7418fabd297fd836e65cbd6371aa4035a264998a091bbf13f94d9c44d"}, + {file = "aiohttp-3.10.8-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:badb51d851358cd7535b647bb67af4854b64f3c85f0d089c737f75504d5910ec"}, + {file = "aiohttp-3.10.8-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:e860985f30f3a015979e63e7ba1a391526cdac1b22b7b332579df7867848e255"}, + {file = "aiohttp-3.10.8-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:71462f8eeca477cbc0c9700a9464e3f75f59068aed5e9d4a521a103692da72dc"}, + {file = "aiohttp-3.10.8-cp313-cp313-win32.whl", hash = "sha256:177126e971782769b34933e94fddd1089cef0fe6b82fee8a885e539f5b0f0c6a"}, + {file = "aiohttp-3.10.8-cp313-cp313-win_amd64.whl", hash = "sha256:98a4eb60e27033dee9593814ca320ee8c199489fbc6b2699d0f710584db7feb7"}, + {file = "aiohttp-3.10.8-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:ffef3d763e4c8fc97e740da5b4d0f080b78630a3914f4e772a122bbfa608c1db"}, + {file = "aiohttp-3.10.8-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:597128cb7bc5f068181b49a732961f46cb89f85686206289d6ccb5e27cb5fbe2"}, + {file = "aiohttp-3.10.8-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f23a6c1d09de5de89a33c9e9b229106cb70dcfdd55e81a3a3580eaadaa32bc92"}, + {file = "aiohttp-3.10.8-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da57af0c54a302b7c655fa1ccd5b1817a53739afa39924ef1816e7b7c8a07ccb"}, + {file = "aiohttp-3.10.8-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1e7a6af57091056a79a35104d6ec29d98ec7f1fb7270ad9c6fff871b678d1ff8"}, + {file = "aiohttp-3.10.8-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:32710d6b3b6c09c60c794d84ca887a3a2890131c0b02b3cefdcc6709a2260a7c"}, + {file = "aiohttp-3.10.8-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b91f4f62ad39a8a42d511d66269b46cb2fb7dea9564c21ab6c56a642d28bff5"}, + {file = 
"aiohttp-3.10.8-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:471a8c47344b9cc309558b3fcc469bd2c12b49322b4b31eb386c4a2b2d44e44a"}, + {file = "aiohttp-3.10.8-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:fc0e7f91705445d79beafba9bb3057dd50830e40fe5417017a76a214af54e122"}, + {file = "aiohttp-3.10.8-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:85431c9131a9a0f65260dc7a65c800ca5eae78c4c9931618f18c8e0933a0e0c1"}, + {file = "aiohttp-3.10.8-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:b91557ee0893da52794b25660d4f57bb519bcad8b7df301acd3898f7197c5d81"}, + {file = "aiohttp-3.10.8-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:4954e6b06dd0be97e1a5751fc606be1f9edbdc553c5d9b57d72406a8fbd17f9d"}, + {file = "aiohttp-3.10.8-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:a087c84b4992160ffef7afd98ef24177c8bd4ad61c53607145a8377457385100"}, + {file = "aiohttp-3.10.8-cp38-cp38-win32.whl", hash = "sha256:e1f0f7b27171b2956a27bd8f899751d0866ddabdd05cbddf3520f945130a908c"}, + {file = "aiohttp-3.10.8-cp38-cp38-win_amd64.whl", hash = "sha256:c4916070e12ae140110aa598031876c1bf8676a36a750716ea0aa5bd694aa2e7"}, + {file = "aiohttp-3.10.8-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5284997e3d88d0dfb874c43e51ae8f4a6f4ca5b90dcf22995035187253d430db"}, + {file = "aiohttp-3.10.8-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9443d9ebc5167ce1fbb552faf2d666fb22ef5716a8750be67efd140a7733738c"}, + {file = "aiohttp-3.10.8-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b667e2a03407d79a76c618dc30cedebd48f082d85880d0c9c4ec2faa3e10f43e"}, + {file = "aiohttp-3.10.8-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:98fae99d5c2146f254b7806001498e6f9ffb0e330de55a35e72feb7cb2fa399b"}, + {file = "aiohttp-3.10.8-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8296edd99d0dd9d0eb8b9e25b3b3506eef55c1854e9cc230f0b3f885f680410b"}, + {file = "aiohttp-3.10.8-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1ce46dfb49cfbf9e92818be4b761d4042230b1f0e05ffec0aad15b3eb162b905"}, + {file = "aiohttp-3.10.8-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7c38cfd355fd86c39b2d54651bd6ed7d63d4fe3b5553f364bae3306e2445f847"}, + {file = "aiohttp-3.10.8-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:713dff3f87ceec3bde4f3f484861464e722cf7533f9fa6b824ec82bb5a9010a7"}, + {file = "aiohttp-3.10.8-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:21a72f4a9c69a8567a0aca12042f12bba25d3139fd5dd8eeb9931f4d9e8599cd"}, + {file = "aiohttp-3.10.8-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:6d1ad868624f6cea77341ef2877ad4e71f7116834a6cd7ec36ec5c32f94ee6ae"}, + {file = "aiohttp-3.10.8-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:a78ba86d5a08207d1d1ad10b97aed6ea48b374b3f6831d02d0b06545ac0f181e"}, + {file = "aiohttp-3.10.8-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:aff048793d05e1ce05b62e49dccf81fe52719a13f4861530706619506224992b"}, + {file = "aiohttp-3.10.8-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:d088ca05381fd409793571d8e34eca06daf41c8c50a05aeed358d2d340c7af81"}, + {file = "aiohttp-3.10.8-cp39-cp39-win32.whl", hash = "sha256:ee97c4e54f457c366e1f76fbbf3e8effee9de57dae671084a161c00f481106ce"}, + {file = "aiohttp-3.10.8-cp39-cp39-win_amd64.whl", hash = "sha256:d95ae4420669c871667aad92ba8cce6251d61d79c1a38504621094143f94a8b4"}, + {file = "aiohttp-3.10.8.tar.gz", hash = 
"sha256:21f8225f7dc187018e8433c9326be01477fb2810721e048b33ac49091b19fb4a"}, +] + +[package.dependencies] +aiohappyeyeballs = ">=2.3.0" +aiosignal = ">=1.1.2" +async-timeout = {version = ">=4.0,<5.0", markers = "python_version < \"3.11\""} +attrs = ">=17.3.0" +frozenlist = ">=1.1.1" +multidict = ">=4.5,<7.0" +yarl = ">=1.12.0,<2.0" + +[package.extras] +speedups = ["Brotli", "aiodns (>=3.2.0)", "brotlicffi"] + +[package.source] +type = "legacy" +url = "https://pypi.tuna.tsinghua.edu.cn/simple" +reference = "mirrors" + +[[package]] +name = "aiohttp-socks" +version = "0.9.0" +description = "Proxy connector for aiohttp" +optional = false +python-versions = "*" +files = [ + {file = "aiohttp_socks-0.9.0-py3-none-any.whl", hash = "sha256:90a8211fd5b904ccbd010900105f1fd2dab20ae8a07df508df399036ad8d3d88"}, + {file = "aiohttp_socks-0.9.0.tar.gz", hash = "sha256:22159a1af026b229cfe5ea007e065bb3fe56385a951a82623a6f4588a6758003"}, +] + +[package.dependencies] +aiohttp = ">=3.10.0" +python-socks = {version = ">=2.4.3,<3.0.0", extras = ["asyncio"]} + +[package.source] +type = "legacy" +url = "https://pypi.tuna.tsinghua.edu.cn/simple" +reference = "mirrors" + +[[package]] +name = "aiosignal" +version = "1.3.1" +description = "aiosignal: a list of registered asynchronous callbacks" +optional = false +python-versions = ">=3.7" +files = [ + {file = "aiosignal-1.3.1-py3-none-any.whl", hash = "sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17"}, + {file = "aiosignal-1.3.1.tar.gz", hash = "sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc"}, +] + +[package.dependencies] +frozenlist = ">=1.1.0" + +[package.source] +type = "legacy" +url = "https://pypi.tuna.tsinghua.edu.cn/simple" +reference = "mirrors" + [[package]] name = "annotated-types" version = "0.7.0" @@ -33,43 +205,40 @@ url = "https://pypi.tuna.tsinghua.edu.cn/simple" reference = "mirrors" [[package]] -name = "anyio" -version = "4.6.0" -description = "High level compatibility layer for multiple asynchronous event loop implementations" +name = "async-timeout" +version = "4.0.3" +description = "Timeout context manager for asyncio programs" optional = false -python-versions = ">=3.9" +python-versions = ">=3.7" files = [ - {file = "anyio-4.6.0-py3-none-any.whl", hash = "sha256:c7d2e9d63e31599eeb636c8c5c03a7e108d73b345f064f1c19fdc87b79036a9a"}, - {file = "anyio-4.6.0.tar.gz", hash = "sha256:137b4559cbb034c477165047febb6ff83f390fc3b20bf181c1fc0a728cb8beeb"}, + {file = "async-timeout-4.0.3.tar.gz", hash = "sha256:4640d96be84d82d02ed59ea2b7105a0f7b33abe8703703cd0ab0bf87c427522f"}, + {file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"}, ] -[package.dependencies] -exceptiongroup = {version = ">=1.0.2", markers = "python_version < \"3.11\""} -idna = ">=2.8" -sniffio = ">=1.1" -typing-extensions = {version = ">=4.1", markers = "python_version < \"3.11\""} - -[package.extras] -doc = ["Sphinx (>=7.4,<8.0)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"] -test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.21.0b1)"] -trio = ["trio (>=0.26.1)"] - [package.source] type = "legacy" url = "https://pypi.tuna.tsinghua.edu.cn/simple" reference = "mirrors" [[package]] -name = "certifi" -version = "2024.8.30" -description = "Python package for providing Mozilla's CA Bundle." 
+name = "attrs" +version = "24.2.0" +description = "Classes Without Boilerplate" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" files = [ - {file = "certifi-2024.8.30-py3-none-any.whl", hash = "sha256:922820b53db7a7257ffbda3f597266d435245903d80737e34f8a45ff3e3230d8"}, - {file = "certifi-2024.8.30.tar.gz", hash = "sha256:bec941d2aa8195e248a60b31ff9f0558284cf01a52591ceda73ea9afffd69fd9"}, + {file = "attrs-24.2.0-py3-none-any.whl", hash = "sha256:81921eb96de3191c8258c199618104dd27ac608d9366f5e35d011eae1867ede2"}, + {file = "attrs-24.2.0.tar.gz", hash = "sha256:5cfb1b9148b5b086569baec03f20d7b6bf3bcacc9a42bebf87ffaaca362f6346"}, ] +[package.extras] +benchmark = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-codspeed", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +cov = ["cloudpickle", "coverage[toml] (>=5.3)", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +dev = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +docs = ["cogapp", "furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier (<24.7)"] +tests = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"] + [package.source] type = "legacy" url = "https://pypi.tuna.tsinghua.edu.cn/simple" @@ -489,73 +658,91 @@ url = "https://pypi.tuna.tsinghua.edu.cn/simple" reference = "mirrors" [[package]] -name = "h11" -version = "0.14.0" -description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" -optional = false -python-versions = ">=3.7" -files = [ - {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"}, - {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, -] - -[package.source] -type = "legacy" -url = "https://pypi.tuna.tsinghua.edu.cn/simple" -reference = "mirrors" - -[[package]] -name = "httpcore" -version = "1.0.5" -description = "A minimal low-level HTTP client." -optional = false -python-versions = ">=3.8" -files = [ - {file = "httpcore-1.0.5-py3-none-any.whl", hash = "sha256:421f18bac248b25d310f3cacd198d55b8e6125c107797b609ff9b7a6ba7991b5"}, - {file = "httpcore-1.0.5.tar.gz", hash = "sha256:34a38e2f9291467ee3b44e89dd52615370e152954ba21721378a87b2960f7a61"}, -] - -[package.dependencies] -certifi = "*" -h11 = ">=0.13,<0.15" - -[package.extras] -asyncio = ["anyio (>=4.0,<5.0)"] -http2 = ["h2 (>=3,<5)"] -socks = ["socksio (==1.*)"] -trio = ["trio (>=0.22.0,<0.26.0)"] - -[package.source] -type = "legacy" -url = "https://pypi.tuna.tsinghua.edu.cn/simple" -reference = "mirrors" - -[[package]] -name = "httpx" -version = "0.27.2" -description = "The next generation HTTP client." 
+name = "frozenlist" +version = "1.4.1" +description = "A list-like structure which implements collections.abc.MutableSequence" optional = false python-versions = ">=3.8" files = [ - {file = "httpx-0.27.2-py3-none-any.whl", hash = "sha256:7bb2708e112d8fdd7829cd4243970f0c223274051cb35ee80c03301ee29a3df0"}, - {file = "httpx-0.27.2.tar.gz", hash = "sha256:f7c2be1d2f3c3c3160d441802406b206c2b76f5947b11115e6df10c6c65e66c2"}, + {file = "frozenlist-1.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:f9aa1878d1083b276b0196f2dfbe00c9b7e752475ed3b682025ff20c1c1f51ac"}, + {file = "frozenlist-1.4.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:29acab3f66f0f24674b7dc4736477bcd4bc3ad4b896f5f45379a67bce8b96868"}, + {file = "frozenlist-1.4.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:74fb4bee6880b529a0c6560885fce4dc95936920f9f20f53d99a213f7bf66776"}, + {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:590344787a90ae57d62511dd7c736ed56b428f04cd8c161fcc5e7232c130c69a"}, + {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:068b63f23b17df8569b7fdca5517edef76171cf3897eb68beb01341131fbd2ad"}, + {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5c849d495bf5154cd8da18a9eb15db127d4dba2968d88831aff6f0331ea9bd4c"}, + {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9750cc7fe1ae3b1611bb8cfc3f9ec11d532244235d75901fb6b8e42ce9229dfe"}, + {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9b2de4cf0cdd5bd2dee4c4f63a653c61d2408055ab77b151c1957f221cabf2a"}, + {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0633c8d5337cb5c77acbccc6357ac49a1770b8c487e5b3505c57b949b4b82e98"}, + {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:27657df69e8801be6c3638054e202a135c7f299267f1a55ed3a598934f6c0d75"}, + {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:f9a3ea26252bd92f570600098783d1371354d89d5f6b7dfd87359d669f2109b5"}, + {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:4f57dab5fe3407b6c0c1cc907ac98e8a189f9e418f3b6e54d65a718aaafe3950"}, + {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:e02a0e11cf6597299b9f3bbd3f93d79217cb90cfd1411aec33848b13f5c656cc"}, + {file = "frozenlist-1.4.1-cp310-cp310-win32.whl", hash = "sha256:a828c57f00f729620a442881cc60e57cfcec6842ba38e1b19fd3e47ac0ff8dc1"}, + {file = "frozenlist-1.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:f56e2333dda1fe0f909e7cc59f021eba0d2307bc6f012a1ccf2beca6ba362439"}, + {file = "frozenlist-1.4.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:a0cb6f11204443f27a1628b0e460f37fb30f624be6051d490fa7d7e26d4af3d0"}, + {file = "frozenlist-1.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b46c8ae3a8f1f41a0d2ef350c0b6e65822d80772fe46b653ab6b6274f61d4a49"}, + {file = "frozenlist-1.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:fde5bd59ab5357e3853313127f4d3565fc7dad314a74d7b5d43c22c6a5ed2ced"}, + {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:722e1124aec435320ae01ee3ac7bec11a5d47f25d0ed6328f2273d287bc3abb0"}, + {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:2471c201b70d58a0f0c1f91261542a03d9a5e088ed3dc6c160d614c01649c106"}, + {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c757a9dd70d72b076d6f68efdbb9bc943665ae954dad2801b874c8c69e185068"}, + {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f146e0911cb2f1da549fc58fc7bcd2b836a44b79ef871980d605ec392ff6b0d2"}, + {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4f9c515e7914626b2a2e1e311794b4c35720a0be87af52b79ff8e1429fc25f19"}, + {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:c302220494f5c1ebeb0912ea782bcd5e2f8308037b3c7553fad0e48ebad6ad82"}, + {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:442acde1e068288a4ba7acfe05f5f343e19fac87bfc96d89eb886b0363e977ec"}, + {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:1b280e6507ea8a4fa0c0a7150b4e526a8d113989e28eaaef946cc77ffd7efc0a"}, + {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:fe1a06da377e3a1062ae5fe0926e12b84eceb8a50b350ddca72dc85015873f74"}, + {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:db9e724bebd621d9beca794f2a4ff1d26eed5965b004a97f1f1685a173b869c2"}, + {file = "frozenlist-1.4.1-cp311-cp311-win32.whl", hash = "sha256:e774d53b1a477a67838a904131c4b0eef6b3d8a651f8b138b04f748fccfefe17"}, + {file = "frozenlist-1.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:fb3c2db03683b5767dedb5769b8a40ebb47d6f7f45b1b3e3b4b51ec8ad9d9825"}, + {file = "frozenlist-1.4.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:1979bc0aeb89b33b588c51c54ab0161791149f2461ea7c7c946d95d5f93b56ae"}, + {file = "frozenlist-1.4.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:cc7b01b3754ea68a62bd77ce6020afaffb44a590c2289089289363472d13aedb"}, + {file = "frozenlist-1.4.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c9c92be9fd329ac801cc420e08452b70e7aeab94ea4233a4804f0915c14eba9b"}, + {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c3894db91f5a489fc8fa6a9991820f368f0b3cbdb9cd8849547ccfab3392d86"}, + {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ba60bb19387e13597fb059f32cd4d59445d7b18b69a745b8f8e5db0346f33480"}, + {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8aefbba5f69d42246543407ed2461db31006b0f76c4e32dfd6f42215a2c41d09"}, + {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:780d3a35680ced9ce682fbcf4cb9c2bad3136eeff760ab33707b71db84664e3a"}, + {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9acbb16f06fe7f52f441bb6f413ebae6c37baa6ef9edd49cdd567216da8600cd"}, + {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:23b701e65c7b36e4bf15546a89279bd4d8675faabc287d06bbcfac7d3c33e1e6"}, + {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:3e0153a805a98f5ada7e09826255ba99fb4f7524bb81bf6b47fb702666484ae1"}, + {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:dd9b1baec094d91bf36ec729445f7769d0d0cf6b64d04d86e45baf89e2b9059b"}, + {file = 
"frozenlist-1.4.1-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:1a4471094e146b6790f61b98616ab8e44f72661879cc63fa1049d13ef711e71e"}, + {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:5667ed53d68d91920defdf4035d1cdaa3c3121dc0b113255124bcfada1cfa1b8"}, + {file = "frozenlist-1.4.1-cp312-cp312-win32.whl", hash = "sha256:beee944ae828747fd7cb216a70f120767fc9f4f00bacae8543c14a6831673f89"}, + {file = "frozenlist-1.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:64536573d0a2cb6e625cf309984e2d873979709f2cf22839bf2d61790b448ad5"}, + {file = "frozenlist-1.4.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:20b51fa3f588ff2fe658663db52a41a4f7aa6c04f6201449c6c7c476bd255c0d"}, + {file = "frozenlist-1.4.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:410478a0c562d1a5bcc2f7ea448359fcb050ed48b3c6f6f4f18c313a9bdb1826"}, + {file = "frozenlist-1.4.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:c6321c9efe29975232da3bd0af0ad216800a47e93d763ce64f291917a381b8eb"}, + {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:48f6a4533887e189dae092f1cf981f2e3885175f7a0f33c91fb5b7b682b6bab6"}, + {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6eb73fa5426ea69ee0e012fb59cdc76a15b1283d6e32e4f8dc4482ec67d1194d"}, + {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fbeb989b5cc29e8daf7f976b421c220f1b8c731cbf22b9130d8815418ea45887"}, + {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:32453c1de775c889eb4e22f1197fe3bdfe457d16476ea407472b9442e6295f7a"}, + {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:693945278a31f2086d9bf3df0fe8254bbeaef1fe71e1351c3bd730aa7d31c41b"}, + {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:1d0ce09d36d53bbbe566fe296965b23b961764c0bcf3ce2fa45f463745c04701"}, + {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:3a670dc61eb0d0eb7080890c13de3066790f9049b47b0de04007090807c776b0"}, + {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:dca69045298ce5c11fd539682cff879cc1e664c245d1c64da929813e54241d11"}, + {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:a06339f38e9ed3a64e4c4e43aec7f59084033647f908e4259d279a52d3757d09"}, + {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b7f2f9f912dca3934c1baec2e4585a674ef16fe00218d833856408c48d5beee7"}, + {file = "frozenlist-1.4.1-cp38-cp38-win32.whl", hash = "sha256:e7004be74cbb7d9f34553a5ce5fb08be14fb33bc86f332fb71cbe5216362a497"}, + {file = "frozenlist-1.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:5a7d70357e7cee13f470c7883a063aae5fe209a493c57d86eb7f5a6f910fae09"}, + {file = "frozenlist-1.4.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:bfa4a17e17ce9abf47a74ae02f32d014c5e9404b6d9ac7f729e01562bbee601e"}, + {file = "frozenlist-1.4.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b7e3ed87d4138356775346e6845cccbe66cd9e207f3cd11d2f0b9fd13681359d"}, + {file = "frozenlist-1.4.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c99169d4ff810155ca50b4da3b075cbde79752443117d89429595c2e8e37fed8"}, + {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:edb678da49d9f72c9f6c609fbe41a5dfb9a9282f9e6a2253d5a91e0fc382d7c0"}, + 
{file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6db4667b187a6742b33afbbaf05a7bc551ffcf1ced0000a571aedbb4aa42fc7b"}, + {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:55fdc093b5a3cb41d420884cdaf37a1e74c3c37a31f46e66286d9145d2063bd0"}, + {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:82e8211d69a4f4bc360ea22cd6555f8e61a1bd211d1d5d39d3d228b48c83a897"}, + {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89aa2c2eeb20957be2d950b85974b30a01a762f3308cd02bb15e1ad632e22dc7"}, + {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9d3e0c25a2350080e9319724dede4f31f43a6c9779be48021a7f4ebde8b2d742"}, + {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:7268252af60904bf52c26173cbadc3a071cece75f873705419c8681f24d3edea"}, + {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:0c250a29735d4f15321007fb02865f0e6b6a41a6b88f1f523ca1596ab5f50bd5"}, + {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:96ec70beabbd3b10e8bfe52616a13561e58fe84c0101dd031dc78f250d5128b9"}, + {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:23b2d7679b73fe0e5a4560b672a39f98dfc6f60df63823b0a9970525325b95f6"}, + {file = "frozenlist-1.4.1-cp39-cp39-win32.whl", hash = "sha256:a7496bfe1da7fb1a4e1cc23bb67c58fab69311cc7d32b5a99c2007b4b2a0e932"}, + {file = "frozenlist-1.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:e6a20a581f9ce92d389a8c7d7c3dd47c81fd5d6e655c8dddf341e14aa48659d0"}, + {file = "frozenlist-1.4.1-py3-none-any.whl", hash = "sha256:04ced3e6a46b4cfffe20f9ae482818e34eba9b5fb0ce4056e4cc9b6e212d09b7"}, + {file = "frozenlist-1.4.1.tar.gz", hash = "sha256:c037a86e8513059a2613aaba4d817bb90b9d9b6b69aace3ce9c877e8c8ed402b"}, ] -[package.dependencies] -anyio = "*" -certifi = "*" -httpcore = "==1.*" -idna = "*" -sniffio = "*" -socksio = {version = "==1.*", optional = true, markers = "extra == \"socks\""} - -[package.extras] -brotli = ["brotli", "brotlicffi"] -cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"] -http2 = ["h2 (>=3,<5)"] -socks = ["socksio (==1.*)"] -zstd = ["zstandard (>=0.18.0)"] - [package.source] type = "legacy" url = "https://pypi.tuna.tsinghua.edu.cn/simple" @@ -896,6 +1083,115 @@ type = "legacy" url = "https://pypi.tuna.tsinghua.edu.cn/simple" reference = "mirrors" +[[package]] +name = "multidict" +version = "6.1.0" +description = "multidict implementation" +optional = false +python-versions = ">=3.8" +files = [ + {file = "multidict-6.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3380252550e372e8511d49481bd836264c009adb826b23fefcc5dd3c69692f60"}, + {file = "multidict-6.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:99f826cbf970077383d7de805c0681799491cb939c25450b9b5b3ced03ca99f1"}, + {file = "multidict-6.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a114d03b938376557927ab23f1e950827c3b893ccb94b62fd95d430fd0e5cf53"}, + {file = "multidict-6.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b1c416351ee6271b2f49b56ad7f308072f6f44b37118d69c2cad94f3fa8a40d5"}, + {file = "multidict-6.1.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6b5d83030255983181005e6cfbac1617ce9746b219bc2aad52201ad121226581"}, + {file = 
"multidict-6.1.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3e97b5e938051226dc025ec80980c285b053ffb1e25a3db2a3aa3bc046bf7f56"}, + {file = "multidict-6.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d618649d4e70ac6efcbba75be98b26ef5078faad23592f9b51ca492953012429"}, + {file = "multidict-6.1.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:10524ebd769727ac77ef2278390fb0068d83f3acb7773792a5080f2b0abf7748"}, + {file = "multidict-6.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:ff3827aef427c89a25cc96ded1759271a93603aba9fb977a6d264648ebf989db"}, + {file = "multidict-6.1.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:06809f4f0f7ab7ea2cabf9caca7d79c22c0758b58a71f9d32943ae13c7ace056"}, + {file = "multidict-6.1.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:f179dee3b863ab1c59580ff60f9d99f632f34ccb38bf67a33ec6b3ecadd0fd76"}, + {file = "multidict-6.1.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:aaed8b0562be4a0876ee3b6946f6869b7bcdb571a5d1496683505944e268b160"}, + {file = "multidict-6.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3c8b88a2ccf5493b6c8da9076fb151ba106960a2df90c2633f342f120751a9e7"}, + {file = "multidict-6.1.0-cp310-cp310-win32.whl", hash = "sha256:4a9cb68166a34117d6646c0023c7b759bf197bee5ad4272f420a0141d7eb03a0"}, + {file = "multidict-6.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:20b9b5fbe0b88d0bdef2012ef7dee867f874b72528cf1d08f1d59b0e3850129d"}, + {file = "multidict-6.1.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:3efe2c2cb5763f2f1b275ad2bf7a287d3f7ebbef35648a9726e3b69284a4f3d6"}, + {file = "multidict-6.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c7053d3b0353a8b9de430a4f4b4268ac9a4fb3481af37dfe49825bf45ca24156"}, + {file = "multidict-6.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:27e5fc84ccef8dfaabb09d82b7d179c7cf1a3fbc8a966f8274fcb4ab2eb4cadb"}, + {file = "multidict-6.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0e2b90b43e696f25c62656389d32236e049568b39320e2735d51f08fd362761b"}, + {file = "multidict-6.1.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d83a047959d38a7ff552ff94be767b7fd79b831ad1cd9920662db05fec24fe72"}, + {file = "multidict-6.1.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d1a9dd711d0877a1ece3d2e4fea11a8e75741ca21954c919406b44e7cf971304"}, + {file = "multidict-6.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec2abea24d98246b94913b76a125e855eb5c434f7c46546046372fe60f666351"}, + {file = "multidict-6.1.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4867cafcbc6585e4b678876c489b9273b13e9fff9f6d6d66add5e15d11d926cb"}, + {file = "multidict-6.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:5b48204e8d955c47c55b72779802b219a39acc3ee3d0116d5080c388970b76e3"}, + {file = "multidict-6.1.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:d8fff389528cad1618fb4b26b95550327495462cd745d879a8c7c2115248e399"}, + {file = "multidict-6.1.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:a7a9541cd308eed5e30318430a9c74d2132e9a8cb46b901326272d780bf2d423"}, + {file = "multidict-6.1.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:da1758c76f50c39a2efd5e9859ce7d776317eb1dd34317c8152ac9251fc574a3"}, + {file = "multidict-6.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = 
"sha256:c943a53e9186688b45b323602298ab727d8865d8c9ee0b17f8d62d14b56f0753"}, + {file = "multidict-6.1.0-cp311-cp311-win32.whl", hash = "sha256:90f8717cb649eea3504091e640a1b8568faad18bd4b9fcd692853a04475a4b80"}, + {file = "multidict-6.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:82176036e65644a6cc5bd619f65f6f19781e8ec2e5330f51aa9ada7504cc1926"}, + {file = "multidict-6.1.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:b04772ed465fa3cc947db808fa306d79b43e896beb677a56fb2347ca1a49c1fa"}, + {file = "multidict-6.1.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:6180c0ae073bddeb5a97a38c03f30c233e0a4d39cd86166251617d1bbd0af436"}, + {file = "multidict-6.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:071120490b47aa997cca00666923a83f02c7fbb44f71cf7f136df753f7fa8761"}, + {file = "multidict-6.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50b3a2710631848991d0bf7de077502e8994c804bb805aeb2925a981de58ec2e"}, + {file = "multidict-6.1.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b58c621844d55e71c1b7f7c498ce5aa6985d743a1a59034c57a905b3f153c1ef"}, + {file = "multidict-6.1.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:55b6d90641869892caa9ca42ff913f7ff1c5ece06474fbd32fb2cf6834726c95"}, + {file = "multidict-6.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b820514bfc0b98a30e3d85462084779900347e4d49267f747ff54060cc33925"}, + {file = "multidict-6.1.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:10a9b09aba0c5b48c53761b7c720aaaf7cf236d5fe394cd399c7ba662d5f9966"}, + {file = "multidict-6.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1e16bf3e5fc9f44632affb159d30a437bfe286ce9e02754759be5536b169b305"}, + {file = "multidict-6.1.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:76f364861c3bfc98cbbcbd402d83454ed9e01a5224bb3a28bf70002a230f73e2"}, + {file = "multidict-6.1.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:820c661588bd01a0aa62a1283f20d2be4281b086f80dad9e955e690c75fb54a2"}, + {file = "multidict-6.1.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:0e5f362e895bc5b9e67fe6e4ded2492d8124bdf817827f33c5b46c2fe3ffaca6"}, + {file = "multidict-6.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3ec660d19bbc671e3a6443325f07263be452c453ac9e512f5eb935e7d4ac28b3"}, + {file = "multidict-6.1.0-cp312-cp312-win32.whl", hash = "sha256:58130ecf8f7b8112cdb841486404f1282b9c86ccb30d3519faf301b2e5659133"}, + {file = "multidict-6.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:188215fc0aafb8e03341995e7c4797860181562380f81ed0a87ff455b70bf1f1"}, + {file = "multidict-6.1.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:d569388c381b24671589335a3be6e1d45546c2988c2ebe30fdcada8457a31008"}, + {file = "multidict-6.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:052e10d2d37810b99cc170b785945421141bf7bb7d2f8799d431e7db229c385f"}, + {file = "multidict-6.1.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f90c822a402cb865e396a504f9fc8173ef34212a342d92e362ca498cad308e28"}, + {file = "multidict-6.1.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b225d95519a5bf73860323e633a664b0d85ad3d5bede6d30d95b35d4dfe8805b"}, + {file = "multidict-6.1.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:23bfd518810af7de1116313ebd9092cb9aa629beb12f6ed631ad53356ed6b86c"}, + {file = 
"multidict-6.1.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5c09fcfdccdd0b57867577b719c69e347a436b86cd83747f179dbf0cc0d4c1f3"}, + {file = "multidict-6.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bf6bea52ec97e95560af5ae576bdac3aa3aae0b6758c6efa115236d9e07dae44"}, + {file = "multidict-6.1.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57feec87371dbb3520da6192213c7d6fc892d5589a93db548331954de8248fd2"}, + {file = "multidict-6.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:0c3f390dc53279cbc8ba976e5f8035eab997829066756d811616b652b00a23a3"}, + {file = "multidict-6.1.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:59bfeae4b25ec05b34f1956eaa1cb38032282cd4dfabc5056d0a1ec4d696d3aa"}, + {file = "multidict-6.1.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:b2f59caeaf7632cc633b5cf6fc449372b83bbdf0da4ae04d5be36118e46cc0aa"}, + {file = "multidict-6.1.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:37bb93b2178e02b7b618893990941900fd25b6b9ac0fa49931a40aecdf083fe4"}, + {file = "multidict-6.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4e9f48f58c2c523d5a06faea47866cd35b32655c46b443f163d08c6d0ddb17d6"}, + {file = "multidict-6.1.0-cp313-cp313-win32.whl", hash = "sha256:3a37ffb35399029b45c6cc33640a92bef403c9fd388acce75cdc88f58bd19a81"}, + {file = "multidict-6.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:e9aa71e15d9d9beaad2c6b9319edcdc0a49a43ef5c0a4c8265ca9ee7d6c67774"}, + {file = "multidict-6.1.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:db7457bac39421addd0c8449933ac32d8042aae84a14911a757ae6ca3eef1392"}, + {file = "multidict-6.1.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d094ddec350a2fb899fec68d8353c78233debde9b7d8b4beeafa70825f1c281a"}, + {file = "multidict-6.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5845c1fd4866bb5dd3125d89b90e57ed3138241540897de748cdf19de8a2fca2"}, + {file = "multidict-6.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9079dfc6a70abe341f521f78405b8949f96db48da98aeb43f9907f342f627cdc"}, + {file = "multidict-6.1.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3914f5aaa0f36d5d60e8ece6a308ee1c9784cd75ec8151062614657a114c4478"}, + {file = "multidict-6.1.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c08be4f460903e5a9d0f76818db3250f12e9c344e79314d1d570fc69d7f4eae4"}, + {file = "multidict-6.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d093be959277cb7dee84b801eb1af388b6ad3ca6a6b6bf1ed7585895789d027d"}, + {file = "multidict-6.1.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3702ea6872c5a2a4eeefa6ffd36b042e9773f05b1f37ae3ef7264b1163c2dcf6"}, + {file = "multidict-6.1.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:2090f6a85cafc5b2db085124d752757c9d251548cedabe9bd31afe6363e0aff2"}, + {file = "multidict-6.1.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:f67f217af4b1ff66c68a87318012de788dd95fcfeb24cc889011f4e1c7454dfd"}, + {file = "multidict-6.1.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:189f652a87e876098bbc67b4da1049afb5f5dfbaa310dd67c594b01c10388db6"}, + {file = "multidict-6.1.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:6bb5992037f7a9eff7991ebe4273ea7f51f1c1c511e6a2ce511d0e7bdb754492"}, + {file = "multidict-6.1.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = 
"sha256:ac10f4c2b9e770c4e393876e35a7046879d195cd123b4f116d299d442b335bcd"}, + {file = "multidict-6.1.0-cp38-cp38-win32.whl", hash = "sha256:e27bbb6d14416713a8bd7aaa1313c0fc8d44ee48d74497a0ff4c3a1b6ccb5167"}, + {file = "multidict-6.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:22f3105d4fb15c8f57ff3959a58fcab6ce36814486500cd7485651230ad4d4ef"}, + {file = "multidict-6.1.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:4e18b656c5e844539d506a0a06432274d7bd52a7487e6828c63a63d69185626c"}, + {file = "multidict-6.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a185f876e69897a6f3325c3f19f26a297fa058c5e456bfcff8015e9a27e83ae1"}, + {file = "multidict-6.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ab7c4ceb38d91570a650dba194e1ca87c2b543488fe9309b4212694174fd539c"}, + {file = "multidict-6.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e617fb6b0b6953fffd762669610c1c4ffd05632c138d61ac7e14ad187870669c"}, + {file = "multidict-6.1.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:16e5f4bf4e603eb1fdd5d8180f1a25f30056f22e55ce51fb3d6ad4ab29f7d96f"}, + {file = "multidict-6.1.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f4c035da3f544b1882bac24115f3e2e8760f10a0107614fc9839fd232200b875"}, + {file = "multidict-6.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:957cf8e4b6e123a9eea554fa7ebc85674674b713551de587eb318a2df3e00255"}, + {file = "multidict-6.1.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:483a6aea59cb89904e1ceabd2b47368b5600fb7de78a6e4a2c2987b2d256cf30"}, + {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:87701f25a2352e5bf7454caa64757642734da9f6b11384c1f9d1a8e699758057"}, + {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:682b987361e5fd7a139ed565e30d81fd81e9629acc7d925a205366877d8c8657"}, + {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:ce2186a7df133a9c895dea3331ddc5ddad42cdd0d1ea2f0a51e5d161e4762f28"}, + {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:9f636b730f7e8cb19feb87094949ba54ee5357440b9658b2a32a5ce4bce53972"}, + {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:73eae06aa53af2ea5270cc066dcaf02cc60d2994bbb2c4ef5764949257d10f43"}, + {file = "multidict-6.1.0-cp39-cp39-win32.whl", hash = "sha256:1ca0083e80e791cffc6efce7660ad24af66c8d4079d2a750b29001b53ff59ada"}, + {file = "multidict-6.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:aa466da5b15ccea564bdab9c89175c762bc12825f4659c11227f515cee76fa4a"}, + {file = "multidict-6.1.0-py3-none-any.whl", hash = "sha256:48e171e52d1c4d33888e529b999e5900356b9ae588c2f09a52dcefb158b27506"}, + {file = "multidict-6.1.0.tar.gz", hash = "sha256:22ae2ebf9b0c69d206c003e2f6a914ea33f0a932d4aa16f236afc049d9958f4a"}, +] + +[package.dependencies] +typing-extensions = {version = ">=4.1.0", markers = "python_version < \"3.11\""} + +[package.source] +type = "legacy" +url = "https://pypi.tuna.tsinghua.edu.cn/simple" +reference = "mirrors" + [[package]] name = "packaging" version = "24.1" @@ -1484,6 +1780,31 @@ type = "legacy" url = "https://pypi.tuna.tsinghua.edu.cn/simple" reference = "mirrors" +[[package]] +name = "python-socks" +version = "2.5.2" +description = "Core proxy (SOCKS4, SOCKS5, HTTP tunneling) functionality for Python" +optional = false +python-versions = "*" +files = [ + {file = "python_socks-2.5.2-py3-none-any.whl", hash = 
"sha256:e2511c0d270d5135f8052d5e7ab7c4f089bd0f3fe0f54b8c322f8cbda5db2b2e"}, + {file = "python_socks-2.5.2.tar.gz", hash = "sha256:1a5220d159f88a92ef2f77d1acb77d175d40cb34af9176609d3cf728cb7499c7"}, +] + +[package.dependencies] +async-timeout = {version = ">=3.0.1", optional = true, markers = "python_version < \"3.11\" and extra == \"asyncio\""} + +[package.extras] +anyio = ["anyio (>=3.3.4,<5.0.0)"] +asyncio = ["async-timeout (>=3.0.1)"] +curio = ["curio (>=1.4)"] +trio = ["trio (>=0.16.0)"] + +[package.source] +type = "legacy" +url = "https://pypi.tuna.tsinghua.edu.cn/simple" +reference = "mirrors" + [[package]] name = "pywin32" version = "306" @@ -1675,38 +1996,6 @@ type = "legacy" url = "https://pypi.tuna.tsinghua.edu.cn/simple" reference = "mirrors" -[[package]] -name = "sniffio" -version = "1.3.1" -description = "Sniff out which async library your code is running under" -optional = false -python-versions = ">=3.7" -files = [ - {file = "sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2"}, - {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"}, -] - -[package.source] -type = "legacy" -url = "https://pypi.tuna.tsinghua.edu.cn/simple" -reference = "mirrors" - -[[package]] -name = "socksio" -version = "1.0.0" -description = "Sans-I/O implementation of SOCKS4, SOCKS4A, and SOCKS5." -optional = false -python-versions = ">=3.6" -files = [ - {file = "socksio-1.0.0-py3-none-any.whl", hash = "sha256:95dc1f15f9b34e8d7b16f06d74b8ccf48f609af32ab33c608d08761c5dcbb1f3"}, - {file = "socksio-1.0.0.tar.gz", hash = "sha256:f88beb3da5b5c38b9890469de67d0cb0f9d494b78b106ca1845f96c10b91c4ac"}, -] - -[package.source] -type = "legacy" -url = "https://pypi.tuna.tsinghua.edu.cn/simple" -reference = "mirrors" - [[package]] name = "time-machine" version = "2.15.0" @@ -1951,6 +2240,116 @@ type = "legacy" url = "https://pypi.tuna.tsinghua.edu.cn/simple" reference = "mirrors" +[[package]] +name = "yarl" +version = "1.13.1" +description = "Yet another URL library" +optional = false +python-versions = ">=3.8" +files = [ + {file = "yarl-1.13.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:82e692fb325013a18a5b73a4fed5a1edaa7c58144dc67ad9ef3d604eccd451ad"}, + {file = "yarl-1.13.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:df4e82e68f43a07735ae70a2d84c0353e58e20add20ec0af611f32cd5ba43fb4"}, + {file = "yarl-1.13.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ec9dd328016d8d25702a24ee274932aebf6be9787ed1c28d021945d264235b3c"}, + {file = "yarl-1.13.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5820bd4178e6a639b3ef1db8b18500a82ceab6d8b89309e121a6859f56585b05"}, + {file = "yarl-1.13.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:86c438ce920e089c8c2388c7dcc8ab30dfe13c09b8af3d306bcabb46a053d6f7"}, + {file = "yarl-1.13.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3de86547c820e4f4da4606d1c8ab5765dd633189791f15247706a2eeabc783ae"}, + {file = "yarl-1.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8ca53632007c69ddcdefe1e8cbc3920dd88825e618153795b57e6ebcc92e752a"}, + {file = "yarl-1.13.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d4ee1d240b84e2f213565f0ec08caef27a0e657d4c42859809155cf3a29d1735"}, + {file = "yarl-1.13.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = 
"sha256:c49f3e379177f4477f929097f7ed4b0622a586b0aa40c07ac8c0f8e40659a1ac"}, + {file = "yarl-1.13.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:5c5e32fef09ce101fe14acd0f498232b5710effe13abac14cd95de9c274e689e"}, + {file = "yarl-1.13.1-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:ab9524e45ee809a083338a749af3b53cc7efec458c3ad084361c1dbf7aaf82a2"}, + {file = "yarl-1.13.1-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:b1481c048fe787f65e34cb06f7d6824376d5d99f1231eae4778bbe5c3831076d"}, + {file = "yarl-1.13.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:31497aefd68036d8e31bfbacef915826ca2e741dbb97a8d6c7eac66deda3b606"}, + {file = "yarl-1.13.1-cp310-cp310-win32.whl", hash = "sha256:1fa56f34b2236f5192cb5fceba7bbb09620e5337e0b6dfe2ea0ddbd19dd5b154"}, + {file = "yarl-1.13.1-cp310-cp310-win_amd64.whl", hash = "sha256:1bbb418f46c7f7355084833051701b2301092e4611d9e392360c3ba2e3e69f88"}, + {file = "yarl-1.13.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:216a6785f296169ed52cd7dcdc2612f82c20f8c9634bf7446327f50398732a51"}, + {file = "yarl-1.13.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:40c6e73c03a6befb85b72da213638b8aaa80fe4136ec8691560cf98b11b8ae6e"}, + {file = "yarl-1.13.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2430cf996113abe5aee387d39ee19529327205cda975d2b82c0e7e96e5fdabdc"}, + {file = "yarl-1.13.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9fb4134cc6e005b99fa29dbc86f1ea0a298440ab6b07c6b3ee09232a3b48f495"}, + {file = "yarl-1.13.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:309c104ecf67626c033845b860d31594a41343766a46fa58c3309c538a1e22b2"}, + {file = "yarl-1.13.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f90575e9fe3aae2c1e686393a9689c724cd00045275407f71771ae5d690ccf38"}, + {file = "yarl-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d2e1626be8712333a9f71270366f4a132f476ffbe83b689dd6dc0d114796c74"}, + {file = "yarl-1.13.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5b66c87da3c6da8f8e8b648878903ca54589038a0b1e08dde2c86d9cd92d4ac9"}, + {file = "yarl-1.13.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:cf1ad338620249f8dd6d4b6a91a69d1f265387df3697ad5dc996305cf6c26fb2"}, + {file = "yarl-1.13.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:9915300fe5a0aa663c01363db37e4ae8e7c15996ebe2c6cce995e7033ff6457f"}, + {file = "yarl-1.13.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:703b0f584fcf157ef87816a3c0ff868e8c9f3c370009a8b23b56255885528f10"}, + {file = "yarl-1.13.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:1d8e3ca29f643dd121f264a7c89f329f0fcb2e4461833f02de6e39fef80f89da"}, + {file = "yarl-1.13.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:7055bbade838d68af73aea13f8c86588e4bcc00c2235b4b6d6edb0dbd174e246"}, + {file = "yarl-1.13.1-cp311-cp311-win32.whl", hash = "sha256:a3442c31c11088e462d44a644a454d48110f0588de830921fd201060ff19612a"}, + {file = "yarl-1.13.1-cp311-cp311-win_amd64.whl", hash = "sha256:81bad32c8f8b5897c909bf3468bf601f1b855d12f53b6af0271963ee67fff0d2"}, + {file = "yarl-1.13.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:f452cc1436151387d3d50533523291d5f77c6bc7913c116eb985304abdbd9ec9"}, + {file = "yarl-1.13.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:9cec42a20eae8bebf81e9ce23fb0d0c729fc54cf00643eb251ce7c0215ad49fe"}, + {file = 
"yarl-1.13.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d959fe96e5c2712c1876d69af0507d98f0b0e8d81bee14cfb3f6737470205419"}, + {file = "yarl-1.13.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b8c837ab90c455f3ea8e68bee143472ee87828bff19ba19776e16ff961425b57"}, + {file = "yarl-1.13.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:94a993f976cdcb2dc1b855d8b89b792893220db8862d1a619efa7451817c836b"}, + {file = "yarl-1.13.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2b2442a415a5f4c55ced0fade7b72123210d579f7d950e0b5527fc598866e62c"}, + {file = "yarl-1.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3fdbf0418489525231723cdb6c79e7738b3cbacbaed2b750cb033e4ea208f220"}, + {file = "yarl-1.13.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6b7f6e699304717fdc265a7e1922561b02a93ceffdaefdc877acaf9b9f3080b8"}, + {file = "yarl-1.13.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:bcd5bf4132e6a8d3eb54b8d56885f3d3a38ecd7ecae8426ecf7d9673b270de43"}, + {file = "yarl-1.13.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:2a93a4557f7fc74a38ca5a404abb443a242217b91cd0c4840b1ebedaad8919d4"}, + {file = "yarl-1.13.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:22b739f99c7e4787922903f27a892744189482125cc7b95b747f04dd5c83aa9f"}, + {file = "yarl-1.13.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:2db874dd1d22d4c2c657807562411ffdfabec38ce4c5ce48b4c654be552759dc"}, + {file = "yarl-1.13.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:4feaaa4742517eaceafcbe74595ed335a494c84634d33961214b278126ec1485"}, + {file = "yarl-1.13.1-cp312-cp312-win32.whl", hash = "sha256:bbf9c2a589be7414ac4a534d54e4517d03f1cbb142c0041191b729c2fa23f320"}, + {file = "yarl-1.13.1-cp312-cp312-win_amd64.whl", hash = "sha256:d07b52c8c450f9366c34aa205754355e933922c79135125541daae6cbf31c799"}, + {file = "yarl-1.13.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:95c6737f28069153c399d875317f226bbdea939fd48a6349a3b03da6829fb550"}, + {file = "yarl-1.13.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:cd66152561632ed4b2a9192e7f8e5a1d41e28f58120b4761622e0355f0fe034c"}, + {file = "yarl-1.13.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:6a2acde25be0cf9be23a8f6cbd31734536a264723fca860af3ae5e89d771cd71"}, + {file = "yarl-1.13.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9a18595e6a2ee0826bf7dfdee823b6ab55c9b70e8f80f8b77c37e694288f5de1"}, + {file = "yarl-1.13.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a31d21089894942f7d9a8df166b495101b7258ff11ae0abec58e32daf8088813"}, + {file = "yarl-1.13.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:45f209fb4bbfe8630e3d2e2052535ca5b53d4ce2d2026bed4d0637b0416830da"}, + {file = "yarl-1.13.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f722f30366474a99745533cc4015b1781ee54b08de73260b2bbe13316079851"}, + {file = "yarl-1.13.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f3bf60444269345d712838bb11cc4eadaf51ff1a364ae39ce87a5ca8ad3bb2c8"}, + {file = "yarl-1.13.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:942c80a832a79c3707cca46bd12ab8aa58fddb34b1626d42b05aa8f0bcefc206"}, + {file = "yarl-1.13.1-cp313-cp313-musllinux_1_2_i686.whl", hash = 
"sha256:44b07e1690f010c3c01d353b5790ec73b2f59b4eae5b0000593199766b3f7a5c"}, + {file = "yarl-1.13.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:396e59b8de7e4d59ff5507fb4322d2329865b909f29a7ed7ca37e63ade7f835c"}, + {file = "yarl-1.13.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:3bb83a0f12701c0b91112a11148b5217617982e1e466069d0555be9b372f2734"}, + {file = "yarl-1.13.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:c92b89bffc660f1274779cb6fbb290ec1f90d6dfe14492523a0667f10170de26"}, + {file = "yarl-1.13.1-cp313-cp313-win32.whl", hash = "sha256:269c201bbc01d2cbba5b86997a1e0f73ba5e2f471cfa6e226bcaa7fd664b598d"}, + {file = "yarl-1.13.1-cp313-cp313-win_amd64.whl", hash = "sha256:1d0828e17fa701b557c6eaed5edbd9098eb62d8838344486248489ff233998b8"}, + {file = "yarl-1.13.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:8be8cdfe20787e6a5fcbd010f8066227e2bb9058331a4eccddec6c0db2bb85b2"}, + {file = "yarl-1.13.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:08d7148ff11cb8e886d86dadbfd2e466a76d5dd38c7ea8ebd9b0e07946e76e4b"}, + {file = "yarl-1.13.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4afdf84610ca44dcffe8b6c22c68f309aff96be55f5ea2fa31c0c225d6b83e23"}, + {file = "yarl-1.13.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d0d12fe78dcf60efa205e9a63f395b5d343e801cf31e5e1dda0d2c1fb618073d"}, + {file = "yarl-1.13.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:298c1eecfd3257aa16c0cb0bdffb54411e3e831351cd69e6b0739be16b1bdaa8"}, + {file = "yarl-1.13.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c14c16831b565707149c742d87a6203eb5597f4329278446d5c0ae7a1a43928e"}, + {file = "yarl-1.13.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5a9bacedbb99685a75ad033fd4de37129449e69808e50e08034034c0bf063f99"}, + {file = "yarl-1.13.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:658e8449b84b92a4373f99305de042b6bd0d19bf2080c093881e0516557474a5"}, + {file = "yarl-1.13.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:373f16f38721c680316a6a00ae21cc178e3a8ef43c0227f88356a24c5193abd6"}, + {file = "yarl-1.13.1-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:45d23c4668d4925688e2ea251b53f36a498e9ea860913ce43b52d9605d3d8177"}, + {file = "yarl-1.13.1-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:f7917697bcaa3bc3e83db91aa3a0e448bf5cde43c84b7fc1ae2427d2417c0224"}, + {file = "yarl-1.13.1-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:5989a38ba1281e43e4663931a53fbf356f78a0325251fd6af09dd03b1d676a09"}, + {file = "yarl-1.13.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:11b3ca8b42a024513adce810385fcabdd682772411d95bbbda3b9ed1a4257644"}, + {file = "yarl-1.13.1-cp38-cp38-win32.whl", hash = "sha256:dcaef817e13eafa547cdfdc5284fe77970b891f731266545aae08d6cce52161e"}, + {file = "yarl-1.13.1-cp38-cp38-win_amd64.whl", hash = "sha256:7addd26594e588503bdef03908fc207206adac5bd90b6d4bc3e3cf33a829f57d"}, + {file = "yarl-1.13.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:a0ae6637b173d0c40b9c1462e12a7a2000a71a3258fa88756a34c7d38926911c"}, + {file = "yarl-1.13.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:576365c9f7469e1f6124d67b001639b77113cfd05e85ce0310f5f318fd02fe85"}, + {file = "yarl-1.13.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:78f271722423b2d4851cf1f4fa1a1c4833a128d020062721ba35e1a87154a049"}, + {file = "yarl-1.13.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:9d74f3c335cfe9c21ea78988e67f18eb9822f5d31f88b41aec3a1ec5ecd32da5"}, + {file = "yarl-1.13.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1891d69a6ba16e89473909665cd355d783a8a31bc84720902c5911dbb6373465"}, + {file = "yarl-1.13.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fb382fd7b4377363cc9f13ba7c819c3c78ed97c36a82f16f3f92f108c787cbbf"}, + {file = "yarl-1.13.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9c8854b9f80693d20cec797d8e48a848c2fb273eb6f2587b57763ccba3f3bd4b"}, + {file = "yarl-1.13.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bbf2c3f04ff50f16404ce70f822cdc59760e5e2d7965905f0e700270feb2bbfc"}, + {file = "yarl-1.13.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:fb9f59f3848edf186a76446eb8bcf4c900fe147cb756fbbd730ef43b2e67c6a7"}, + {file = "yarl-1.13.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:ef9b85fa1bc91c4db24407e7c4da93a5822a73dd4513d67b454ca7064e8dc6a3"}, + {file = "yarl-1.13.1-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:098b870c18f1341786f290b4d699504e18f1cd050ed179af8123fd8232513424"}, + {file = "yarl-1.13.1-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:8c723c91c94a3bc8033dd2696a0f53e5d5f8496186013167bddc3fb5d9df46a3"}, + {file = "yarl-1.13.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:44a4c40a6f84e4d5955b63462a0e2a988f8982fba245cf885ce3be7618f6aa7d"}, + {file = "yarl-1.13.1-cp39-cp39-win32.whl", hash = "sha256:84bbcdcf393139f0abc9f642bf03f00cac31010f3034faa03224a9ef0bb74323"}, + {file = "yarl-1.13.1-cp39-cp39-win_amd64.whl", hash = "sha256:fc2931ac9ce9c61c9968989ec831d3a5e6fcaaff9474e7cfa8de80b7aff5a093"}, + {file = "yarl-1.13.1-py3-none-any.whl", hash = "sha256:6a5185ad722ab4dd52d5fb1f30dcc73282eb1ed494906a92d1a228d3f89607b0"}, + {file = "yarl-1.13.1.tar.gz", hash = "sha256:ec8cfe2295f3e5e44c51f57272afbd69414ae629ec7c6b27f5a410efc78b70a0"}, +] + +[package.dependencies] +idna = ">=2.0" +multidict = ">=4.0" + +[package.source] +type = "legacy" +url = "https://pypi.tuna.tsinghua.edu.cn/simple" +reference = "mirrors" + [[package]] name = "zipp" version = "3.20.2" @@ -1978,4 +2377,4 @@ reference = "mirrors" [metadata] lock-version = "2.0" python-versions = "<3.13,>=3.10" -content-hash = "3c98b4c2562b1cc5d88474d6962ab34e60be1be488d840c691c0d0e1095d7285" +content-hash = "4f40efe2d34c2dd6b279869363068ee58b82ac0de10b674eaf50acc3160f8527" diff --git a/pyproject.toml b/pyproject.toml index a74d2bc1b..c25caa463 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,8 +27,9 @@ confz = "^2.0.1" pydantic-extra-types = "^2.9.0" pendulum = "^3.0.0" slimeface = "^2024.9.27" -httpx = {extras = ["socks"], version = "^0.27.2"} aiofiles = "^24.1.0" +aiohttp = "^3.10.8" +aiohttp-socks = "^0.9.0" [tool.poetry.scripts] javsp = "javsp.__main__:entry" diff --git a/unittest/test_proxyfree.py b/unittest/test_proxyfree.py index 65151a9d4..7738a7361 100644 --- a/unittest/test_proxyfree.py +++ b/unittest/test_proxyfree.py @@ -3,11 +3,13 @@ from javsp.crawlers.proxyfree import get_proxy_free_url from javsp.config import CrawlerID +from javsp.network.client import clear_clients def test_get_url(): async def wrap(): assert await get_proxy_free_url(CrawlerID.javlib) != None assert await get_proxy_free_url(CrawlerID.javdb) != None + await clear_clients() asyncio.run(wrap()) @@ -15,11 +17,13 @@ def test_get_url_with_prefer(): async def wrap(): prefer_url = 'https://www.baidu.com' assert prefer_url == await 
get_proxy_free_url(CrawlerID.javlib, prefer_url) + await clear_clients() asyncio.run(wrap()) if __name__ == "__main__": async def aentry(): print(await get_proxy_free_url(CrawlerID.javlib)) + await clear_clients() tracemalloc.start() asyncio.run(aentry(), debug=True) From a6230ece25a9ad0537d9bae592b327ff0cc44c31 Mon Sep 17 00:00:00 2001 From: glyh Date: Sun, 29 Sep 2024 11:58:47 +0800 Subject: [PATCH 09/10] test web crawlers first --- .github/workflows/test-web-funcs.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test-web-funcs.yml b/.github/workflows/test-web-funcs.yml index 37f2cdd9a..3c1d05c13 100644 --- a/.github/workflows/test-web-funcs.yml +++ b/.github/workflows/test-web-funcs.yml @@ -48,12 +48,12 @@ jobs: - name: Public IP id: ip uses: haythem/public-ip@v1.3 - - name: Test proxyfree.py - run: | - poetry run pytest unittest/test_proxyfree.py - name: Test web crawlers run: | poetry run pytest unittest/test_crawlers.py + - name: Test proxyfree.py + run: | + poetry run pytest unittest/test_proxyfree.py - name: Upload log as artifact uses: actions/upload-artifact@v4 if: ${{ always() }} From 9f0499d7bf05498c0aaa650e1cb9519d02f053d3 Mon Sep 17 00:00:00 2001 From: glyh Date: Sun, 29 Sep 2024 16:33:29 +0800 Subject: [PATCH 10/10] fix: remove extra `/`, make links absolute --- javsp/crawlers/sites/airav.py | 2 +- javsp/crawlers/sites/arzon.py | 2 +- javsp/crawlers/sites/arzon_iv.py | 2 +- javsp/crawlers/sites/avwiki.py | 2 +- javsp/crawlers/sites/dl_getchu.py | 3 ++- javsp/crawlers/sites/fc2.py | 1 + javsp/crawlers/sites/gyutto.py | 1 + javsp/crawlers/sites/jav321.py | 1 + javsp/crawlers/sites/javbus.py | 7 +++++-- javsp/crawlers/sites/javdb.py | 9 ++++++--- javsp/crawlers/sites/mgstage.py | 2 +- javsp/crawlers/sites/prestige.py | 2 +- 12 files changed, 22 insertions(+), 12 deletions(-) diff --git a/javsp/crawlers/sites/airav.py b/javsp/crawlers/sites/airav.py index 8bc4fa6e6..00c0503b9 100644 --- a/javsp/crawlers/sites/airav.py +++ b/javsp/crawlers/sites/airav.py @@ -80,7 +80,7 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None: data = resp_json['result'] dvdid = data['barcode'] movie.dvdid = dvdid - movie.url = self.base_url + '/video/' + dvdid + movie.url = self.base_url + 'video/' + dvdid # plot和title中可能含有HTML的转义字符,需要进行解转义处理 movie.plot = unescape(data['description']) or None movie.cover = data['img_url'] diff --git a/javsp/crawlers/sites/arzon.py b/javsp/crawlers/sites/arzon.py index f325984d0..6fb868cc4 100644 --- a/javsp/crawlers/sites/arzon.py +++ b/javsp/crawlers/sites/arzon.py @@ -39,7 +39,7 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None: if len(urls) == 0: raise MovieNotFoundError(__name__, movie.dvdid) - item_url = self.base_url + urls[0] + item_url = self.base_url[:-1] + urls[0] e = await self.client.get(item_url) item = html.fromstring(await e.read()) diff --git a/javsp/crawlers/sites/arzon_iv.py b/javsp/crawlers/sites/arzon_iv.py index 65c9b1367..40b763b7f 100644 --- a/javsp/crawlers/sites/arzon_iv.py +++ b/javsp/crawlers/sites/arzon_iv.py @@ -40,7 +40,7 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None: if len(urls) == 0: raise MovieNotFoundError(__name__, movie.dvdid) - item_url = self.base_url + urls[0] + item_url = self.base_url[:-1] + urls[0] e = await self.client.get(item_url) item = html.fromstring(await e.read()) diff --git a/javsp/crawlers/sites/avwiki.py b/javsp/crawlers/sites/avwiki.py index 6a75dd345..ec4b3adf9 100644 --- a/javsp/crawlers/sites/avwiki.py +++ 
b/javsp/crawlers/sites/avwiki.py @@ -24,7 +24,7 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None: Args: movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内 """ - movie.url = url = f'{self.base_url}/{movie.dvdid}' + movie.url = url = f'{self.base_url}{movie.dvdid}' resp = await self.client.get(url) if resp.status == 404: diff --git a/javsp/crawlers/sites/dl_getchu.py b/javsp/crawlers/sites/dl_getchu.py index a635515d4..c34ad17c2 100644 --- a/javsp/crawlers/sites/dl_getchu.py +++ b/javsp/crawlers/sites/dl_getchu.py @@ -66,11 +66,12 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None: raise ValueError('Invalid GETCHU number: ' + movie.dvdid) getchu_id = id_uc.replace('GETCHU-', '') # 抓取网页 - url = f'{self.base_url}/i/item{getchu_id}' + url = f'{self.base_url}i/item{getchu_id}' r = await self.client.get(url) if r.status == 404: raise MovieNotFoundError(__name__, movie.dvdid) tree = html.fromstring((await r.read()).decode(encoding='euc_jp', errors='ignore')) + tree.make_links_absolute(base_url=str(self.base_url)) container = tree.xpath("//form[@action='https://dl.getchu.com/cart/']/div/table[3]") if len(container) > 0: container = container[0] diff --git a/javsp/crawlers/sites/fc2.py b/javsp/crawlers/sites/fc2.py index 4ef981ff1..01deffdab 100644 --- a/javsp/crawlers/sites/fc2.py +++ b/javsp/crawlers/sites/fc2.py @@ -57,6 +57,7 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None: if '/id.fc2.com/' in str(resp.url): raise SiteBlocked('FC2要求当前IP登录账号才可访问,请尝试更换为日本IP') tree = html.fromstring(await resp.text()) + tree.make_links_absolute(base_url=self.base_url) container = tree.xpath("//div[@class='items_article_left']") if len(container) > 0: container = container[0] diff --git a/javsp/crawlers/sites/gyutto.py b/javsp/crawlers/sites/gyutto.py index 632fb9123..8f294c8f2 100644 --- a/javsp/crawlers/sites/gyutto.py +++ b/javsp/crawlers/sites/gyutto.py @@ -57,6 +57,7 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None: if r.status == 404: raise MovieNotFoundError(__name__, movie.dvdid) tree = html.fromstring(await r.text()) + tree.make_links_absolute(self.base_url) container = tree.xpath("//dl[@class='BasicInfo clearfix']") producer = None diff --git a/javsp/crawlers/sites/jav321.py b/javsp/crawlers/sites/jav321.py index 61f609bfd..6a50da46e 100644 --- a/javsp/crawlers/sites/jav321.py +++ b/javsp/crawlers/sites/jav321.py @@ -30,6 +30,7 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None: """解析指定番号的影片数据""" resp = await self.client.post(f'{self.base_url}/search', data={'sn': movie.dvdid}) tree = html.fromstring(await resp.text()) + tree.make_links_absolute(self.base_url) page_url = tree.xpath("//ul[@class='dropdown-menu']/li/a/@href")[0] #TODO: 注意cid是dmm的概念。如果影片来自MGSTAGE,这里的cid很可能是jav321自己添加的,例如 345SIMM-542 cid = page_url.split('/')[-1] # /video/ipx00177 diff --git a/javsp/crawlers/sites/javbus.py b/javsp/crawlers/sites/javbus.py index 3038579cd..07d085563 100644 --- a/javsp/crawlers/sites/javbus.py +++ b/javsp/crawlers/sites/javbus.py @@ -20,11 +20,13 @@ class JavbusCrawler(Crawler): id = CrawlerID.javbus genre_map: GenreMap + perma_url: str @classmethod async def create(cls): self = cls() - url = await resolve_site_fallback(self.id, 'https://www.javbus.com') + self.perma_url = 'https://www.javbus.com' + url = await resolve_site_fallback(self.id, self.perma_url) self.base_url = str(url) self.client = get_session(url) self.client.cookie_jar.update_cookies({'age': 'verified', 'dv': '1'}) @@ -41,6 +43,7 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None: resp = 
await self.client.get(url) tree = html.fromstring(await resp.text()) + tree.make_links_absolute(base_url=self.perma_url) # 疑似JavBus检测到类似爬虫的行为时会要求登录,不过发现目前不需要登录也可以从重定向前的网页中提取信息 # 引入登录验证后状态码不再准确,因此还要额外通过检测标题来确认是否发生了404 page_title = tree.xpath('/html/head/title/text()') @@ -93,7 +96,7 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None: if not pic_url.endswith('nowprinting.gif'): # 略过默认的头像 actress_pics[name] = pic_url # 整理数据并更新movie的相应属性 - movie.url = f'{self.base_url}/{movie.dvdid}' + movie.url = f'{self.perma_url}/{movie.dvdid}' movie.dvdid = dvdid movie.title = title.replace(dvdid, '').strip() movie.cover = cover diff --git a/javsp/crawlers/sites/javdb.py b/javsp/crawlers/sites/javdb.py index d101e2c4e..dafa010d1 100644 --- a/javsp/crawlers/sites/javdb.py +++ b/javsp/crawlers/sites/javdb.py @@ -18,6 +18,7 @@ from lxml import html logger = logging.getLogger(__name__) +perma_url = 'https://www.javdb.com' class JavDbCrawler(Crawler): id = CrawlerID.javdb @@ -28,7 +29,7 @@ class JavDbCrawler(Crawler): @classmethod async def create(cls): self = cls() - url = await resolve_site_fallback(self.id, 'https://www.javdb.com') + url = await resolve_site_fallback(self.id, perma_url) self.base_url = str(url) self.client = get_session(url) self.headers = {'Accept-Language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en-US;q=0.7,en;q=0.6,ja;q=0.5'} @@ -66,7 +67,9 @@ async def get_html_wrapper(self, url: str): raise SitePermissionError(f"JavDB: 此资源被限制为仅VIP可见: '{r.history[0].url}'") else: - return html.fromstring(await r.text()) + tree = html.fromstring(await r.text()) + tree.make_links_absolute(base_url=perma_url) + return tree elif r.status in (403, 503): tree = html.fromstring(await r.text()) code_tag = tree.xpath("//span[@class='code-label']/span") @@ -131,7 +134,7 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None: index = ids.index(movie.dvdid.lower()) new_url = movie_urls[index] try: - html2 = await self.get_html_wrapper(self.base_url + new_url) + html2 = await self.get_html_wrapper(new_url) except (SitePermissionError, CredentialError): # 不开VIP不让看,过分。决定榨出能获得的信息,毕竟有时候只有这里能找到标题和封面 box = tree.xpath("//a[@class='box']")[index] diff --git a/javsp/crawlers/sites/mgstage.py b/javsp/crawlers/sites/mgstage.py index a352470bf..d21d5e3af 100644 --- a/javsp/crawlers/sites/mgstage.py +++ b/javsp/crawlers/sites/mgstage.py @@ -29,7 +29,7 @@ async def create(cls): async def crawl_and_fill(self, movie: MovieInfo) -> None: """解析指定番号的影片数据""" - url = f'{self.base_url}/product/product_detail/{movie.dvdid}/' + url = f'{self.base_url}product/product_detail/{movie.dvdid}/' resp = await self.client.get(url) if resp.status == 403: raise SiteBlocked('mgstage不允许从当前IP所在地区访问,请尝试更换为日本地区代理') diff --git a/javsp/crawlers/sites/prestige.py b/javsp/crawlers/sites/prestige.py index 5d0d4c9bb..a4fe7de41 100644 --- a/javsp/crawlers/sites/prestige.py +++ b/javsp/crawlers/sites/prestige.py @@ -35,7 +35,7 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None: Args: movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内 """ - url = f'{self.base_url}/goods/goods_detail.php?sku={movie.dvdid}' + url = f'{self.base_url}goods/goods_detail.php?sku={movie.dvdid}' resp = await self.client.get(url) if resp.status == 500: # 500错误表明prestige没有这部影片的数据,不是网络问题,因此不再重试
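
A minimal sketch (not taken from the patch) of the URL conventions these hunks rely on: base_url is assumed to keep its trailing '/', so site paths are appended without a leading '/', and relative hrefs pulled from parsed pages are resolved against the permanent site URL with lxml's make_links_absolute(). The site and番号 values below are hypothetical, for illustration only.

    from lxml import html

    # assumed: resolve_site_fallback returns a URL whose string form ends with '/'
    base_url = 'https://www.javbus.com/'
    print(base_url + 'video/ABC-123')      # joins cleanly, no '//' in the path

    # relative links in a parsed page become absolute before they are requested again
    tree = html.fromstring('<a href="/ABC-123">example</a>')
    tree.make_links_absolute(base_url='https://www.javbus.com')
    print(tree.xpath('//a/@href')[0])      # 'https://www.javbus.com/ABC-123'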