fix(tmdb): remove fallback to TMDB website scraping when API search fails

This commit is contained in:
jxxghp
2026-05-17 19:46:50 +08:00
parent 93130baf0a
commit c857ae3e14
3 changed files with 35 additions and 231 deletions

View File

@@ -501,9 +501,6 @@ class TheMovieDbModule(_ModuleBase):
names = self._prepare_search_names(meta)
for name in names:
info = self._search_by_name(name, meta, group_seasons)
if not info:
# 从网站查询
info = self.tmdb.match_web(name=name, mtype=meta.type)
if info:
# 查到就退出
break
@@ -590,9 +587,6 @@ class TheMovieDbModule(_ModuleBase):
names = self._prepare_search_names(meta)
for name in names:
info = await self._async_search_by_name(name, meta, group_seasons)
if not info:
# 从网站查询
info = await self.tmdb.async_match_web(name=name, mtype=meta.type)
if info:
# 查到就退出
break

View File

@@ -1,18 +1,11 @@
import re
import traceback
from typing import Optional, List
from urllib.parse import quote
import zhconv
from lxml import etree
from app.core.cache import cached
from app.core.config import settings
from app.log import logger
from app.schemas import APIRateLimitException
from app.schemas.types import MediaType
from app.utils.http import RequestUtils, AsyncRequestUtils
from app.utils.limit import rate_limit_exponential
from app.utils.string import StringUtils
from .tmdbv3api import TMDb, Search, Movie, TV, Season, Episode, Discover, Trending, Person, Collection
from .tmdbv3api.exceptions import TMDbException
@@ -261,152 +254,6 @@ class TmdbApi:
return multi
return None
# match_web 公共方法
@staticmethod
def _validate_web_params(name: str) -> Optional[dict]:
    """
    Validate the arguments of a website search before any request is made.

    :param name: title to search for
    :return: ``None`` when the search should proceed; a dict (always empty
             here) when the caller should return it immediately
    """
    # Nothing to search for: short-circuit with an empty result. Returning
    # None here would mean "proceed" and send an empty query to the website.
    if not name:
        return {}
    # Chinese titles short-circuit with an empty result as well.
    if StringUtils.is_chinese(name):
        return {}
    return None  # proceed with the search
@staticmethod
def _build_tmdb_search_url(name: str) -> str:
    """
    Build the TMDB website search URL for a title.

    :param name: title to search for; it is percent-encoded with ``quote``
    :return: absolute URL of the search page
    """
    encoded_query = quote(name)
    return "https://www.themoviedb.org/search?query=%s" % encoded_query
@staticmethod
def _validate_response(res) -> Optional[dict]:
    """
    Check the HTTP response of a website search.

    :param res: response object exposing ``status_code``, or ``None``
    :return: ``None`` when the caller should carry on (also returned when
             ``res`` is ``None``); an empty dict when the caller should
             return it immediately
    :raises APIRateLimitException: when the status code is 429
    """
    if res is None:
        return None
    if res.status_code == 429:
        # Service name spelled consistently with the other TheMovieDb messages.
        raise APIRateLimitException("触发TheMovieDb网站限流获取媒体信息失败")
    if res.status_code != 200:
        return {}
    return None  # carry on
@staticmethod
def _extract_tmdb_links(html_text: str, mtype: MediaType) -> List[str]:
    """
    Extract TMDB detail links from the HTML of a search page.

    :param html_text: raw HTML text; an empty value yields an empty list
    :param mtype: media type; for TV only anchors marked ``data-media-type='tv'``
                  are considered, otherwise every anchor carrying ``data-id``
    :return: de-duplicated links, in first-seen order, that start with
             ``/tv`` or ``/movie``; an empty list when parsing fails
    """
    if not html_text:
        return []
    html = None
    try:
        html = etree.HTML(html_text)
        if mtype == MediaType.TV:
            xpath_expr = "//a[@data-id and @data-media-type='tv']/@href"
        else:
            xpath_expr = "//a[@data-id]/@href"
        candidates = html.xpath(xpath_expr)
        # dict.fromkeys drops duplicates while keeping first-seen order.
        return list(dict.fromkeys(
            href for href in candidates
            if href and href.startswith(("/tv", "/movie"))
        ))
    except Exception as err:
        logger.error(f"解析TMDB网站HTML出错{str(err)}")
        return []
    finally:
        # Drop the local reference to the parsed tree.
        if html is not None:
            del html
@staticmethod
def _log_web_search_result(name: str, tmdbinfo: dict):
    """
    Log what the website search recognised.

    :param name: title that was searched
    :param tmdbinfo: TMDB info dict; its ``media_type`` selects the movie
                     message, anything else uses the TV message
    """
    if tmdbinfo.get('media_type') == MediaType.MOVIE:
        template = "%s 从WEB识别到 电影TMDBID=%s, 名称=%s, 上映日期=%s"
        title_key, date_key = 'title', 'release_date'
    else:
        template = "%s 从WEB识别到 电视剧TMDBID=%s, 名称=%s, 首播日期=%s"
        title_key, date_key = 'name', 'first_air_date'
    details = (
        name,
        tmdbinfo.get('id'),
        tmdbinfo.get(title_key),
        tmdbinfo.get(date_key),
    )
    logger.info(template % details)
def _process_web_search_links(self, name: str, mtype: MediaType,
                              tmdb_links: List[str], get_info_func) -> Optional[dict]:
    """
    Turn the links scraped from the website search into a TMDB info dict.

    :param name: title that was searched (used for logging)
    :param mtype: requested media type
    :param tmdb_links: links extracted from the search page
    :param get_info_func: callable invoked as ``get_info_func(mtype=..., tmdbid=...)``
    :return: whatever ``get_info_func`` returns when there is exactly one link
             and it is acceptable; otherwise an empty dict
    """
    link_count = len(tmdb_links)
    if link_count != 1:
        # Only an unambiguous single hit is used.
        if link_count > 1:
            logger.info("%s TMDB网站返回数据过多%s" % (name, len(tmdb_links)))
        else:
            logger.info("%s TMDB网站未查询到媒体信息" % name)
        return {}

    tmdbid = self._parse_tmdb_id_from_link(tmdb_links[0])
    if not tmdbid:
        logger.warn(f"无法从链接解析TMDBID{tmdb_links[0]}")
        return {}

    # The link prefix decides which kind of record is fetched.
    link_type = MediaType.TV if tmdb_links[0].startswith("/tv") else MediaType.MOVIE
    tmdbinfo = get_info_func(mtype=link_type, tmdbid=tmdbid)
    if not tmdbinfo:
        return tmdbinfo
    # A TV request must not be answered with a non-TV record.
    if mtype == MediaType.TV and tmdbinfo.get('media_type') != MediaType.TV:
        return {}
    self._log_web_search_result(name, tmdbinfo)
    return tmdbinfo
async def _async_process_web_search_links(self, name: str,
                                          mtype: MediaType, tmdb_links: List[str]) -> Optional[dict]:
    """
    Turn the links scraped from the website search into a TMDB info dict (async version).

    :param name: title that was searched (used for logging)
    :param mtype: requested media type
    :param tmdb_links: links extracted from the search page
    :return: whatever ``self.async_get_info`` returns when there is exactly one
             link and it is acceptable; otherwise an empty dict
    """
    link_count = len(tmdb_links)
    if link_count != 1:
        # Only an unambiguous single hit is used.
        if link_count > 1:
            logger.info("%s TMDB网站返回数据过多%s" % (name, len(tmdb_links)))
        else:
            logger.info("%s TMDB网站未查询到媒体信息" % name)
        return {}

    tmdbid = self._parse_tmdb_id_from_link(tmdb_links[0])
    if not tmdbid:
        logger.warn(f"无法从链接解析TMDBID{tmdb_links[0]}")
        return {}

    # The link prefix decides which kind of record is fetched.
    link_type = MediaType.TV if tmdb_links[0].startswith("/tv") else MediaType.MOVIE
    tmdbinfo = await self.async_get_info(mtype=link_type, tmdbid=tmdbid)
    if not tmdbinfo:
        return tmdbinfo
    # A TV request must not be answered with a non-TV record.
    if mtype == MediaType.TV and tmdbinfo.get('media_type') != MediaType.TV:
        return {}
    self._log_web_search_result(name, tmdbinfo)
    return tmdbinfo
@staticmethod
def _parse_tmdb_id_from_link(link: str) -> Optional[int]:
    """
    Parse the numeric ID out of a relative TMDB link.

    Accepted shapes: /movie/1195631-william-tell, /tv/65942-re, /tv/79744-the-rookie

    :param link: relative link such as ``/movie/<id>-<slug>``
    :return: the ID as an int, or ``None`` when the link is empty or does not match
    """
    id_match = re.match(r"^/[^/]+/(\d+)", link) if link else None
    if id_match is None:
        return None
    try:
        return int(id_match.group(1))
    except Exception as err:
        logger.debug(f"解析TMDBID失败{str(err)} - {traceback.format_exc()}")
        return None
@staticmethod
def __get_names(tmdb_info: dict) -> List[str]:
"""
@@ -737,40 +584,6 @@ class TmdbApi:
# 类型变更
return self._convert_media_type(ret_info)
@cached(maxsize=settings.CONF.tmdb, ttl=settings.CONF.meta)
@rate_limit_exponential(source="match_tmdb_web", base_wait=5, max_wait=1800, enable_logging=True)
def match_web(self, name: str, mtype: MediaType) -> Optional[dict]:
    """
    Search the TMDB website and scrape the result page directly; a record is
    returned only when the search yields exactly one link.

    :param name: title to search for
    :param mtype: media type
    :return: TMDB info on success; ``None`` when no response was received;
             an empty dict when a validation step short-circuits or an
             error occurs while processing the page
    """
    # Argument validation: a non-None result means "return it right away".
    validation_result = self._validate_web_params(name)
    if validation_result is not None:
        return validation_result

    logger.info("正在从TheMovieDb网站查询%s ..." % name)
    tmdb_url = self._build_tmdb_search_url(name)
    res = RequestUtils(timeout=5, ua=settings.NORMAL_USER_AGENT, proxies=settings.PROXY).get_res(url=tmdb_url)
    if res is None:
        logger.error("无法连接TheMovieDb")
        return None

    # Response validation: a non-None result means "return it right away".
    response_result = self._validate_response(res)
    if response_result is not None:
        return response_result

    try:
        # Extract the candidate links, then resolve them into a record.
        tmdb_links = self._extract_tmdb_links(res.text, mtype)
        return self._process_web_search_links(name, mtype, tmdb_links, self.get_info)
    except Exception as err:
        # Service name spelled consistently with the messages above.
        logger.error(f"从TheMovieDb网站查询出错{str(err)}")
        return {}
def get_info(self,
mtype: MediaType,
tmdbid: int) -> dict:
@@ -1625,7 +1438,6 @@ class TmdbApi:
"""
清除缓存
"""
self.match_web.cache_clear()
self.discover.discover_movies.cache_clear()
self.discover.discover_tv_shows.cache_clear()
self.tmdb.cache_clear()
@@ -1851,42 +1663,6 @@ class TmdbApi:
logger.error(str(e))
return None
# 公共异步方法
@cached(maxsize=settings.CONF.tmdb, ttl=settings.CONF.meta)
@rate_limit_exponential(source="match_tmdb_web", base_wait=5, max_wait=1800, enable_logging=True)
async def async_match_web(self, name: str, mtype: MediaType) -> Optional[dict]:
    """
    Search the TMDB website and scrape the result page directly; a record is
    returned only when the search yields exactly one link (async version).

    :param name: title to search for
    :param mtype: media type
    :return: TMDB info on success; ``None`` when no response was received;
             an empty dict when a validation step short-circuits or an
             error occurs while processing the page
    """
    # Argument validation: a non-None result means "return it right away".
    validation_result = self._validate_web_params(name)
    if validation_result is not None:
        return validation_result

    # Service name spelled "TheMovieDb" in every message of this method.
    logger.info("正在从TheMovieDb网站查询%s ..." % name)
    tmdb_url = self._build_tmdb_search_url(name)
    res = await AsyncRequestUtils(timeout=5, ua=settings.NORMAL_USER_AGENT, proxies=settings.PROXY).get_res(
        url=tmdb_url)
    if res is None:
        logger.error("无法连接TheMovieDb")
        return None

    # Response validation: a non-None result means "return it right away".
    response_result = self._validate_response(res)
    if response_result is not None:
        return response_result

    try:
        # Extract the candidate links, then resolve them into a record.
        tmdb_links = self._extract_tmdb_links(res.text, mtype)
        return await self._async_process_web_search_links(name, mtype, tmdb_links)
    except Exception as err:
        logger.error(f"从TheMovieDb网站查询出错{str(err)}")
        return {}
async def async_search_multiis(self, title: str) -> List[dict]:
"""
同时查询模糊匹配的电影、电视剧TMDB信息异步版本

View File

@@ -1,6 +1,6 @@
import asyncio
from unittest import TestCase
from unittest.mock import Mock
from unittest.mock import AsyncMock, Mock
from app.core.context import MediaInfo
from app.core.meta import MetaBase
@@ -62,6 +62,40 @@ class MediaRecognizeModulesTest(TestCase):
module.cache.get.assert_not_called()
module.cache.update.assert_called_once()
def test_tmdb_recognize_does_not_fallback_to_match_web(self):
    """When the TMDB API search finds nothing, the TMDB website search page must not be scraped as a fallback."""
    module = TheMovieDbModule()
    meta = MetaBase("No Match Movie")
    meta.name = "No Match Movie"
    meta.type = MediaType.MOVIE

    # Any call to match_web would raise, and is also asserted against below.
    tmdb_stub = Mock()
    tmdb_stub.match_web.side_effect = AssertionError("不应调用 TMDB 网站搜索")
    module.cache = Mock()
    module.tmdb = tmdb_stub
    # The API search itself reports no match.
    module._search_by_name = Mock(return_value=None)

    result = module.recognize_media(meta=meta, cache=False)

    self.assertIsNone(result)
    module._search_by_name.assert_called()
    module.tmdb.match_web.assert_not_called()
def test_async_tmdb_recognize_does_not_fallback_to_match_web(self):
    """When the async TMDB API search finds nothing, the TMDB website search page must not be scraped as a fallback."""
    module = TheMovieDbModule()
    meta = MetaBase("No Match Movie")
    meta.name = "No Match Movie"
    meta.type = MediaType.MOVIE

    # Any call to async_match_web would raise, and is also asserted against below.
    tmdb_stub = Mock()
    tmdb_stub.async_match_web = AsyncMock(side_effect=AssertionError("不应调用 TMDB 网站搜索"))
    module.cache = Mock()
    module.tmdb = tmdb_stub
    # The API search itself reports no match.
    module._async_search_by_name = AsyncMock(return_value=None)

    recognize_coro = module.async_recognize_media(meta=meta, cache=False)
    result = asyncio.run(recognize_coro)

    self.assertIsNone(result)
    module._async_search_by_name.assert_called()
    module.tmdb.async_match_web.assert_not_called()
def test_douban_prepare_search_names_deduplicates_simplified_name(self):
"""豆瓣候选名称应保留顺序,并去掉繁简转换后的重复项。"""
meta = MetaBase("流浪地球")