perf: precompile media metadata regexes

This commit is contained in:
jxxghp
2026-05-23 10:43:47 +08:00
parent 2eb7f57a4c
commit 3ebd06a3a7
4 changed files with 116 additions and 56 deletions

View File

@@ -8,6 +8,8 @@ AUXILIARY_CN_STEM_FULLMATCH_RE = re.compile(
r"国英|台粤|音轨|评论|国配|台配|粤语|韩语|日语|杜比|全景声|无损|中字|"
r"国语|原声)+$"
)
# Matches a run of two or more ASCII letters. Used to check whether the parent
# directory name contains a Latin-script word.
PARENT_LATIN_TITLE_RE = re.compile(r"[A-Za-z]{2,}")
# Matches a Chinese season/episode marker: "第" or "共", then a number written in
# Arabic digits or Chinese numerals, then one of 季/集/话/話 (optional whitespace between parts).
SEASON_EPISODE_CN_RE = re.compile(r"[第共]\s*[0-9一二三四五六七八九十百零]+\s*[季集话話]")
def should_use_parent_title_for_file_stem(
@@ -23,7 +25,7 @@ def should_use_parent_title_for_file_stem(
return False
if file_meta.tmdbid or file_meta.doubanid:
return False
if not re.search(r"[A-Za-z]{2,}", parent_dir_name):
if not PARENT_LATIN_TITLE_RE.search(parent_dir_name):
return False
if not StringUtils.is_all_chinese(stem):
return False
@@ -31,7 +33,7 @@ def should_use_parent_title_for_file_stem(
return False
if not AUXILIARY_CN_STEM_FULLMATCH_RE.match(stem):
return False
if re.search(r"[第共]\s*[0-9一二三四五六七八九十百零]+\s*[季集话話]", stem):
if SEASON_EPISODE_CN_RE.search(stem):
return False
return True

View File

@@ -10,6 +10,26 @@ from app.schemas.types import MediaType
from app.utils.string import StringUtils
# Precompiled patterns used when parsing season/episode information out of title text.
# NOTE(review): SUBTITLE_SEASON_RE and SUBTITLE_EPISODE_RE use a variable-width
# lookbehind ("(?<![全共]\s*)"), which the standard-library `re` module rejects;
# presumably `re` in this module is the third-party `regex` package - confirm.

# "Episode <1-4 digits>" (case-insensitive); group 1 is the episode number.
TITLE_EPISODE_RE = re.compile(r"Episode\s+(\d{1,4})", re.IGNORECASE)
# Cheap pre-check: the text contains at least one season/episode marker character.
SUBTITLE_HAS_SEASON_EPISODE_RE = re.compile(r"[全第季集话話期幕]", re.IGNORECASE)
# "第x季" - a single season or season range; excluded when preceded or followed by 全/共.
# Group 1 is the season text (digits, Chinese numerals, "S" or "-").
SUBTITLE_SEASON_RE = re.compile(r"(?<![全共]\s*)[第\s]+([0-9一二三四五六七八九十S\-]+)\s*季(?!\s*[全共])", re.IGNORECASE)
# "全x季" / "共x季" - total number of seasons; group 1 is the count.
SUBTITLE_SEASON_ALL_RE = re.compile(r"[全共]\s*([0-9一二三四五六七八九十]+)\s*季", re.IGNORECASE)
# "第x集" (also 话/話/期/幕) - a single episode; excluded when preceded or followed by 全/共.
# Group 1 is the episode text (digits, Chinese numerals, "E" or "P").
SUBTITLE_EPISODE_RE = re.compile(r"(?<![全共]\s*)[第\s]+([0-9一二三四五六七八九十百零EP]+)\s*[集话話期幕](?!\s*[全共])", re.IGNORECASE)
# "第x-x集" / "第x集-x集" - an episode range; group 1 is the start, group 2 is the end.
SUBTITLE_EPISODE_BETWEEN_RE = re.compile(
r"[第]*\s*([0-9一二三四五六七八九十百零]+)\s*[集话話期幕]?\s*-\s*第*\s*"
r"([0-9一二三四五六七八九十百零]+)\s*[集话話期幕]",
re.IGNORECASE,
)
# "x集全" or "全x集" / "共x集" - total number of episodes.
# Group 1 is set by the first alternative ("x集全"), group 2 by the second.
SUBTITLE_EPISODE_ALL_RE = re.compile(
r"([0-9一二三四五六七八九十百零]+)\s*集\s*全|[全共]\s*([0-9一二三四五六七八九十百零]+)\s*[集话話期幕]",
re.IGNORECASE,
)
# Video bit depth such as "10bit", "10-bit" or "8 bits", not adjacent to other
# alphanumerics; the named group "bit" holds the number (8, 10, 12 or 16).
VIDEO_BIT_RE = re.compile(
r"(?<![A-Za-z0-9])(?P<bit>8|10|12|16)[\s._-]*bits?(?![A-Za-z0-9])",
re.IGNORECASE,
)
@dataclass
class MetaBase(object):
"""
@@ -121,8 +141,8 @@ class MetaBase(object):
if not title_text:
return
title_text = f" {title_text} "
if re.search(r"%s" % self._title_episodel_re, title_text, re.IGNORECASE):
episode_str = re.search(r'%s' % self._title_episodel_re, title_text, re.IGNORECASE)
episode_str = TITLE_EPISODE_RE.search(title_text)
if episode_str:
if episode_str:
try:
episode = int(episode_str.group(1))
@@ -136,9 +156,9 @@ class MetaBase(object):
self.total_episode = 1
self.type = MediaType.TV
self._subtitle_flag = True
elif re.search(r'[全第季集话話期幕]', title_text, re.IGNORECASE):
elif SUBTITLE_HAS_SEASON_EPISODE_RE.search(title_text):
# 全x季 x季全
season_all_str = re.search(r"%s" % self._subtitle_season_all_re, title_text, re.IGNORECASE)
season_all_str = SUBTITLE_SEASON_ALL_RE.search(title_text)
if season_all_str:
season_all = season_all_str.group(1)
if not season_all:
@@ -155,7 +175,7 @@ class MetaBase(object):
self._subtitle_flag = True
return
# 第x季
season_str = re.search(r'%s' % self._subtitle_season_re, title_text, re.IGNORECASE)
season_str = SUBTITLE_SEASON_RE.search(title_text)
if season_str:
seasons = season_str.group(1)
if seasons:
@@ -190,7 +210,7 @@ class MetaBase(object):
self.type = MediaType.TV
self._subtitle_flag = True
# 第x-x集 第x集-x集
episode_between_str = re.search(r'%s' % self._subtitle_episode_between_re, title_text, re.IGNORECASE)
episode_between_str = SUBTITLE_EPISODE_BETWEEN_RE.search(title_text)
if episode_between_str:
episodes = episode_between_str.groups()
if episodes:
@@ -221,7 +241,7 @@ class MetaBase(object):
self._subtitle_flag = True
return
# 第x集
episode_str = re.search(r'%s' % self._subtitle_episode_re, title_text, re.IGNORECASE)
episode_str = SUBTITLE_EPISODE_RE.search(title_text)
if episode_str:
episodes = episode_str.group(1)
if episodes:
@@ -257,7 +277,7 @@ class MetaBase(object):
self._subtitle_flag = True
return
# x集全/全x集
episode_all_str = re.search(r'%s' % self._subtitle_episode_all_re, title_text, re.IGNORECASE)
episode_all_str = SUBTITLE_EPISODE_ALL_RE.search(title_text)
if episode_all_str:
episode_all = episode_all_str.group(1)
if not episode_all:
@@ -469,11 +489,7 @@ class MetaBase(object):
"""
if not value:
return None
bit_match = re.search(
r"(?<![A-Za-z0-9])(?P<bit>8|10|12|16)[\s._-]*bits?(?![A-Za-z0-9])",
value,
re.IGNORECASE,
)
bit_match = VIDEO_BIT_RE.search(value)
if not bit_match:
return None
return f"{bit_match.group('bit')}bit"

View File

@@ -13,6 +13,23 @@ from app.utils.tokens import Tokens
from app.core.meta.streamingplatform import StreamingPlatforms
# Precompiled patterns shared by the video title parser. Each one mirrors the
# inline pattern (and flags) it replaces, so matching behaviour is unchanged.

# Whole title is just "Season xx" or "Sxx"; group 1 is the season number.
SEASON_FULL_RE = re.compile(r"^(?:Season\s+|S)(\d{1,3})$", re.IGNORECASE)
# First leading bracket group, "[...]" or "【...】"; group 1 is the content.
FIRST_BRACKET_RE = re.compile(r'^[\[【](.+?)[\]】]')
# Dot-separated English release name that contains a 19xx/20xx year.
BRACKET_DOT_TITLE_RE = re.compile(r'[A-Za-z]+\..+(?:19|20)\d{2}')
# Resolution or source keyword that marks a release name.
BRACKET_RESOURCE_RE = re.compile(
    r'(?:2160|1080|720|480)[PIpi]|4K|UHD|Blu[\-.]?ray|REMUX|WEB[\-.]?DL|HDTV',
    re.IGNORECASE,
)
# Year range "xxxx-xxxx" preceded by whitespace/dots; groups: separator, first year, second year.
YEAR_RANGE_RE = re.compile(r'([\s.]+)(\d{4})-(\d{4})')
# File size such as "1.5GB" or "700 MiB", not followed by further letters.
FILE_SIZE_RE = re.compile(r'[0-9.]+\s*[MGT]i?B(?![A-Z]+)', re.IGNORECASE)
# Date written as year, month, day separated by whitespace, ".", "_" or "-".
DATE_RE = re.compile(r'\d{4}[\s._-]\d{1,2}[\s._-]\d{1,2}')
# DIY disc markers. These stay case-sensitive apart from the middle letter
# ("DIY" / "DiY"), exactly like the inline patterns they replace; adding
# re.IGNORECASE would also accept "diy", "Diy", ... and change which releases
# get tagged as DIY.
DIY_RE = re.compile(r'D[Ii]Y')
DIY_TITLE_RE = re.compile(r'-D[Ii]Y@')
# Separators used to split a description into candidate titles.
DESCRIPTION_SPLIT_RE = re.compile(r'[\s/|]+')
# Any run of whitespace (collapsed to a single space by callers).
SPACE_RE = re.compile(r'\s+')
# English name that ends with the word "Season".
SEASON_SUFFIX_RE = re.compile(r"SEASON$", re.IGNORECASE)
class MetaVideo(MetaBase):
"""
识别电影、电视剧
@@ -54,6 +71,21 @@ class MetaVideo(MetaBase):
_video_encode_re = r"^(H26[45])$|^(x26[45])$|^AVC$|^HEVC$|^VC\d?$|^MPEG\d?$|^Xvid$|^DivX$|^AV1$|^HDR\d*$|^AVS(\+|[23])$"
_audio_encode_re = r"^DTS\d?$|^DTSHD$|^DTSHDMA$|^Atmos$|^TrueHD\d?$|^AC3$|^\dAudios?$|^DDP\d?$|^DD\+\d?$|^DD\d?$|^LPCM\d?$|^AAC\d?$|^FLAC\d?$|^HD\d?$|^MA\d?$|^HR\d?$|^Opus\d?$|^Vorbis\d?$|^AV[3S]A$"
_fps_re = r"(\d{2,3})(?=FPS)"
# Class-level compiled forms of the pattern strings defined above in this class
# body, built once when the class is created instead of on every token.
# NOTE(review): these are compiled from the class-body values at definition time,
# so a subclass or instance that overrides one of the `_xxx_re` strings will not
# affect the corresponding compiled pattern - confirm nothing relies on overriding.
_season_pattern = re.compile(_season_re, re.IGNORECASE)
_episode_pattern = re.compile(_episode_re, re.IGNORECASE)
_part_pattern = re.compile(_part_re, re.IGNORECASE)
# Compiled without re.IGNORECASE, unlike the other patterns in this group.
_roman_numerals_pattern = re.compile(_roman_numerals)
# The patterns below wrap the source string in one outer capturing group, so
# group(1) is the whole matched text.
_source_pattern = re.compile(r"(%s)" % _source_re, re.IGNORECASE)
_effect_pattern = re.compile(r"(%s)" % _effect_re, re.IGNORECASE)
_resources_type_pattern = re.compile(r"(%s)" % _resources_type_re, re.IGNORECASE)
_name_no_chinese_pattern = re.compile(_name_no_chinese_re, re.IGNORECASE)
# Alternation of every entry in _name_movie_words.
_name_movie_words_pattern = re.compile("|".join(_name_movie_words), re.IGNORECASE)
_name_nostring_pattern = re.compile(_name_nostring_re, re.IGNORECASE)
_resources_pix_pattern = re.compile(_resources_pix_re, re.IGNORECASE)
_resources_pix_pattern2 = re.compile(_resources_pix_re2, re.IGNORECASE)
_video_encode_pattern = re.compile(r"(%s)" % _video_encode_re, re.IGNORECASE)
_audio_encode_pattern = re.compile(r"(%s)" % _audio_encode_re, re.IGNORECASE)
_fps_pattern = re.compile(r"(%s)" % _fps_re, re.IGNORECASE)
def __init__(self, title: str, subtitle: str = None, isfile: bool = False):
"""
@@ -77,7 +109,7 @@ class MetaVideo(MetaBase):
self.type = MediaType.TV
return
# 全名为Season xx 及 Sxx 直接返回
season_full_res = re.search(r"^(?:Season\s+|S)(\d{1,3})$", title, re.IGNORECASE)
season_full_res = SEASON_FULL_RE.search(title)
if season_full_res:
self.type = MediaType.TV
season = season_full_res.group(1)
@@ -86,22 +118,21 @@ class MetaVideo(MetaBase):
self.total_season = 1
return
# 去掉名称中第1个[]的内容
_first_bracket = re.match(r'^[\[【](.+?)[\]】]', title)
_first_bracket = FIRST_BRACKET_RE.match(title)
if _first_bracket:
_bracket_content = _first_bracket.group(1)
# 如果第一个括号内为点分隔的英文发布名格式(含年份+资源类型),保留内容去掉括号
if re.search(r'[A-Za-z]+\..+(?:19|20)\d{2}', _bracket_content) \
and re.search(r'(?:2160|1080|720|480)[PIpi]|4K|UHD|Blu[\-.]?ray|REMUX|WEB[\-.]?DL|HDTV',
_bracket_content, re.IGNORECASE):
if BRACKET_DOT_TITLE_RE.search(_bracket_content) \
and BRACKET_RESOURCE_RE.search(_bracket_content):
title = _bracket_content + title[_first_bracket.end():]
else:
title = title[_first_bracket.end():]
# 把xxxx-xxxx年份换成前一个年份常出现在季集上
title = re.sub(r'([\s.]+)(\d{4})-(\d{4})', r'\1\2', title)
title = YEAR_RANGE_RE.sub(r'\1\2', title)
# 把大小去掉
title = re.sub(r'[0-9.]+\s*[MGT]i?B(?![A-Z]+)', "", title, flags=re.IGNORECASE)
title = FILE_SIZE_RE.sub("", title)
# 把年月日去掉
title = re.sub(r'\d{4}[\s._-]\d{1,2}[\s._-]\d{1,2}', "", title)
title = DATE_RE.sub("", title)
media_exts = settings.RMT_MEDIAEXT + settings.RMT_SUBEXT + settings.RMT_AUDIOEXT
# 拆分tokens
tokens = Tokens(title)
@@ -157,8 +188,8 @@ class MetaVideo(MetaBase):
self.resource_type = self._source.strip()
# 提取原盘DIY
if self.resource_type and "BluRay" in self.resource_type:
if (self.subtitle and re.findall(r'D[Ii]Y', self.subtitle)) \
or re.findall(r'-D[Ii]Y@', original_title):
if (self.subtitle and DIY_RE.search(self.subtitle)) \
or DIY_TITLE_RE.search(original_title):
self.resource_type = f"{self.resource_type} DIY"
# 解析副标题,只要季和集
self.init_subtitle(self.org_string)
@@ -192,7 +223,7 @@ class MetaVideo(MetaBase):
"""
if not description:
return None
titles = re.split(r'[\s/|]+', description)
titles = DESCRIPTION_SPLIT_RE.split(description)
if StringUtils.is_chinese(titles[0]):
return titles[0]
return None
@@ -215,9 +246,8 @@ class MetaVideo(MetaBase):
"""
if not name:
return name
name = re.sub(r'%s' % self._name_nostring_re, '', name,
flags=re.IGNORECASE).strip()
name = re.sub(r'\s+', ' ', name)
name = self._name_nostring_pattern.sub('', name).strip()
name = SPACE_RE.sub(' ', name)
if name.isdecimal() \
and int(name) < 1800 \
and not self.year \
@@ -263,13 +293,13 @@ class MetaVideo(MetaBase):
if not self.cn_name:
self.cn_name = token
elif not self._stop_cnname_flag:
if re.search("|".join(self._name_movie_words), token, flags=re.IGNORECASE) \
or (not re.search("%s" % self._name_no_chinese_re, token, flags=re.IGNORECASE)
if self._name_movie_words_pattern.search(token) \
or (not self._name_no_chinese_pattern.search(token)
and not any(w in token for w in self._name_se_words)):
self.cn_name = "%s %s" % (self.cn_name, token)
self._stop_cnname_flag = True
else:
is_roman_digit = re.search(self._roman_numerals, token)
is_roman_digit = self._roman_numerals_pattern.search(token)
# 阿拉伯数字或者罗马数字
if token.isdigit() or is_roman_digit:
# 第季集后面的不要
@@ -305,16 +335,16 @@ class MetaVideo(MetaBase):
# 名字未出现前的第一个数字,记下来
if not self._unknown_name_str:
self._unknown_name_str = token
elif re.search(r"%s" % self._season_re, token, re.IGNORECASE):
elif self._season_pattern.search(token):
# 季的处理
if self.en_name and re.search(r"SEASON$", self.en_name, re.IGNORECASE):
if self.en_name and SEASON_SUFFIX_RE.search(self.en_name):
# 如果匹配到季英文名结尾为Season说明Season属于标题不应在后续作为干扰词去除
self.en_name += ' '
self._stop_name_flag = True
return
elif re.search(r"%s" % self._episode_re, token, re.IGNORECASE) \
or re.search(r"(%s)" % self._resources_type_re, token, re.IGNORECASE) \
or re.search(r"%s" % self._resources_pix_re, token, re.IGNORECASE):
elif self._episode_pattern.search(token) \
or self._resources_type_pattern.search(token) \
or self._resources_pix_pattern.search(token):
# 集、来源、版本等不要
self._stop_name_flag = True
return
@@ -341,7 +371,7 @@ class MetaVideo(MetaBase):
and not self.resource_pix \
and not self.resource_type:
return
re_res = re.search(r"%s" % self._part_re, token, re.IGNORECASE)
re_res = self._part_pattern.search(token)
if re_res:
if not self.part:
self.part = re_res.group(1)
@@ -372,7 +402,7 @@ class MetaVideo(MetaBase):
self.en_name = "%s %s" % (self.en_name.strip(), self.year)
elif self.cn_name:
self.cn_name = "%s %s" % (self.cn_name, self.year)
elif self.en_name and re.search(r"SEASON$", self.en_name, re.IGNORECASE):
elif self.en_name and SEASON_SUFFIX_RE.search(self.en_name):
# 如果匹配到年且英文名结尾为Season说明Season属于标题不应在后续作为干扰词去除
self.en_name += ' '
self.year = token
@@ -386,7 +416,7 @@ class MetaVideo(MetaBase):
"""
if not self.name:
return
re_res = re.findall(r"%s" % self._resources_pix_re, token, re.IGNORECASE)
re_res = self._resources_pix_pattern.findall(token)
if re_res:
self._last_token_type = "pix"
self._continue_flag = False
@@ -411,7 +441,7 @@ class MetaVideo(MetaBase):
and self.resource_pix[-1] not in 'kpi':
self.resource_pix = "%sp" % self.resource_pix
else:
re_res = re.search(r"%s" % self._resources_pix_re2, token, re.IGNORECASE)
re_res = self._resources_pix_pattern2.search(token)
if re_res:
self._last_token_type = "pix"
self._continue_flag = False
@@ -423,7 +453,7 @@ class MetaVideo(MetaBase):
"""
识别季
"""
re_res = re.findall(r"%s" % self._season_re, token, re.IGNORECASE)
re_res = self._season_pattern.findall(token)
if re_res:
self._last_token_type = "season"
self.type = MediaType.TV
@@ -475,7 +505,7 @@ class MetaVideo(MetaBase):
"""
识别集
"""
re_res = re.findall(r"%s" % self._episode_re, token, re.IGNORECASE)
re_res = self._episode_pattern.findall(token)
if re_res:
self._last_token_type = "episode"
self._continue_flag = False
@@ -581,7 +611,7 @@ class MetaVideo(MetaBase):
self._source = "UHD BluRay"
self._continue_flag = False
return
source_res = re.search(r"(%s)" % self._source_re, token, re.IGNORECASE)
source_res = self._source_pattern.search(token)
if source_res:
self._last_token_type = "source"
self._continue_flag = False
@@ -590,7 +620,7 @@ class MetaVideo(MetaBase):
self._source = source_res.group(1)
self._last_token = self._source.upper()
return
effect_res = re.search(r"(%s)" % self._effect_re, token, re.IGNORECASE)
effect_res = self._effect_pattern.search(token)
if effect_res:
self._last_token_type = "effect"
self._continue_flag = False
@@ -663,7 +693,7 @@ class MetaVideo(MetaBase):
and not self.begin_season \
and not self.begin_episode:
return
re_res = re.search(r"(%s)" % self._video_encode_re, token, re.IGNORECASE)
re_res = self._video_encode_pattern.search(token)
if re_res:
self._continue_flag = False
self._stop_name_flag = True
@@ -732,7 +762,7 @@ class MetaVideo(MetaBase):
and not self.begin_season \
and not self.begin_episode:
return
re_res = re.search(r"(%s)" % self._audio_encode_re, token, re.IGNORECASE)
re_res = self._audio_encode_pattern.search(token)
if re_res:
self._continue_flag = False
self._stop_name_flag = True
@@ -763,7 +793,7 @@ class MetaVideo(MetaBase):
if not self.name:
return
re_res = re.search(rf"({self._fps_re})", token, re.IGNORECASE)
re_res = self._fps_pattern.search(token)
if re_res:
self._continue_flag = False
self._stop_name_flag = True

View File

@@ -1,3 +1,4 @@
from functools import lru_cache
from typing import List, Optional, Tuple
import cn2an
@@ -13,6 +14,14 @@ _COMBINED_WORD_RE = re.compile(r'^\s*(.*?)\s*=>\s*(.*?)\s*&&\s*(.*?)\s*<>\s*(.*?
_LEADING_ZERO_RE = re.compile(r"^0+")
@lru_cache(maxsize=1024)
def _compile_custom_word_regex(pattern: str):
    """
    Compile a custom recognition-word regex and cache the result, so that a rule
    reused repeatedly along the recognition path is compiled only once.

    The cache is keyed on the pattern text and holds at most 1024 entries.

    :param pattern: regex source text
    :return: the compiled pattern returned by `re.compile(pattern)`
    """
    return re.compile(pattern)
class WordsMatcher(metaclass=Singleton):
def __init__(self):
@@ -86,7 +95,7 @@ class WordsMatcher(metaclass=Singleton):
正则替换
"""
try:
replaced_re = re.compile(r'%s' % replaced)
replaced_re = _compile_custom_word_regex(r'%s' % replaced)
title, count = replaced_re.subn(r'%s' % replace, title)
return title, "", count > 0
except Exception as err:
@@ -99,12 +108,14 @@ class WordsMatcher(metaclass=Singleton):
集数偏移
"""
try:
if back and not re.findall(r'%s' % back, title):
if back and not _compile_custom_word_regex(r'%s' % back).search(title):
return title, "", False
if front and not re.findall(r'%s' % front, title):
if front and not _compile_custom_word_regex(r'%s' % front).search(title):
return title, "", False
offset_word_info_re = re.compile(r'(?<=%s.*?)[0-9一二三四五六七八九十]+(?=.*?%s)' % (front, back))
episode_nums_str = re.findall(offset_word_info_re, title)
offset_word_info_re = _compile_custom_word_regex(
r'(?<=%s.*?)[0-9一二三四五六七八九十]+(?=.*?%s)' % (front, back)
)
episode_nums_str = offset_word_info_re.findall(title)
if not episode_nums_str:
return title, "", False
episode_nums_offset_str = []
@@ -137,9 +148,10 @@ class WordsMatcher(metaclass=Singleton):
else:
episode_nums_list = sorted(episode_nums_dict.items(), key=lambda x: x[1], reverse=True)
for episode_num in episode_nums_list:
episode_offset_re = re.compile(
r'(?<=%s.*?)%s(?=.*?%s)' % (front, episode_num[0], back))
title = re.sub(episode_offset_re, r'%s' % episode_num[1], title)
episode_offset_re = _compile_custom_word_regex(
r'(?<=%s.*?)%s(?=.*?%s)' % (front, episode_num[0], back)
)
title = episode_offset_re.sub(r'%s' % episode_num[1], title)
return title, "", True
except Exception as err:
logger.warn(f"自定义识别词集数偏移失败:{str(err)} - 标题:{title},前定位词:{front},后定位词:{back},偏移量:{offset}")