From 3ebd06a3a7727bb09bfb9c3aedccb2a2179405f8 Mon Sep 17 00:00:00 2001 From: jxxghp Date: Sat, 23 May 2026 10:43:47 +0800 Subject: [PATCH] perf: precompile media metadata regexes --- app/core/meta/infopath.py | 6 ++- app/core/meta/metabase.py | 42 +++++++++++------ app/core/meta/metavideo.py | 96 +++++++++++++++++++++++++------------- app/core/meta/words.py | 28 +++++++---- 4 files changed, 116 insertions(+), 56 deletions(-) diff --git a/app/core/meta/infopath.py b/app/core/meta/infopath.py index 843f5b45..39fda6cf 100644 --- a/app/core/meta/infopath.py +++ b/app/core/meta/infopath.py @@ -8,6 +8,8 @@ AUXILIARY_CN_STEM_FULLMATCH_RE = re.compile( r"国英|台粤|音轨|评论|国配|台配|粤语|韩语|日语|杜比|全景声|无损|中字|" r"国语|原声)+$" ) +PARENT_LATIN_TITLE_RE = re.compile(r"[A-Za-z]{2,}") +SEASON_EPISODE_CN_RE = re.compile(r"[第共]\s*[0-9一二三四五六七八九十百零]+\s*[季集话話]") def should_use_parent_title_for_file_stem( @@ -23,7 +25,7 @@ def should_use_parent_title_for_file_stem( return False if file_meta.tmdbid or file_meta.doubanid: return False - if not re.search(r"[A-Za-z]{2,}", parent_dir_name): + if not PARENT_LATIN_TITLE_RE.search(parent_dir_name): return False if not StringUtils.is_all_chinese(stem): return False @@ -31,7 +33,7 @@ def should_use_parent_title_for_file_stem( return False if not AUXILIARY_CN_STEM_FULLMATCH_RE.match(stem): return False - if re.search(r"[第共]\s*[0-9一二三四五六七八九十百零]+\s*[季集话話]", stem): + if SEASON_EPISODE_CN_RE.search(stem): return False return True diff --git a/app/core/meta/metabase.py b/app/core/meta/metabase.py index f073e4bd..71937eee 100644 --- a/app/core/meta/metabase.py +++ b/app/core/meta/metabase.py @@ -10,6 +10,26 @@ from app.schemas.types import MediaType from app.utils.string import StringUtils +TITLE_EPISODE_RE = re.compile(r"Episode\s+(\d{1,4})", re.IGNORECASE) +SUBTITLE_HAS_SEASON_EPISODE_RE = re.compile(r"[全第季集话話期幕]", re.IGNORECASE) +SUBTITLE_SEASON_RE = re.compile(r"(?8|10|12|16)[\s._-]*bits?(?![A-Za-z0-9])", + re.IGNORECASE, +) + + @dataclass class MetaBase(object): """ @@ -121,8 +141,8 @@ class MetaBase(object): if not title_text: return title_text = f" {title_text} " - if re.search(r"%s" % self._title_episodel_re, title_text, re.IGNORECASE): - episode_str = re.search(r'%s' % self._title_episodel_re, title_text, re.IGNORECASE) + episode_str = TITLE_EPISODE_RE.search(title_text) + if episode_str: if episode_str: try: episode = int(episode_str.group(1)) @@ -136,9 +156,9 @@ class MetaBase(object): self.total_episode = 1 self.type = MediaType.TV self._subtitle_flag = True - elif re.search(r'[全第季集话話期幕]', title_text, re.IGNORECASE): + elif SUBTITLE_HAS_SEASON_EPISODE_RE.search(title_text): # 全x季 x季全 - season_all_str = re.search(r"%s" % self._subtitle_season_all_re, title_text, re.IGNORECASE) + season_all_str = SUBTITLE_SEASON_ALL_RE.search(title_text) if season_all_str: season_all = season_all_str.group(1) if not season_all: @@ -155,7 +175,7 @@ class MetaBase(object): self._subtitle_flag = True return # 第x季 - season_str = re.search(r'%s' % self._subtitle_season_re, title_text, re.IGNORECASE) + season_str = SUBTITLE_SEASON_RE.search(title_text) if season_str: seasons = season_str.group(1) if seasons: @@ -190,7 +210,7 @@ class MetaBase(object): self.type = MediaType.TV self._subtitle_flag = True # 第x-x集 第x集-x集 - episode_between_str = re.search(r'%s' % self._subtitle_episode_between_re, title_text, re.IGNORECASE) + episode_between_str = SUBTITLE_EPISODE_BETWEEN_RE.search(title_text) if episode_between_str: episodes = episode_between_str.groups() if episodes: @@ -221,7 +241,7 @@ class MetaBase(object): self._subtitle_flag = True return # 第x集 - episode_str = re.search(r'%s' % self._subtitle_episode_re, title_text, re.IGNORECASE) + episode_str = SUBTITLE_EPISODE_RE.search(title_text) if episode_str: episodes = episode_str.group(1) if episodes: @@ -257,7 +277,7 @@ class MetaBase(object): self._subtitle_flag = True return # x集全/全x集 - episode_all_str = re.search(r'%s' % self._subtitle_episode_all_re, title_text, re.IGNORECASE) + episode_all_str = SUBTITLE_EPISODE_ALL_RE.search(title_text) if episode_all_str: episode_all = episode_all_str.group(1) if not episode_all: @@ -469,11 +489,7 @@ class MetaBase(object): """ if not value: return None - bit_match = re.search( - r"(?8|10|12|16)[\s._-]*bits?(?![A-Za-z0-9])", - value, - re.IGNORECASE, - ) + bit_match = VIDEO_BIT_RE.search(value) if not bit_match: return None return f"{bit_match.group('bit')}bit" diff --git a/app/core/meta/metavideo.py b/app/core/meta/metavideo.py index bc1a77c0..bda3d94e 100644 --- a/app/core/meta/metavideo.py +++ b/app/core/meta/metavideo.py @@ -13,6 +13,23 @@ from app.utils.tokens import Tokens from app.core.meta.streamingplatform import StreamingPlatforms +SEASON_FULL_RE = re.compile(r"^(?:Season\s+|S)(\d{1,3})$", re.IGNORECASE) +FIRST_BRACKET_RE = re.compile(r'^[\[【](.+?)[\]】]') +BRACKET_DOT_TITLE_RE = re.compile(r'[A-Za-z]+\..+(?:19|20)\d{2}') +BRACKET_RESOURCE_RE = re.compile( + r'(?:2160|1080|720|480)[PIpi]|4K|UHD|Blu[\-.]?ray|REMUX|WEB[\-.]?DL|HDTV', + re.IGNORECASE, +) +YEAR_RANGE_RE = re.compile(r'([\s.]+)(\d{4})-(\d{4})') +FILE_SIZE_RE = re.compile(r'[0-9.]+\s*[MGT]i?B(?![A-Z]+)', re.IGNORECASE) +DATE_RE = re.compile(r'\d{4}[\s._-]\d{1,2}[\s._-]\d{1,2}') +DIY_RE = re.compile(r'DIY', re.IGNORECASE) +DIY_TITLE_RE = re.compile(r'-DIY@', re.IGNORECASE) +DESCRIPTION_SPLIT_RE = re.compile(r'[\s/|]+') +SPACE_RE = re.compile(r'\s+') +SEASON_SUFFIX_RE = re.compile(r"SEASON$", re.IGNORECASE) + + class MetaVideo(MetaBase): """ 识别电影、电视剧 @@ -54,6 +71,21 @@ class MetaVideo(MetaBase): _video_encode_re = r"^(H26[45])$|^(x26[45])$|^AVC$|^HEVC$|^VC\d?$|^MPEG\d?$|^Xvid$|^DivX$|^AV1$|^HDR\d*$|^AVS(\+|[23])$" _audio_encode_re = r"^DTS\d?$|^DTSHD$|^DTSHDMA$|^Atmos$|^TrueHD\d?$|^AC3$|^\dAudios?$|^DDP\d?$|^DD\+\d?$|^DD\d?$|^LPCM\d?$|^AAC\d?$|^FLAC\d?$|^HD\d?$|^MA\d?$|^HR\d?$|^Opus\d?$|^Vorbis\d?$|^AV[3S]A$" _fps_re = r"(\d{2,3})(?=FPS)" + _season_pattern = re.compile(_season_re, re.IGNORECASE) + _episode_pattern = re.compile(_episode_re, re.IGNORECASE) + _part_pattern = re.compile(_part_re, re.IGNORECASE) + _roman_numerals_pattern = re.compile(_roman_numerals) + _source_pattern = re.compile(r"(%s)" % _source_re, re.IGNORECASE) + _effect_pattern = re.compile(r"(%s)" % _effect_re, re.IGNORECASE) + _resources_type_pattern = re.compile(r"(%s)" % _resources_type_re, re.IGNORECASE) + _name_no_chinese_pattern = re.compile(_name_no_chinese_re, re.IGNORECASE) + _name_movie_words_pattern = re.compile("|".join(_name_movie_words), re.IGNORECASE) + _name_nostring_pattern = re.compile(_name_nostring_re, re.IGNORECASE) + _resources_pix_pattern = re.compile(_resources_pix_re, re.IGNORECASE) + _resources_pix_pattern2 = re.compile(_resources_pix_re2, re.IGNORECASE) + _video_encode_pattern = re.compile(r"(%s)" % _video_encode_re, re.IGNORECASE) + _audio_encode_pattern = re.compile(r"(%s)" % _audio_encode_re, re.IGNORECASE) + _fps_pattern = re.compile(r"(%s)" % _fps_re, re.IGNORECASE) def __init__(self, title: str, subtitle: str = None, isfile: bool = False): """ @@ -77,7 +109,7 @@ class MetaVideo(MetaBase): self.type = MediaType.TV return # 全名为Season xx 及 Sxx 直接返回 - season_full_res = re.search(r"^(?:Season\s+|S)(\d{1,3})$", title, re.IGNORECASE) + season_full_res = SEASON_FULL_RE.search(title) if season_full_res: self.type = MediaType.TV season = season_full_res.group(1) @@ -86,22 +118,21 @@ class MetaVideo(MetaBase): self.total_season = 1 return # 去掉名称中第1个[]的内容 - _first_bracket = re.match(r'^[\[【](.+?)[\]】]', title) + _first_bracket = FIRST_BRACKET_RE.match(title) if _first_bracket: _bracket_content = _first_bracket.group(1) # 如果第一个括号内为点分隔的英文发布名格式(含年份+资源类型),保留内容去掉括号 - if re.search(r'[A-Za-z]+\..+(?:19|20)\d{2}', _bracket_content) \ - and re.search(r'(?:2160|1080|720|480)[PIpi]|4K|UHD|Blu[\-.]?ray|REMUX|WEB[\-.]?DL|HDTV', - _bracket_content, re.IGNORECASE): + if BRACKET_DOT_TITLE_RE.search(_bracket_content) \ + and BRACKET_RESOURCE_RE.search(_bracket_content): title = _bracket_content + title[_first_bracket.end():] else: title = title[_first_bracket.end():] # 把xxxx-xxxx年份换成前一个年份,常出现在季集上 - title = re.sub(r'([\s.]+)(\d{4})-(\d{4})', r'\1\2', title) + title = YEAR_RANGE_RE.sub(r'\1\2', title) # 把大小去掉 - title = re.sub(r'[0-9.]+\s*[MGT]i?B(?![A-Z]+)', "", title, flags=re.IGNORECASE) + title = FILE_SIZE_RE.sub("", title) # 把年月日去掉 - title = re.sub(r'\d{4}[\s._-]\d{1,2}[\s._-]\d{1,2}', "", title) + title = DATE_RE.sub("", title) media_exts = settings.RMT_MEDIAEXT + settings.RMT_SUBEXT + settings.RMT_AUDIOEXT # 拆分tokens tokens = Tokens(title) @@ -157,8 +188,8 @@ class MetaVideo(MetaBase): self.resource_type = self._source.strip() # 提取原盘DIY if self.resource_type and "BluRay" in self.resource_type: - if (self.subtitle and re.findall(r'D[Ii]Y', self.subtitle)) \ - or re.findall(r'-D[Ii]Y@', original_title): + if (self.subtitle and DIY_RE.search(self.subtitle)) \ + or DIY_TITLE_RE.search(original_title): self.resource_type = f"{self.resource_type} DIY" # 解析副标题,只要季和集 self.init_subtitle(self.org_string) @@ -192,7 +223,7 @@ class MetaVideo(MetaBase): """ if not description: return None - titles = re.split(r'[\s/|]+', description) + titles = DESCRIPTION_SPLIT_RE.split(description) if StringUtils.is_chinese(titles[0]): return titles[0] return None @@ -215,9 +246,8 @@ class MetaVideo(MetaBase): """ if not name: return name - name = re.sub(r'%s' % self._name_nostring_re, '', name, - flags=re.IGNORECASE).strip() - name = re.sub(r'\s+', ' ', name) + name = self._name_nostring_pattern.sub('', name).strip() + name = SPACE_RE.sub(' ', name) if name.isdecimal() \ and int(name) < 1800 \ and not self.year \ @@ -263,13 +293,13 @@ class MetaVideo(MetaBase): if not self.cn_name: self.cn_name = token elif not self._stop_cnname_flag: - if re.search("|".join(self._name_movie_words), token, flags=re.IGNORECASE) \ - or (not re.search("%s" % self._name_no_chinese_re, token, flags=re.IGNORECASE) + if self._name_movie_words_pattern.search(token) \ + or (not self._name_no_chinese_pattern.search(token) and not any(w in token for w in self._name_se_words)): self.cn_name = "%s %s" % (self.cn_name, token) self._stop_cnname_flag = True else: - is_roman_digit = re.search(self._roman_numerals, token) + is_roman_digit = self._roman_numerals_pattern.search(token) # 阿拉伯数字或者罗马数字 if token.isdigit() or is_roman_digit: # 第季集后面的不要 @@ -305,16 +335,16 @@ class MetaVideo(MetaBase): # 名字未出现前的第一个数字,记下来 if not self._unknown_name_str: self._unknown_name_str = token - elif re.search(r"%s" % self._season_re, token, re.IGNORECASE): + elif self._season_pattern.search(token): # 季的处理 - if self.en_name and re.search(r"SEASON$", self.en_name, re.IGNORECASE): + if self.en_name and SEASON_SUFFIX_RE.search(self.en_name): # 如果匹配到季,英文名结尾为Season,说明Season属于标题,不应在后续作为干扰词去除 self.en_name += ' ' self._stop_name_flag = True return - elif re.search(r"%s" % self._episode_re, token, re.IGNORECASE) \ - or re.search(r"(%s)" % self._resources_type_re, token, re.IGNORECASE) \ - or re.search(r"%s" % self._resources_pix_re, token, re.IGNORECASE): + elif self._episode_pattern.search(token) \ + or self._resources_type_pattern.search(token) \ + or self._resources_pix_pattern.search(token): # 集、来源、版本等不要 self._stop_name_flag = True return @@ -341,7 +371,7 @@ class MetaVideo(MetaBase): and not self.resource_pix \ and not self.resource_type: return - re_res = re.search(r"%s" % self._part_re, token, re.IGNORECASE) + re_res = self._part_pattern.search(token) if re_res: if not self.part: self.part = re_res.group(1) @@ -372,7 +402,7 @@ class MetaVideo(MetaBase): self.en_name = "%s %s" % (self.en_name.strip(), self.year) elif self.cn_name: self.cn_name = "%s %s" % (self.cn_name, self.year) - elif self.en_name and re.search(r"SEASON$", self.en_name, re.IGNORECASE): + elif self.en_name and SEASON_SUFFIX_RE.search(self.en_name): # 如果匹配到年,且英文名结尾为Season,说明Season属于标题,不应在后续作为干扰词去除 self.en_name += ' ' self.year = token @@ -386,7 +416,7 @@ class MetaVideo(MetaBase): """ if not self.name: return - re_res = re.findall(r"%s" % self._resources_pix_re, token, re.IGNORECASE) + re_res = self._resources_pix_pattern.findall(token) if re_res: self._last_token_type = "pix" self._continue_flag = False @@ -411,7 +441,7 @@ class MetaVideo(MetaBase): and self.resource_pix[-1] not in 'kpi': self.resource_pix = "%sp" % self.resource_pix else: - re_res = re.search(r"%s" % self._resources_pix_re2, token, re.IGNORECASE) + re_res = self._resources_pix_pattern2.search(token) if re_res: self._last_token_type = "pix" self._continue_flag = False @@ -423,7 +453,7 @@ class MetaVideo(MetaBase): """ 识别季 """ - re_res = re.findall(r"%s" % self._season_re, token, re.IGNORECASE) + re_res = self._season_pattern.findall(token) if re_res: self._last_token_type = "season" self.type = MediaType.TV @@ -475,7 +505,7 @@ class MetaVideo(MetaBase): """ 识别集 """ - re_res = re.findall(r"%s" % self._episode_re, token, re.IGNORECASE) + re_res = self._episode_pattern.findall(token) if re_res: self._last_token_type = "episode" self._continue_flag = False @@ -581,7 +611,7 @@ class MetaVideo(MetaBase): self._source = "UHD BluRay" self._continue_flag = False return - source_res = re.search(r"(%s)" % self._source_re, token, re.IGNORECASE) + source_res = self._source_pattern.search(token) if source_res: self._last_token_type = "source" self._continue_flag = False @@ -590,7 +620,7 @@ class MetaVideo(MetaBase): self._source = source_res.group(1) self._last_token = self._source.upper() return - effect_res = re.search(r"(%s)" % self._effect_re, token, re.IGNORECASE) + effect_res = self._effect_pattern.search(token) if effect_res: self._last_token_type = "effect" self._continue_flag = False @@ -663,7 +693,7 @@ class MetaVideo(MetaBase): and not self.begin_season \ and not self.begin_episode: return - re_res = re.search(r"(%s)" % self._video_encode_re, token, re.IGNORECASE) + re_res = self._video_encode_pattern.search(token) if re_res: self._continue_flag = False self._stop_name_flag = True @@ -732,7 +762,7 @@ class MetaVideo(MetaBase): and not self.begin_season \ and not self.begin_episode: return - re_res = re.search(r"(%s)" % self._audio_encode_re, token, re.IGNORECASE) + re_res = self._audio_encode_pattern.search(token) if re_res: self._continue_flag = False self._stop_name_flag = True @@ -763,7 +793,7 @@ class MetaVideo(MetaBase): if not self.name: return - re_res = re.search(rf"({self._fps_re})", token, re.IGNORECASE) + re_res = self._fps_pattern.search(token) if re_res: self._continue_flag = False self._stop_name_flag = True diff --git a/app/core/meta/words.py b/app/core/meta/words.py index fb8df04a..ffd35838 100644 --- a/app/core/meta/words.py +++ b/app/core/meta/words.py @@ -1,3 +1,4 @@ +from functools import lru_cache from typing import List, Optional, Tuple import cn2an @@ -13,6 +14,14 @@ _COMBINED_WORD_RE = re.compile(r'^\s*(.*?)\s*=>\s*(.*?)\s*&&\s*(.*?)\s*<>\s*(.*? _LEADING_ZERO_RE = re.compile(r"^0+") +@lru_cache(maxsize=1024) +def _compile_custom_word_regex(pattern: str): + """ + 编译自定义识别词正则,缓存重复识别链路中反复使用的同一规则。 + """ + return re.compile(pattern) + + class WordsMatcher(metaclass=Singleton): def __init__(self): @@ -86,7 +95,7 @@ class WordsMatcher(metaclass=Singleton): 正则替换 """ try: - replaced_re = re.compile(r'%s' % replaced) + replaced_re = _compile_custom_word_regex(r'%s' % replaced) title, count = replaced_re.subn(r'%s' % replace, title) return title, "", count > 0 except Exception as err: @@ -99,12 +108,14 @@ class WordsMatcher(metaclass=Singleton): 集数偏移 """ try: - if back and not re.findall(r'%s' % back, title): + if back and not _compile_custom_word_regex(r'%s' % back).search(title): return title, "", False - if front and not re.findall(r'%s' % front, title): + if front and not _compile_custom_word_regex(r'%s' % front).search(title): return title, "", False - offset_word_info_re = re.compile(r'(?<=%s.*?)[0-9一二三四五六七八九十]+(?=.*?%s)' % (front, back)) - episode_nums_str = re.findall(offset_word_info_re, title) + offset_word_info_re = _compile_custom_word_regex( + r'(?<=%s.*?)[0-9一二三四五六七八九十]+(?=.*?%s)' % (front, back) + ) + episode_nums_str = offset_word_info_re.findall(title) if not episode_nums_str: return title, "", False episode_nums_offset_str = [] @@ -137,9 +148,10 @@ class WordsMatcher(metaclass=Singleton): else: episode_nums_list = sorted(episode_nums_dict.items(), key=lambda x: x[1], reverse=True) for episode_num in episode_nums_list: - episode_offset_re = re.compile( - r'(?<=%s.*?)%s(?=.*?%s)' % (front, episode_num[0], back)) - title = re.sub(episode_offset_re, r'%s' % episode_num[1], title) + episode_offset_re = _compile_custom_word_regex( + r'(?<=%s.*?)%s(?=.*?%s)' % (front, episode_num[0], back) + ) + title = episode_offset_re.sub(r'%s' % episode_num[1], title) return title, "", True except Exception as err: logger.warn(f"自定义识别词集数偏移失败:{str(err)} - 标题:{title},前定位词:{front},后定位词:{back},偏移量:{offset}")