fix: scheduled data cleanup chain

jxxghp
2026-05-09 18:30:55 +08:00
parent a7823fb4d1
commit ac11b303b3
3 changed files with 174 additions and 179 deletions

View File

@@ -1,160 +0,0 @@
import json
from datetime import datetime, timedelta
from typing import Callable, Optional, Dict, Any
from sqlalchemy.orm import Session
from app.db import SessionFactory
from app.db.models.downloadhistory import DownloadHistory, DownloadFiles
from app.db.models.message import Message
from app.db.models.siteuserdata import SiteUserData
from app.db.models.transferhistory import TransferHistory
from app.log import logger
class DataCleanupChain:
    """
    System data cleanup chain.
    """

    DEFAULT_BATCH_SIZE = 500
    MESSAGE_RETENTION_DAYS = 90
    DOWNLOAD_HISTORY_RETENTION_DAYS = 180
    SITE_USERDATA_RETENTION_DAYS = 180
    TRANSFER_HISTORY_RETENTION_DAYS = 365 * 3

    def cleanup(self, batch_size: Optional[int] = None) -> Dict[str, Any]:
        """
        Perform batched cleanup according to the preset retention periods.
        """
        started_at = datetime.now()
        batch_size = batch_size or self.DEFAULT_BATCH_SIZE
        if batch_size <= 0:
            batch_size = self.DEFAULT_BATCH_SIZE
        message_cutoff = (
            started_at - timedelta(days=self.MESSAGE_RETENTION_DAYS)
        ).strftime("%Y-%m-%d %H:%M:%S")
        download_history_cutoff = (
            started_at - timedelta(days=self.DOWNLOAD_HISTORY_RETENTION_DAYS)
        ).strftime("%Y-%m-%d %H:%M:%S")
        site_userdata_cutoff = (
            started_at - timedelta(days=self.SITE_USERDATA_RETENTION_DAYS)
        ).strftime("%Y-%m-%d")
        transfer_history_cutoff = (
            started_at - timedelta(days=self.TRANSFER_HISTORY_RETENTION_DAYS)
        ).strftime("%Y-%m-%d %H:%M:%S")
        report: Dict[str, Any] = {
            "started_at": started_at.strftime("%Y-%m-%d %H:%M:%S"),
            "batch_size": batch_size,
            "tables": {},
            "total_deleted": 0,
        }
        errors = []
        plans = [
            {
                "name": "message",
                "cutoff": message_cutoff,
                "handler": lambda db: Message.delete_before(
                    db=db,
                    before_time=message_cutoff,
                    limit=batch_size,
                ),
            },
            {
                "name": "downloadhistory",
                "cutoff": download_history_cutoff,
                "handler": lambda db: DownloadHistory.delete_before(
                    db=db,
                    before_time=download_history_cutoff,
                    limit=batch_size,
                ),
            },
            {
                "name": "downloadfiles",
                "cutoff": "follow-parent-history",
                "handler": lambda db: DownloadFiles.delete_orphans(
                    db=db,
                    limit=batch_size,
                ),
            },
            {
                "name": "siteuserdata",
                "cutoff": site_userdata_cutoff,
                "handler": lambda db: SiteUserData.delete_before(
                    db=db,
                    before_day=site_userdata_cutoff,
                    limit=batch_size,
                ),
            },
            {
                "name": "transferhistory",
                "cutoff": transfer_history_cutoff,
                "handler": lambda db: TransferHistory.delete_before(
                    db=db,
                    before_time=transfer_history_cutoff,
                    limit=batch_size,
                ),
            },
        ]
        with SessionFactory() as db:
            for plan in plans:
                name = plan["name"]
                try:
                    table_report = self._cleanup_in_batches(
                        db=db,
                        table_name=name,
                        delete_batch=plan["handler"],
                    )
                    table_report["cutoff"] = plan["cutoff"]
                    report["tables"][name] = table_report
                    report["total_deleted"] += table_report["deleted"]
                except Exception as err:
                    errors.append(f"{name}: {str(err)}")
                    logger.error(f"数据表 {name} 清理失败:{str(err)}")
                    report["tables"][name] = {
                        "deleted": 0,
                        "batches": 0,
                        "cutoff": plan["cutoff"],
                        "error": str(err),
                    }
        if errors:
            report["errors"] = errors
            logger.error(
                f"数据表清理部分失败:{json.dumps(report, ensure_ascii=False)}"
            )
            raise RuntimeError("".join(errors))
        logger.info(f"数据表清理完成:{json.dumps(report, ensure_ascii=False)}")
        return report

    @staticmethod
    def _cleanup_in_batches(
        db: Session,
        table_name: str,
        delete_batch: Callable[[Session], int],
    ) -> Dict[str, int]:
        """
        Repeatedly run batched deletes for a single table until no deletable rows remain.
        """
        total_deleted = 0
        batches = 0
        while True:
            deleted = delete_batch(db) or 0
            if deleted <= 0:
                break
            batches += 1
            total_deleted += deleted
            logger.info(
                f"数据表 {table_name} 清理第 {batches} 批完成,删除 {deleted} 条记录"
            )
        return {
            "deleted": total_deleted,
            "batches": batches,
        }
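Note: the chain relies on each model exposing a batch-delete classmethod that removes at most `limit` matching rows and returns the number deleted (`delete_before(db, before_time/before_day, limit)`, or `delete_orphans(db, limit)` for DownloadFiles), so `_cleanup_in_batches` can loop until the count reaches zero. Those methods are not part of this diff; below is a minimal, self-contained sketch of that contract. The demo table, column name, and SQLAlchemy base are assumptions for illustration only, not the project's actual models.

from sqlalchemy import Column, Integer, String, create_engine, delete, select
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()


class DemoMessage(Base):
    # Stand-in for a cleanable table; the real models and columns are not shown in this diff
    __tablename__ = "demo_message"
    id = Column(Integer, primary_key=True)
    reg_time = Column(String)  # "YYYY-MM-DD HH:MM:SS", so lexical comparison matches time order

    @classmethod
    def delete_before(cls, db: Session, before_time: str, limit: int = 500) -> int:
        """Delete at most `limit` rows older than `before_time` and return how many were removed."""
        ids = db.scalars(
            select(cls.id).where(cls.reg_time < before_time).limit(limit)
        ).all()
        if not ids:
            return 0
        db.execute(delete(cls).where(cls.id.in_(ids)))
        db.commit()
        return len(ids)


if __name__ == "__main__":
    engine = create_engine("sqlite:///:memory:")
    Base.metadata.create_all(engine)
    with Session(engine) as db:
        db.add_all([DemoMessage(reg_time=f"2020-01-0{i} 00:00:00") for i in range(1, 4)])
        db.commit()
        # Two of the three rows fall before the cutoff, so this prints 2
        print(DemoMessage.delete_before(db, before_time="2020-01-03 00:00:00", limit=5))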

View File

@@ -1,21 +1,23 @@
import asyncio
import gc
import inspect
import json
import multiprocessing
import threading
import traceback
from datetime import datetime, timedelta
from typing import List, Optional
from typing import Callable, Optional, Dict, Any
from typing import List
import pytz
from apscheduler.executors.pool import ThreadPoolExecutor
from apscheduler.jobstores.base import JobLookupError
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.cron import CronTrigger
from sqlalchemy.orm import Session
from app import schemas
from app.chain import ChainBase
from app.chain.data_cleanup import DataCleanupChain
from app.chain.mediaserver import MediaServerChain
from app.chain.recommend import RecommendChain
from app.chain.site import SiteChain
@@ -25,10 +27,15 @@ from app.chain.workflow import WorkflowChain
from app.core.config import settings, global_vars
from app.core.event import eventmanager
from app.core.plugin import PluginManager
from app.db import SessionFactory
from app.db.models.downloadhistory import DownloadHistory, DownloadFiles
from app.db.models.message import Message
from app.db.models.siteuserdata import SiteUserData
from app.db.models.transferhistory import TransferHistory
from app.db.systemconfig_oper import SystemConfigOper
from app.helper.image import WallpaperHelper
from app.helper.message import MessageHelper
from app.helper.sites import SitesHelper # noqa
from app.log import logger
from app.schemas import Notification, NotificationType, Workflow
from app.schemas.types import EventType, SystemConfigKey
@@ -41,7 +48,155 @@ lock = threading.Lock()
class SchedulerChain(ChainBase):
    pass
    """
    Scheduled task chain: runs the various scheduled jobs, including data cleanup.
    """

    # Records handled per batch, to avoid performance problems from deleting too much data at once
    DEFAULT_BATCH_SIZE = 500
    # Message retention period, in days
    MESSAGE_RETENTION_DAYS = 90
    # Download history retention period, in days
    DOWNLOAD_HISTORY_RETENTION_DAYS = 180
    # Site user data retention period, in days
    SITE_USERDATA_RETENTION_DAYS = 180
    # Download transfer history retention period, in days
    TRANSFER_HISTORY_RETENTION_DAYS = 365 * 3

    def cleanup(self, batch_size: Optional[int] = None) -> Dict[str, Any]:
        """
        Perform batched cleanup according to the preset retention periods.
        """
        started_at = datetime.now()
        batch_size = batch_size or self.DEFAULT_BATCH_SIZE
        if batch_size <= 0:
            batch_size = self.DEFAULT_BATCH_SIZE
        message_cutoff = (
            started_at - timedelta(days=self.MESSAGE_RETENTION_DAYS)
        ).strftime("%Y-%m-%d %H:%M:%S")
        download_history_cutoff = (
            started_at - timedelta(days=self.DOWNLOAD_HISTORY_RETENTION_DAYS)
        ).strftime("%Y-%m-%d %H:%M:%S")
        site_userdata_cutoff = (
            started_at - timedelta(days=self.SITE_USERDATA_RETENTION_DAYS)
        ).strftime("%Y-%m-%d")
        transfer_history_cutoff = (
            started_at - timedelta(days=self.TRANSFER_HISTORY_RETENTION_DAYS)
        ).strftime("%Y-%m-%d %H:%M:%S")
        report: Dict[str, Any] = {
            "started_at": started_at.strftime("%Y-%m-%d %H:%M:%S"),
            "batch_size": batch_size,
            "tables": {},
            "total_deleted": 0,
        }
        errors = []
        plans = [
            {
                "name": "message",
                "cutoff": message_cutoff,
                "handler": lambda db: Message.delete_before(
                    db=db,
                    before_time=message_cutoff,
                    limit=batch_size,
                ),
            },
            {
                "name": "downloadhistory",
                "cutoff": download_history_cutoff,
                "handler": lambda db: DownloadHistory.delete_before(
                    db=db,
                    before_time=download_history_cutoff,
                    limit=batch_size,
                ),
            },
            {
                "name": "downloadfiles",
                "cutoff": "follow-parent-history",
                "handler": lambda db: DownloadFiles.delete_orphans(
                    db=db,
                    limit=batch_size,
                ),
            },
            {
                "name": "siteuserdata",
                "cutoff": site_userdata_cutoff,
                "handler": lambda db: SiteUserData.delete_before(
                    db=db,
                    before_day=site_userdata_cutoff,
                    limit=batch_size,
                ),
            },
            {
                "name": "transferhistory",
                "cutoff": transfer_history_cutoff,
                "handler": lambda db: TransferHistory.delete_before(
                    db=db,
                    before_time=transfer_history_cutoff,
                    limit=batch_size,
                ),
            },
        ]
        with SessionFactory() as db:
            for plan in plans:
                name = plan["name"]
                try:
                    table_report = self._cleanup_in_batches(
                        db=db,
                        table_name=name,
                        delete_batch=plan["handler"],
                    )
                    table_report["cutoff"] = plan["cutoff"]
                    report["tables"][name] = table_report
                    report["total_deleted"] += table_report["deleted"]
                except Exception as err:
                    errors.append(f"{name}: {str(err)}")
                    logger.error(f"数据表 {name} 清理失败:{str(err)}")
                    report["tables"][name] = {
                        "deleted": 0,
                        "batches": 0,
                        "cutoff": plan["cutoff"],
                        "error": str(err),
                    }
        if errors:
            report["errors"] = errors
            logger.error(
                f"数据表清理部分失败:{json.dumps(report, ensure_ascii=False)}"
            )
            raise RuntimeError("".join(errors))
        logger.info(f"数据表清理完成:{json.dumps(report, ensure_ascii=False)}")
        return report

    @staticmethod
    def _cleanup_in_batches(
        db: Session,
        table_name: str,
        delete_batch: Callable[[Session], int],
    ) -> Dict[str, int]:
        """
        Repeatedly run batched deletes for a single table until no deletable rows remain.
        """
        total_deleted = 0
        batches = 0
        while True:
            deleted = delete_batch(db) or 0
            if deleted <= 0:
                break
            batches += 1
            total_deleted += deleted
            logger.info(
                f"数据表 {table_name} 清理第 {batches} 批完成,删除 {deleted} 条记录"
            )
        return {
            "deleted": total_deleted,
            "batches": batches,
        }
class Scheduler(ConfigReloadMixin, metaclass=SingletonClass):
@@ -148,7 +303,7 @@ class Scheduler(ConfigReloadMixin, metaclass=SingletonClass):
},
"data_cleanup": {
"name": "数据表清理",
"func": DataCleanupChain().cleanup,
"func": SchedulerChain().cleanup,
"running": False,
},
"user_auth": {
@@ -207,8 +362,8 @@ class Scheduler(ConfigReloadMixin, metaclass=SingletonClass):
# Scheduled CookieCloud sync
if (
settings.COOKIECLOUD_INTERVAL
and str(settings.COOKIECLOUD_INTERVAL).isdigit()
):
self._scheduler.add_job(
self.start,
@@ -217,14 +372,14 @@ class Scheduler(ConfigReloadMixin, metaclass=SingletonClass):
name="同步CookieCloud站点",
minutes=int(settings.COOKIECLOUD_INTERVAL),
next_run_time=datetime.now(pytz.timezone(settings.TZ))
+ timedelta(minutes=5),
kwargs={"job_id": "cookiecloud"},
)
# Media server sync
if (
settings.MEDIASERVER_SYNC_INTERVAL
and str(settings.MEDIASERVER_SYNC_INTERVAL).isdigit()
):
self._scheduler.add_job(
self.start,
@@ -233,7 +388,7 @@ class Scheduler(ConfigReloadMixin, metaclass=SingletonClass):
name="同步媒体服务器",
hours=int(settings.MEDIASERVER_SYNC_INTERVAL),
next_run_time=datetime.now(pytz.timezone(settings.TZ))
+ timedelta(minutes=10),
kwargs={"job_id": "mediaserver_sync"},
)
@@ -284,8 +439,8 @@ class Scheduler(ConfigReloadMixin, metaclass=SingletonClass):
else:
# RSS subscription mode
if (
not settings.SUBSCRIBE_RSS_INTERVAL
or not str(settings.SUBSCRIBE_RSS_INTERVAL).isdigit()
):
settings.SUBSCRIBE_RSS_INTERVAL = 30
elif int(settings.SUBSCRIBE_RSS_INTERVAL) < 5:
@@ -327,7 +482,7 @@ class Scheduler(ConfigReloadMixin, metaclass=SingletonClass):
name="壁纸缓存",
minutes=30,
next_run_time=datetime.now(pytz.timezone(settings.TZ))
+ timedelta(seconds=1),
kwargs={"job_id": "random_wallpager"},
)
@@ -391,7 +546,7 @@ class Scheduler(ConfigReloadMixin, metaclass=SingletonClass):
name="推荐缓存",
hours=24,
next_run_time=datetime.now(pytz.timezone(settings.TZ))
+ timedelta(seconds=5),
kwargs={"job_id": "recommend_refresh"},
)
@@ -413,7 +568,7 @@ class Scheduler(ConfigReloadMixin, metaclass=SingletonClass):
name="订阅日历缓存",
hours=6,
next_run_time=datetime.now(pytz.timezone(settings.TZ))
+ timedelta(minutes=2),
kwargs={"job_id": "subscribe_calendar_cache"},
)
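Note: the jobs above all follow the same APScheduler pattern: an interval trigger plus a staggered next_run_time so the first runs are spread out after startup instead of all firing at once. A small stand-alone sketch of that pattern (job id, name, and interval here are illustrative, not the project's configuration):

from datetime import datetime, timedelta

import pytz
from apscheduler.schedulers.background import BackgroundScheduler

tz = pytz.timezone("Asia/Shanghai")
scheduler = BackgroundScheduler(timezone=tz)


def cleanup_job():
    # Placeholder for SchedulerChain().cleanup or any other periodic task
    print("running data cleanup ...")


scheduler.add_job(
    cleanup_job,
    "interval",
    id="data_cleanup_demo",
    name="data cleanup (demo)",
    hours=24,
    # Stagger the first execution so it does not collide with other startup jobs
    next_run_time=datetime.now(tz) + timedelta(minutes=5),
)
scheduler.start()

Passing next_run_time only overrides when the first execution happens; after that the interval trigger takes over.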

View File

@@ -7,12 +7,12 @@ from unittest.mock import patch
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from app.chain.data_cleanup import DataCleanupChain
from app.db import Base
from app.db.models.downloadhistory import DownloadHistory, DownloadFiles
from app.db.models.message import Message
from app.db.models.siteuserdata import SiteUserData
from app.db.models.transferhistory import TransferHistory
from app.scheduler import SchedulerChain
class DataCleanupChainTest(unittest.TestCase):
@@ -135,7 +135,7 @@ class DataCleanupChainTest(unittest.TestCase):
db.commit()
with patch("app.chain.data_cleanup.SessionFactory", self.SessionFactory):
report = DataCleanupChain().cleanup(batch_size=1)
report = SchedulerChain().cleanup(batch_size=1)
self.assertEqual(report["tables"]["message"]["deleted"], 3)
self.assertEqual(report["tables"]["message"]["batches"], 3)
@@ -172,7 +172,7 @@ class DataCleanupChainTest(unittest.TestCase):
db.commit()
with patch("app.chain.data_cleanup.SessionFactory", self.SessionFactory):
report = DataCleanupChain().cleanup(batch_size=10)
report = SchedulerChain().cleanup(batch_size=10)
self.assertEqual(report["tables"]["transferhistory"]["deleted"], 0)
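For reference, a hedged, self-contained sketch of the fixture pattern this test uses: an in-memory SQLite engine bound to a sessionmaker, seeded rows, and a batch loop with batch_size=1 that should report three rows deleted in three batches. The model and the inline loop stand in for the project's Message table and _cleanup_in_batches; they are not the actual test code.

import unittest

from sqlalchemy import Column, Integer, String, create_engine, delete, select
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()


class FakeMessage(Base):
    __tablename__ = "fake_message"
    id = Column(Integer, primary_key=True)
    reg_time = Column(String)


class BatchCleanupSketchTest(unittest.TestCase):
    def test_three_rows_three_batches(self):
        engine = create_engine("sqlite:///:memory:")
        Base.metadata.create_all(engine)
        SessionFactory = sessionmaker(bind=engine)
        with SessionFactory() as db:
            db.add_all([FakeMessage(reg_time="2020-01-01 00:00:00") for _ in range(3)])
            db.commit()
            deleted_total, batches = 0, 0
            while True:
                # batch_size=1: delete one row per pass until nothing is left
                ids = db.scalars(select(FakeMessage.id).limit(1)).all()
                if not ids:
                    break
                db.execute(delete(FakeMessage).where(FakeMessage.id.in_(ids)))
                db.commit()
                deleted_total += len(ids)
                batches += 1
        self.assertEqual(deleted_total, 3)
        self.assertEqual(batches, 3)


if __name__ == "__main__":
    unittest.main()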