title filter for subscriptions (closes #968)

This commit is contained in:
Alex Shnitman
2026-04-26 22:51:48 +03:00
parent d89a5ddbe5
commit 91ee8312bf
8 changed files with 470 additions and 8 deletions
+6 -1
View File
@@ -645,6 +645,7 @@ async def subscribe(request):
subtitle_mode=o['subtitle_mode'],
ytdl_options_presets=o['ytdl_options_presets'],
ytdl_options_overrides=o['ytdl_options_overrides'],
title_regex=post.get('title_regex'),
)
return web.Response(text=serializer.encode(result))
@@ -660,7 +661,11 @@ async def subscriptions_update(request):
sub_id = post.get('id')
if not sub_id:
raise web.HTTPBadRequest(reason='missing subscription id')
changes = {k: v for k, v in post.items() if k != 'id' and k in ('enabled', 'check_interval_minutes', 'name')}
changes = {
k: v
for k, v in post.items()
if k != 'id' and k in ('enabled', 'check_interval_minutes', 'name', 'title_regex')
}
if not changes:
raise web.HTTPBadRequest(reason='no valid fields to update')
log.info("Subscription update requested for %s: %s", sub_id, sorted(changes.keys()))
+62 -5
View File
@@ -6,6 +6,7 @@ import asyncio
import copy
import logging
import os
import re
import time
import types
import uuid
@@ -147,6 +148,7 @@ class SubscriptionInfo:
subtitle_mode: str = "prefer_manual"
ytdl_options_presets: list[str] = field(default_factory=list)
ytdl_options_overrides: dict[str, Any] = field(default_factory=dict)
title_regex: str = ""
last_checked: Optional[float] = None
seen_ids: list[str] = field(default_factory=list)
error: Optional[str] = None
@@ -167,6 +169,7 @@ class SubscriptionInfo:
"format": self.format,
"quality": self.quality,
"folder": self.folder,
"title_regex": self.title_regex,
"last_checked": self.last_checked,
"seen_count": len(self.seen_ids),
"error": self.error,
@@ -194,6 +197,7 @@ def _subscription_to_record(sub: SubscriptionInfo) -> dict[str, Any]:
"subtitle_mode": sub.subtitle_mode,
"ytdl_options_presets": list(sub.ytdl_options_presets),
"ytdl_options_overrides": sub.ytdl_options_overrides,
"title_regex": sub.title_regex,
"last_checked": sub.last_checked,
"seen_ids": list(sub.seen_ids),
"error": sub.error,
@@ -231,6 +235,22 @@ def _subscription_from_record(record: Any) -> Optional[SubscriptionInfo]:
return None
def _normalize_title_regex_value(value: Any) -> str:
    """Coerce a raw ``title_regex`` input into a stripped string.

    ``None`` becomes the empty string; strings are stripped of surrounding
    whitespace; any other value is converted with ``str()`` and then stripped.
    """
    if value is None:
        return ""
    text = value if isinstance(value, str) else str(value)
    return text.strip()


def validate_title_regex(value: Any) -> str:
    """Return the normalized title regex, verifying that it compiles.

    Args:
        value: Raw filter value. ``None`` and blank strings normalize to ``""``
            and are accepted without compilation.

    Returns:
        The stripped pattern string (``""`` when the input was empty).

    Raises:
        re.error: If the non-empty pattern cannot be compiled.
    """
    pattern = _normalize_title_regex_value(value)
    if not pattern:
        return pattern
    try:
        re.compile(pattern)
    except (OverflowError, RecursionError) as exc:
        # The regex parser reports an oversized repeat count (for example
        # "a{99999999999999999999}") with OverflowError, and may hit
        # RecursionError on very deeply nested patterns. Neither is a
        # re.error, so re-raise as re.error to keep the documented contract:
        # callers that catch re.error then reject the input instead of
        # letting the exception escape.
        raise re.error(str(exc), pattern) from exc
    return pattern
def _coerce_bool(value: Any) -> bool:
"""Accept JSON booleans and common string forms used by API clients."""
if isinstance(value, bool):
@@ -448,10 +468,15 @@ class SubscriptionManager:
subtitle_mode: str,
ytdl_options_presets: Optional[list[str]] = None,
ytdl_options_overrides: Optional[dict[str, Any]] = None,
title_regex: Any = None,
) -> dict:
url = self._normalize_url(url)
if not url:
return {"status": "error", "msg": "Missing URL"}
try:
title_regex_stored = validate_title_regex(title_regex)
except re.error as exc:
return {"status": "error", "msg": f"Invalid title_regex: {exc}"}
async with self._lock:
if url in self._url_index or url in self._pending_urls:
@@ -509,6 +534,7 @@ class SubscriptionManager:
subtitle_mode=subtitle_mode,
ytdl_options_presets=list(ytdl_options_presets or []),
ytdl_options_overrides=dict(ytdl_options_overrides or {}),
title_regex=title_regex_stored,
last_checked=time.time(),
seen_ids=list(dict.fromkeys(all_ids)),
error=None,
@@ -555,6 +581,13 @@ class SubscriptionManager:
return {"status": "ok"}
async def update_subscription(self, sub_id: str, changes: dict) -> dict:
validated_tr: Optional[str] = None
if "title_regex" in changes:
try:
validated_tr = validate_title_regex(changes["title_regex"])
except re.error as exc:
return {"status": "error", "msg": f"Invalid title_regex: {exc}"}
async with self._lock:
sub = self._subs.get(sub_id)
if not sub:
@@ -568,6 +601,8 @@ class SubscriptionManager:
sub.check_interval_minutes = max(1, int(changes["check_interval_minutes"]))
if "name" in changes and changes["name"]:
sub.name = str(changes["name"])
if validated_tr is not None:
sub.title_regex = validated_tr
try:
self._save_locked()
@@ -659,9 +694,9 @@ class SubscriptionManager:
dl_submode = cur.subtitle_mode
dl_ytdl_presets = list(cur.ytdl_options_presets)
dl_ytdl_overrides = dict(cur.ytdl_options_overrides)
dl_title_regex = cur.title_regex or ""
new_entries: list[dict] = []
new_ids: list[str] = []
for ent in entries:
eid = _entry_id(ent)
if not eid:
@@ -669,10 +704,31 @@ class SubscriptionManager:
if eid in seen and ent.get("live_status") != "is_live":
continue
new_entries.append(ent)
new_ids.append(eid)
pattern_re: Optional[re.Pattern[str]] = None
if dl_title_regex:
try:
pattern_re = re.compile(dl_title_regex)
except re.error:
log.warning(
"Invalid stored title_regex on subscription %s, ignoring filter",
sub.name,
)
queue_entries: list[dict] = []
filtered_ids: list[str] = []
for ent in new_entries:
eid = _entry_id(ent)
if pattern_re is not None:
title = str(ent.get("title") or "")
if not pattern_re.search(title):
if eid:
filtered_ids.append(eid)
continue
queue_entries.append(ent)
queued_ids, queue_errors = await self._queue_subscription_entries(
new_entries,
queue_entries,
download_type=dl_type,
codec=dl_codec,
format=dl_format,
@@ -689,14 +745,15 @@ class SubscriptionManager:
ytdl_options_overrides=dl_ytdl_overrides,
)
log.info(
"Subscription check finished for %s: %d new, %d queued, %d failed",
"Subscription check finished for %s: %d new, %d filtered, %d queued, %d failed",
sub.name,
len(new_entries),
len(filtered_ids),
len(queued_ids),
len(queue_errors),
)
merged = list(dict.fromkeys(queued_ids + seen_ids_snapshot))
merged = list(dict.fromkeys(queued_ids + filtered_ids + seen_ids_snapshot))
max_seen = int(getattr(self.config, "SUBSCRIPTION_MAX_SEEN_IDS", 50000))
if len(merged) > max_seen:
merged = merged[:max_seen]
+277
View File
@@ -453,6 +453,283 @@ class SubscriptionPersistenceTests(unittest.IsolatedAsyncioTestCase):
with self.assertRaises(ValueError):
await mgr.update_subscription(sub_id, {"enabled": "maybe"})
async def test_add_subscription_rejects_invalid_title_regex(self):
# Adding a subscription with an uncompilable title_regex ("[") must fail:
# the result carries status "error", the message mentions "title_regex",
# and the manager ends up with no stored subscriptions.
with tempfile.TemporaryDirectory() as tmp:
mgr = SubscriptionManager(_Config(tmp), _Queue(), _Notifier())
# Playlist extraction is patched out so the test never touches the network.
with patch(
"subscriptions.extract_flat_playlist",
return_value=(
{"_type": "channel", "title": "Channel"},
[{"id": "v1", "title": "One", "webpage_url": "https://example.com/v1"}],
),
):
result = await mgr.add_subscription(
"https://example.com/channel",
check_interval_minutes=60,
download_type="video",
codec="auto",
format="any",
quality="best",
folder="",
custom_name_prefix="",
auto_start=True,
playlist_item_limit=0,
split_by_chapters=False,
chapter_template="",
subtitle_language="en",
subtitle_mode="prefer_manual",
title_regex="[",
)
self.assertEqual(result["status"], "error")
# Compared case-insensitively so the exact wording of the message is not pinned.
self.assertIn("title_regex", result["msg"].lower())
self.assertEqual(mgr.list_all(), [])
async def test_add_subscription_stores_and_exposes_title_regex(self):
# A valid title_regex ("EPISODE") passed to add_subscription must be visible
# both in the returned "subscription" payload and on the stored object
# returned by list_all().
with tempfile.TemporaryDirectory() as tmp:
queue = _Queue()
mgr = SubscriptionManager(_Config(tmp), queue, _Notifier())
# Playlist extraction is patched out so the test never touches the network.
with patch(
"subscriptions.extract_flat_playlist",
return_value=(
{"_type": "channel", "title": "Channel"},
[{"id": "v1", "title": "One", "webpage_url": "https://example.com/v1"}],
),
):
result = await mgr.add_subscription(
"https://example.com/channel",
check_interval_minutes=60,
download_type="video",
codec="auto",
format="any",
quality="best",
folder="",
custom_name_prefix="",
auto_start=True,
playlist_item_limit=0,
split_by_chapters=False,
chapter_template="",
subtitle_language="en",
subtitle_mode="prefer_manual",
title_regex="EPISODE",
)
self.assertEqual(result["status"], "ok")
self.assertEqual(result["subscription"]["title_regex"], "EPISODE")
self.assertEqual(mgr.list_all()[0].title_regex, "EPISODE")
async def test_check_now_title_regex_queues_only_matches_and_marks_unmatched_seen(self):
# With title_regex "EPISODE", a check must queue only the entry whose title
# matches (v2) while still recording the non-matching entry (v3) in seen_ids.
with tempfile.TemporaryDirectory() as tmp:
queue = _Queue()
mgr = SubscriptionManager(_Config(tmp), queue, _Notifier())
# side_effect supplies two successive extraction results: the first lists
# only v1, the second adds v2 (title contains "EPISODE") and v3 (does not).
# NOTE(review): presumably add_subscription consumes the first result and
# check_now the second - confirm against the manager implementation.
with patch(
"subscriptions.extract_flat_playlist",
side_effect=[
(
{"_type": "channel", "title": "Channel"},
[{"id": "v1", "title": "Old", "webpage_url": "https://example.com/v1"}],
),
(
{"_type": "channel", "title": "Channel"},
[
{
"id": "v2",
"title": "Minecraft | EPISODE 1",
"webpage_url": "https://example.com/v2",
},
{
"id": "v3",
"title": "Unrelated IRL",
"webpage_url": "https://example.com/v3",
},
{
"id": "v1",
"title": "Old",
"webpage_url": "https://example.com/v1",
},
],
),
],
):
result = await mgr.add_subscription(
"https://example.com/channel",
check_interval_minutes=60,
download_type="video",
codec="auto",
format="any",
quality="best",
folder="",
custom_name_prefix="",
auto_start=True,
playlist_item_limit=0,
split_by_chapters=False,
chapter_template="",
subtitle_language="en",
subtitle_mode="prefer_manual",
title_regex="EPISODE",
)
await mgr.check_now([result["subscription"]["id"]])
# Only the matching entry reached the queue.
self.assertEqual([e["webpage_url"] for e, _, _ in queue.entries], ["https://example.com/v2"])
sub = mgr.list_all()[0]
# Expected order of the first three seen ids: queued (v2), filtered (v3), then v1.
self.assertEqual(sub.seen_ids[:3], ["v2", "v3", "v1"])
async def test_check_now_title_regex_queue_failure_keeps_matched_id_unseen(self):
# When queueing fails, the matching entry (v2) must NOT be recorded in
# seen_ids, while the filtered-out entry (v3) still is.
with tempfile.TemporaryDirectory() as tmp:
queue = _Queue()
mgr = SubscriptionManager(_Config(tmp), queue, _Notifier())
# side_effect supplies two successive extraction results: the first lists
# only v1, the second lists v2 (title contains "EPISODE") and v3 (does not).
# NOTE(review): presumably add_subscription consumes the first result and
# check_now the second - confirm against the manager implementation.
with patch(
"subscriptions.extract_flat_playlist",
side_effect=[
(
{"_type": "channel", "title": "Channel"},
[{"id": "v1", "title": "Old", "webpage_url": "https://example.com/v1"}],
),
(
{"_type": "channel", "title": "Channel"},
[
{
"id": "v2",
"title": "Show | EPISODE 1",
"webpage_url": "https://example.com/v2",
},
{
"id": "v3",
"title": "Other",
"webpage_url": "https://example.com/v3",
},
],
),
],
):
result = await mgr.add_subscription(
"https://example.com/channel",
check_interval_minutes=60,
download_type="video",
codec="auto",
format="any",
quality="best",
folder="",
custom_name_prefix="",
auto_start=True,
playlist_item_limit=0,
split_by_chapters=False,
chapter_template="",
subtitle_language="en",
subtitle_mode="prefer_manual",
title_regex="EPISODE",
)
# The flag is set only after the subscription exists, so it affects the check alone.
# NOTE(review): presumably makes the fake queue report "queue failed" - confirm in _Queue.
queue.fail = True
await mgr.check_now([result["subscription"]["id"]])
sub = mgr.list_all()[0]
self.assertEqual(sub.error, "queue failed")
self.assertEqual(set(sub.seen_ids), {"v1", "v3"})
self.assertNotIn("v2", sub.seen_ids)
async def test_update_subscription_rejects_invalid_title_regex(self):
# Updating an existing subscription with an uncompilable title_regex ("(")
# must return status "error" and leave the stored title_regex at "".
with tempfile.TemporaryDirectory() as tmp:
queue = _Queue()
mgr = SubscriptionManager(_Config(tmp), queue, _Notifier())
# Playlist extraction is patched out so the test never touches the network.
with patch(
"subscriptions.extract_flat_playlist",
return_value=(
{"_type": "channel", "title": "Channel"},
[{"id": "v1", "title": "One", "webpage_url": "https://example.com/v1"}],
),
):
# No title_regex is passed here, so the subscription starts without a filter.
result = await mgr.add_subscription(
"https://example.com/channel",
check_interval_minutes=60,
download_type="video",
codec="auto",
format="any",
quality="best",
folder="",
custom_name_prefix="",
auto_start=True,
playlist_item_limit=0,
split_by_chapters=False,
chapter_template="",
subtitle_language="en",
subtitle_mode="prefer_manual",
)
sub_id = result["subscription"]["id"]
upd = await mgr.update_subscription(sub_id, {"title_regex": "("})
self.assertEqual(upd["status"], "error")
self.assertEqual(mgr.list_all()[0].title_regex, "")
async def test_update_subscription_persists_valid_title_regex(self):
# Updating an existing subscription with a valid title_regex ("foo|bar")
# must return status "ok" and expose the new value both in the returned
# "subscription" payload and on the object returned by list_all().
with tempfile.TemporaryDirectory() as tmp:
queue = _Queue()
mgr = SubscriptionManager(_Config(tmp), queue, _Notifier())
# Playlist extraction is patched out so the test never touches the network.
with patch(
"subscriptions.extract_flat_playlist",
return_value=(
{"_type": "channel", "title": "Channel"},
[{"id": "v1", "title": "One", "webpage_url": "https://example.com/v1"}],
),
):
# No title_regex is passed here, so the subscription starts without a filter.
result = await mgr.add_subscription(
"https://example.com/channel",
check_interval_minutes=60,
download_type="video",
codec="auto",
format="any",
quality="best",
folder="",
custom_name_prefix="",
auto_start=True,
playlist_item_limit=0,
split_by_chapters=False,
chapter_template="",
subtitle_language="en",
subtitle_mode="prefer_manual",
)
sub_id = result["subscription"]["id"]
upd = await mgr.update_subscription(sub_id, {"title_regex": "foo|bar"})
self.assertEqual(upd["status"], "ok")
self.assertEqual(upd["subscription"]["title_regex"], "foo|bar")
self.assertEqual(mgr.list_all()[0].title_regex, "foo|bar")
def test_persistence_includes_title_regex(self):
# A subscriptions.json record that carries "title_regex" must come back with
# that value on the subscription returned by a freshly constructed manager.
with tempfile.TemporaryDirectory() as tmp:
json_path = os.path.join(tmp, "subscriptions.json")
# Hand-write a schema_version 2 file containing a single subscription record.
with open(json_path, "w", encoding="utf-8") as f:
json.dump(
{
"schema_version": 2,
"kind": "subscriptions",
"items": [
{
"id": "sub-1",
"name": "Channel",
"url": "https://example.com/channel",
"enabled": True,
"check_interval_minutes": 60,
"download_type": "video",
"codec": "auto",
"format": "any",
"quality": "best",
"folder": "",
"custom_name_prefix": "",
"auto_start": True,
"playlist_item_limit": 0,
"split_by_chapters": False,
"chapter_template": "",
"subtitle_language": "en",
"subtitle_mode": "prefer_manual",
"ytdl_options_presets": [],
"ytdl_options_overrides": {},
"title_regex": "EPISODE",
"last_checked": None,
"seen_ids": [],
"error": None,
}
],
},
f,
)
# NOTE(review): presumably the manager reads subscriptions.json from the
# directory given to _Config(tmp) during construction - confirm.
mgr = SubscriptionManager(_Config(tmp), _Queue(), _Notifier())
self.assertEqual(mgr.list_all()[0].title_regex, "EPISODE")
class ExtractFlatPlaylistTests(unittest.TestCase):
def test_descends_one_level_when_root_entries_are_nested_collections(self):
responses = iter(