fix: remove hardcoded Chinese title heuristics (#887)

* fix: remove hardcoded Chinese title heuristics
* fix: use English placeholder for non-Latin fallback titles
2026-04-23 20:45:34 +04:00
parent ae7be6deba
commit 96c97c5e0e
3 changed files with 74 additions and 43 deletions
--- a/api/streaming.py
+++ b/api/streaming.py
@@ -106,7 +106,7 @@ def _sanitize_generated_title(text: str) -> str:
    """Sanitize LLM-generated title text before persisting to session."""
    s = _strip_thinking_markup(text or '')
    s = re.sub(
-        r'^\s*(?:[*_`~]+\s*)?(?:session\s+title|title)\s*[:：]\s*(?:[*_`~]+\s*)?',
+        r'^\s*(?:[*_`~]+\s*)?(?:session\s+title|title)\s*:\s*(?:[*_`~]+\s*)?',
        '',
        s,
        flags=re.IGNORECASE,
@@ -132,10 +132,7 @@ def _looks_invalid_generated_title(text: str) -> bool:
        or re.search(r'^\s*(i|we)\s+(should|need to|will|can)\b', s, flags=re.IGNORECASE)
        or re.search(r'^\s*let me\b', s, flags=re.IGNORECASE)
        or re.search(r"^\s*here(?:'s| is) (?:a |my )?(?:thinking|thought)", s, flags=re.IGNORECASE)
        or re.search(r'用户(要求|希望|想让|让我)', s)
        or re.search(r'请只?回复', s)
        or re.search(r'^\s*(ok|okay|done|all set|complete|completed|finished)\b[\s.!?]*$', s, flags=re.IGNORECASE)
        or re.search(r'^\s*(好的|好啦|完成了|已完成|测试完成|测试已完成|可以了|没问题)\s*[！!。\.\s]*$', s)
    )
@@ -209,10 +206,10 @@ def _title_prompts(user_text: str, assistant_text: str) -> tuple[str, list[str]]
            "Return only the title text, 3-8 words, as a topic label.\n"
            "Do not use markdown, bullets, labels, or prefixes like Session Title:.\n"
            "Do not output a full sentence.\n"
-            "Do not output acknowledgements or completion phrases like OK, done, all set, 测试完成.\n"
+            "Do not output acknowledgements or completion phrases like OK, done, or all set.\n"
            "Do not describe internal reasoning.\n"
-            "Bad: The user is asking..., OK, 好的，测试完成！\n"
+            "Bad: The user is asking..., OK, all set.\n"
-            "Good: 自动标题生成测试, Clarify Dialog Layout, GitHub Issue Triage"
+            "Good: Title Generation Test, Clarify Dialog Layout, GitHub Issue Triage"
        ),
        (
            "Rewrite this conversation start as a concise noun-phrase title.\n"
@@ -437,10 +434,10 @@ def _fallback_title_from_exchange(user_text: str, assistant_text: str) -> Option
    combined = f"{user_text} {assistant_text}".strip().lower()
    combined_raw = f"{user_text} {assistant_text}".strip()
    def _contains_latin(text: str) -> bool:
        return bool(re.search(r'[A-Za-z]', text or ''))
    def _extract_named_topic(text: str) -> str:
        m = re.search(r'《([^》]{2,24})》', text)
        if m:
            return (m.group(1) or '').strip()
        m = re.search(r'"([^"\n]{2,24})"', text)
        if m:
            return (m.group(1) or '').strip()
@@ -451,57 +448,53 @@ def _fallback_title_from_exchange(user_text: str, assistant_text: str) -> Option
    topic_name = _extract_named_topic(combined_raw)
    if topic_name:
-        if any(k in combined for k in ('时间', 'time', '安排', '效率', '怎么办', '健身', '唱歌', '写毛笔', '不够用了')):
+        if not _contains_latin(topic_name):
-            return f'{topic_name}与时间管理'
+            if any(k in combined for k in ('time', 'schedule', 'efficiency', 'manage', 'fitness', 'singing', 'calligraphy')):
                return 'Time management discussion'
            if any(k in combined for k in ('hermes', 'codex', 'ai')):
                return 'AI productivity discussion'
            return 'Conversation topic'
        if any(k in combined for k in ('time', 'schedule', 'efficiency', 'manage', 'fitness', 'singing', 'calligraphy')):
            return f'{topic_name} time management'
        if any(k in combined for k in ('hermes', 'codex', 'ai')):
-            return f'{topic_name}与AI效率'
+            return f'{topic_name} AI productivity'
-        return f'{topic_name}讨论'
+        return f'{topic_name} discussion'
-    if any(k in combined for k in ('title', '标题')) and any(k in combined for k in ('summary', 'summar', '摘要', '短标题')):
+    if any(k in combined for k in ('title', 'session title')) and any(k in combined for k in ('summary', 'summar', 'short title')):
-        if any(k in combined for k in ('test', '测试', 'ok', '回复ok')):
+        if any(k in combined for k in ('test', 'ok', 'reply ok')):
-            return '会话标题自动摘要测试'
+            return 'Session title auto-summary test'
-        return '会话标题自动摘要'
+        return 'Session title auto-summary'
-    if any(k in combined for k in ('clarify', '澄清')) and any(k in combined for k in ('dialog', 'card', '对话', '卡片')):
+    if any(k in combined for k in ('clarify', 'clarification')) and any(k in combined for k in ('dialog', 'card')):
-        return 'Clarify 对话卡片'
+        return 'Clarify dialog card'
-    if any(k in combined for k in ('issue', 'github', 'pr')) and any(k in combined for k in ('triage', 'bug', 'review', '问题')):
+    if any(k in combined for k in ('issue', 'github', 'pr')) and any(k in combined for k in ('triage', 'bug', 'review')):
        return 'GitHub Issue Triage'
-    head = re.split(r'[。！？.!?\n]', user_text)[0].strip()
+    head = re.split(r'[.!?\n]', user_text)[0].strip()
    if not head:
        return None
    stop_cjk = {
        '我们', '看看', '一下', '这个', '标题', '是否', '可以', '用户', '理解', '这里', '测试', '一下',
        '你只', '需要', '回复', '就可', '可以', '不需', '需要做', '什么', '自动', '成用户', '短标题',
    }
    stop_en = {
        'the', 'this', 'that', 'with', 'from', 'into', 'just', 'reply', 'please',
        'need', 'needs', 'want', 'wants', 'user', 'assistant', 'could', 'would',
        'should', 'about', 'there', 'here', 'test', 'testing', 'title', 'summary',
    }
-    tokens = re.findall(r'[\u4e00-\u9fff]{2,6}|[A-Za-z0-9][A-Za-z0-9_./+-]*', head)
+    tokens = re.findall(r'[A-Za-z0-9][A-Za-z0-9_./+-]*', head)
    if not tokens:
-        return head[:64]
+        return 'Conversation topic'
    picked = []
    for tok in tokens:
        lower_tok = tok.lower()
-        if re.search(r'[\u4e00-\u9fff]', tok):
+        if lower_tok in stop_en or len(lower_tok) < 3:
-            if tok in stop_cjk:
+            continue
                continue
        else:
            if lower_tok in stop_en or len(lower_tok) < 3:
                continue
        if tok not in picked:
            picked.append(tok)
        if len(picked) >= 4:
            break
    if picked:
        if any(re.search(r'[\u4e00-\u9fff]', t) for t in picked):
            return ''.join(picked)[:20]
        return ' '.join(picked)[:60]
-    return head[:24]
+    return 'Conversation topic'
 def _run_background_title_update(session_id: str, user_text: str, assistant_text: str, placeholder_title: str, put_event, agent=None):
--- a/tests/test_sprint41.py
+++ b/tests/test_sprint41.py
@@ -137,16 +137,21 @@ class TestIssue495TitleStreaming(unittest.TestCase):
        )
    def test_streaming_rejects_generic_completion_titles(self):
        self.assertIn(
            "测试完成",
            STREAMING_PY,
            "streaming.py should reject generic completion phrases as session titles",
        )
        self.assertIn(
            "all set",
            STREAMING_PY,
            "streaming.py should reject generic English completion phrases as session titles",
        )
        self.assertIn(
            "completed",
            STREAMING_PY,
            "streaming.py should reject completion-status titles as session titles",
        )
        self.assertNotIn(
            "测试完成",
            STREAMING_PY,
            "streaming.py title generation should stay English-only",
        )
    def test_streaming_uses_reasoning_split_for_minimax_titles(self):
        self.assertIn(
--- a/tests/test_title_sanitization.py
+++ b/tests/test_title_sanitization.py
@@ -1,6 +1,11 @@
 import unittest
 from pathlib import Path
-from api.streaming import _first_exchange_snippets, _sanitize_generated_title
+from api.streaming import (
    _fallback_title_from_exchange,
    _first_exchange_snippets,
    _sanitize_generated_title,
 )
 class TestGeneratedTitleSanitization(unittest.TestCase):
@@ -33,3 +38,31 @@ class TestGeneratedTitleSanitization(unittest.TestCase):
            _first_exchange_snippets(messages),
            ("What time is it in San Francisco?", "It is 6:16 PM in San Francisco."),
        )
    def test_fallback_title_uses_english_discussion_suffix(self):
        self.assertEqual(
            _fallback_title_from_exchange('Please review "random cancel"', ""),
            "random cancel discussion",
        )
    def test_fallback_title_summary_label_is_english(self):
        self.assertEqual(
            _fallback_title_from_exchange("Generate a short title summary test", ""),
            "Session title auto-summary test",
        )
    def test_fallback_title_non_latin_input_uses_english_placeholder(self):
        self.assertEqual(
            _fallback_title_from_exchange("讨论一下这个问题", ""),
            "Conversation topic",
        )
    def test_fallback_title_non_latin_quoted_topic_uses_english_placeholder(self):
        self.assertEqual(
            _fallback_title_from_exchange('Please review "讨论主题"', ""),
            "Conversation topic",
        )
    def test_title_generation_source_has_no_cjk_literals(self):
        src = Path("api/streaming.py").read_text(encoding="utf-8")
        self.assertNotRegex(src, r"[\u4e00-\u9fff]", "title generation code should stay English-only")